X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMLoadStoreOptimizer.cpp;h=6e7e47b8706ae81ca4831247410efcca4cc9e989;hb=f5d467572cb0ac0b80696330ed9d220dc1431bb5;hp=cb1b2a217223d741e099779589769fce0ae8da66;hpb=397fc4874efe9c17e737d4c5c50bd19dc3bf27f5;p=oota-llvm.git diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index cb1b2a21722..6e7e47b8706 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -7,41 +7,47 @@ // //===----------------------------------------------------------------------===// // -// This file contains a pass that performs load / store related peephole -// optimizations. This pass should be run after register allocation. +/// \file This file contains a pass that performs load / store related peephole +/// optimizations. This pass should be run after register allocation. // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "arm-ldst-opt" #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" +#include "ARMISelLowering.h" #include "ARMMachineFunctionInfo.h" +#include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" +#include "ThumbRegisterInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/Target/TargetData.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; +#define DEBUG_TYPE "arm-ldst-opt" + STATISTIC(NumLDMGened , "Number of ldm instructions generated"); STATISTIC(NumSTMGened , "Number of stm instructions generated"); STATISTIC(NumVLDMGened, "Number of vldm instructions generated"); @@ -54,87 +60,154 @@ STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm"); STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's"); STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's"); -/// ARMAllocLoadStoreOpt - Post- register allocation pass the combine -/// load / store instructions to form ldm / stm instructions. +namespace llvm { +void initializeARMLoadStoreOptPass(PassRegistry &); +} + +#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass" namespace { + /// Post- register allocation pass the combine load / store instructions to + /// form ldm / stm instructions. struct ARMLoadStoreOpt : public MachineFunctionPass { static char ID; - ARMLoadStoreOpt() : MachineFunctionPass(ID) {} + ARMLoadStoreOpt() : MachineFunctionPass(ID) { + initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry()); + } + const MachineFunction *MF; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const ARMSubtarget *STI; + const TargetLowering *TL; ARMFunctionInfo *AFI; - RegScavenger *RS; - bool isThumb2; + LivePhysRegs LiveRegs; + RegisterClassInfo RegClassInfo; + MachineBasicBlock::const_iterator LiveRegPos; + bool LiveRegsValid; + bool RegClassInfoValid; + bool isThumb1, isThumb2; - virtual bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; - virtual const char *getPassName() const { - return "ARM load / store optimization pass"; + const char *getPassName() const override { + return ARM_LOAD_STORE_OPT_NAME; } private: + /// A set of load/store MachineInstrs with same base register sorted by + /// offset. struct MemOpQueueEntry { - int Offset; - unsigned Reg; - bool isKill; - unsigned Position; - MachineBasicBlock::iterator MBBI; - bool Merged; - MemOpQueueEntry(int o, unsigned r, bool k, unsigned p, - MachineBasicBlock::iterator i) - : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} + MachineInstr *MI; + int Offset; ///< Load/Store offset. + unsigned Position; ///< Position as counted from end of basic block. + MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position) + : MI(MI), Offset(Offset), Position(Position) {} }; typedef SmallVector MemOpQueue; - typedef MemOpQueue::iterator MemOpQueueIter; - - bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - int Offset, unsigned Base, bool BaseKill, int Opcode, - ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, - DebugLoc dl, - ArrayRef > Regs, - ArrayRef ImpDefs); - void MergeOpsUpdate(MachineBasicBlock &MBB, - MemOpQueue &MemOps, - unsigned memOpsBegin, - unsigned memOpsEnd, - unsigned insertAfter, - int Offset, - unsigned Base, - bool BaseKill, - int Opcode, - ARMCC::CondCodes Pred, - unsigned PredReg, - unsigned Scratch, - DebugLoc dl, - SmallVector &Merges); - void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base, - int Opcode, unsigned Size, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned Scratch, MemOpQueue &MemOps, - SmallVector &Merges); - - void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps); + + /// A set of MachineInstrs that fulfill (nearly all) conditions to get + /// merged into a LDM/STM. + struct MergeCandidate { + /// List of instructions ordered by load/store offset. + SmallVector Instrs; + /// Index in Instrs of the instruction being latest in the schedule. + unsigned LatestMIIdx; + /// Index in Instrs of the instruction being earliest in the schedule. + unsigned EarliestMIIdx; + /// Index into the basic block where the merged instruction will be + /// inserted. (See MemOpQueueEntry.Position) + unsigned InsertPos; + /// Whether the instructions can be merged into a ldm/stm instruction. + bool CanMergeToLSMulti; + /// Whether the instructions can be merged into a ldrd/strd instruction. + bool CanMergeToLSDouble; + }; + SpecificBumpPtrAllocator Allocator; + SmallVector Candidates; + SmallVector MergeBaseCandidates; + + void moveLiveRegsBefore(const MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator Before); + unsigned findFreeReg(const TargetRegisterClass &RegClass); + void UpdateBaseRegUses(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned Base, unsigned WordOffset, + ARMCC::CondCodes Pred, unsigned PredReg); + MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef> Regs); + MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef> Regs) const; + void FormCandidates(const MemOpQueue &MemOps); + MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand); bool FixInvalidRegPairOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); - bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const TargetInstrInfo *TII, - bool &Advance, - MachineBasicBlock::iterator &I); - bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - bool &Advance, - MachineBasicBlock::iterator &I); + bool MergeBaseUpdateLoadStore(MachineInstr *MI); + bool MergeBaseUpdateLSMultiple(MachineInstr *MI); + bool MergeBaseUpdateLSDouble(MachineInstr &MI) const; bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); + bool CombineMovBx(MachineBasicBlock &MBB); }; char ARMLoadStoreOpt::ID = 0; } -static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { +INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false) + +static bool definesCPSR(const MachineInstr *MI) { + for (const auto &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead()) + // If the instruction has live CPSR def, then it's not safe to fold it + // into load / store. + return true; + } + + return false; +} + +static int getMemoryOpOffset(const MachineInstr *MI) { + unsigned Opcode = MI->getOpcode(); + bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; + unsigned NumOperands = MI->getDesc().getNumOperands(); + unsigned OffField = MI->getOperand(NumOperands-3).getImm(); + + if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || + Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || + Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 || + Opcode == ARM::LDRi12 || Opcode == ARM::STRi12) + return OffField; + + // Thumb1 immediate offsets are scaled by 4 + if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi || + Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) + return OffField * 4; + + int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField) + : ARM_AM::getAM5Offset(OffField) * 4; + ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField) + : ARM_AM::getAM5Op(OffField); + + if (Op == ARM_AM::sub) + return -Offset; + + return Offset; +} + +static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) { + return MI.getOperand(1); +} + +static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) { + return MI.getOperand(0); +} + +static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) { switch (Opcode) { default: llvm_unreachable("Unhandled opcode!"); case ARM::LDRi12: @@ -155,6 +228,23 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { case ARM_AM::db: return ARM::STMDB; case ARM_AM::ib: return ARM::STMIB; } + case ARM::tLDRi: + case ARM::tLDRspi: + // tLDMIA is writeback-only - unless the base register is in the input + // reglist. + ++NumLDMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::tLDMIA; + } + case ARM::tSTRi: + case ARM::tSTRspi: + // There is no non-writeback tSTMIA either. + ++NumSTMGened; + switch (Mode) { + default: llvm_unreachable("Unhandled submode!"); + case ARM_AM::ia: return ARM::tSTMIA_UPD; + } case ARM::t2LDRi8: case ARM::t2LDRi12: ++NumLDMGened; @@ -202,10 +292,7 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { } } -namespace llvm { - namespace ARM_AM { - -AMSubMode getLoadStoreMultipleSubMode(int Opcode) { +static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) { switch (Opcode) { default: llvm_unreachable("Unhandled opcode!"); case ARM::LDMIA_RET: @@ -213,6 +300,9 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) { case ARM::LDMIA_UPD: case ARM::STMIA: case ARM::STMIA_UPD: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA_UPD: case ARM::t2LDMIA_RET: case ARM::t2LDMIA: case ARM::t2LDMIA_UPD: @@ -256,15 +346,20 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) { } } - } // end namespace ARM_AM -} // end namespace llvm +static bool isT1i32Load(unsigned Opc) { + return Opc == ARM::tLDRi || Opc == ARM::tLDRspi; +} static bool isT2i32Load(unsigned Opc) { return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8; } static bool isi32Load(unsigned Opc) { - return Opc == ARM::LDRi12 || isT2i32Load(Opc); + return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc) ; +} + +static bool isT1i32Store(unsigned Opc) { + return Opc == ARM::tSTRi || Opc == ARM::tSTRspi; } static bool isT2i32Store(unsigned Opc) { @@ -272,362 +367,650 @@ static bool isT2i32Store(unsigned Opc) { } static bool isi32Store(unsigned Opc) { - return Opc == ARM::STRi12 || isT2i32Store(Opc); + return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc); } -/// MergeOps - Create and insert a LDM or STM with Base as base register and -/// registers in Regs as the register operands that would be loaded / stored. -/// It returns true if the transformation is done. -bool -ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - int Offset, unsigned Base, bool BaseKill, - int Opcode, ARMCC::CondCodes Pred, - unsigned PredReg, unsigned Scratch, DebugLoc dl, - ArrayRef > Regs, - ArrayRef ImpDefs) { - // Only a single register to load / store. Don't bother. +static bool isLoadSingle(unsigned Opc) { + return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD; +} + +static unsigned getImmScale(unsigned Opc) { + switch (Opc) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::tLDRi: + case ARM::tSTRi: + case ARM::tLDRspi: + case ARM::tSTRspi: + return 1; + case ARM::tLDRHi: + case ARM::tSTRHi: + return 2; + case ARM::tLDRBi: + case ARM::tSTRBi: + return 4; + } +} + +static unsigned getLSMultipleTransferSize(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return 0; + case ARM::LDRi12: + case ARM::STRi12: + case ARM::tLDRi: + case ARM::tSTRi: + case ARM::tLDRspi: + case ARM::tSTRspi: + case ARM::t2LDRi8: + case ARM::t2LDRi12: + case ARM::t2STRi8: + case ARM::t2STRi12: + case ARM::VLDRS: + case ARM::VSTRS: + return 4; + case ARM::VLDRD: + case ARM::VSTRD: + return 8; + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + case ARM::tLDMIA: + case ARM::tLDMIA_UPD: + case ARM::tSTMIA_UPD: + case ARM::t2LDMIA: + case ARM::t2LDMDB: + case ARM::t2STMIA: + case ARM::t2STMDB: + case ARM::VLDMSIA: + case ARM::VSTMSIA: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4; + case ARM::VLDMDIA: + case ARM::VSTMDIA: + return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8; + } +} + +/// Update future uses of the base register with the offset introduced +/// due to writeback. This function only works on Thumb1. +void +ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned Base, + unsigned WordOffset, + ARMCC::CondCodes Pred, unsigned PredReg) { + assert(isThumb1 && "Can only update base register uses for Thumb1!"); + // Start updating any instructions with immediate offsets. Insert a SUB before + // the first non-updateable instruction (if any). + for (; MBBI != MBB.end(); ++MBBI) { + bool InsertSub = false; + unsigned Opc = MBBI->getOpcode(); + + if (MBBI->readsRegister(Base)) { + int Offset; + bool IsLoad = + Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi; + bool IsStore = + Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi; + + if (IsLoad || IsStore) { + // Loads and stores with immediate offsets can be updated, but only if + // the new offset isn't negative. + // The MachineOperand containing the offset immediate is the last one + // before predicates. + MachineOperand &MO = + MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3); + // The offsets are scaled by 1, 2 or 4 depending on the Opcode. + Offset = MO.getImm() - WordOffset * getImmScale(Opc); + + // If storing the base register, it needs to be reset first. + unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg(); + + if (Offset >= 0 && !(IsStore && InstrSrcReg == Base)) + MO.setImm(Offset); + else + InsertSub = true; + + } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) && + !definesCPSR(MBBI)) { + // SUBS/ADDS using this register, with a dead def of the CPSR. + // Merge it with the update; if the merged offset is too large, + // insert a new sub instead. + MachineOperand &MO = + MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3); + Offset = (Opc == ARM::tSUBi8) ? + MO.getImm() + WordOffset * 4 : + MO.getImm() - WordOffset * 4 ; + if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) { + // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if + // Offset == 0. + MO.setImm(Offset); + // The base register has now been reset, so exit early. + return; + } else { + InsertSub = true; + } + + } else { + // Can't update the instruction. + InsertSub = true; + } + + } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) { + // Since SUBS sets the condition flags, we can't place the base reset + // after an instruction that has a live CPSR def. + // The base register might also contain an argument for a function call. + InsertSub = true; + } + + if (InsertSub) { + // An instruction above couldn't be updated, so insert a sub. + AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true) + .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg); + return; + } + + if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base)) + // Register got killed. Stop updating. + return; + } + + // End of block was reached. + if (MBB.succ_size() > 0) { + // FIXME: Because of a bug, live registers are sometimes missing from + // the successor blocks' live-in sets. This means we can't trust that + // information and *always* have to reset at the end of a block. + // See PR21029. + if (MBBI != MBB.end()) --MBBI; + AddDefaultT1CC( + BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true) + .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg); + } +} + +/// Return the first register of class \p RegClass that is not in \p Regs. +unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) { + if (!RegClassInfoValid) { + RegClassInfo.runOnMachineFunction(*MF); + RegClassInfoValid = true; + } + + for (unsigned Reg : RegClassInfo.getOrder(&RegClass)) + if (!LiveRegs.contains(Reg)) + return Reg; + return 0; +} + +/// Compute live registers just before instruction \p Before (in normal schedule +/// direction). Computes backwards so multiple queries in the same block must +/// come in reverse order. +void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator Before) { + // Initialize if we never queried in this block. + if (!LiveRegsValid) { + LiveRegs.init(TRI); + LiveRegs.addLiveOuts(&MBB, true); + LiveRegPos = MBB.end(); + LiveRegsValid = true; + } + // Move backward just before the "Before" position. + while (LiveRegPos != Before) { + --LiveRegPos; + LiveRegs.stepBackward(*LiveRegPos); + } +} + +static bool ContainsReg(const ArrayRef> &Regs, + unsigned Reg) { + for (const std::pair &R : Regs) + if (R.first == Reg) + return true; + return false; +} + +/// Create and insert a LDM or STM with Base as base register and registers in +/// Regs as the register operands that would be loaded / stored. It returns +/// true if the transformation is done. +MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef> Regs) { unsigned NumRegs = Regs.size(); - if (NumRegs <= 1) - return false; + assert(NumRegs > 1); + + // For Thumb1 targets, it might be necessary to clobber the CPSR to merge. + // Compute liveness information for that register to make the decision. + bool SafeToClobberCPSR = !isThumb1 || + (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) == + MachineBasicBlock::LQR_Dead); + + bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback. + + // Exception: If the base register is in the input reglist, Thumb1 LDM is + // non-writeback. + // It's also not possible to merge an STR of the base register in Thumb1. + if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) { + assert(Base != ARM::SP && "Thumb1 does not allow SP in register list"); + if (Opcode == ARM::tLDRi) { + Writeback = false; + } else if (Opcode == ARM::tSTRi) { + return nullptr; + } + } ARM_AM::AMSubMode Mode = ARM_AM::ia; - // VFP and Thumb2 do not support IB or DA modes. + // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA. bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); - bool haveIBAndDA = isNotVFP && !isThumb2; - if (Offset == 4 && haveIBAndDA) + bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1; + + if (Offset == 4 && haveIBAndDA) { Mode = ARM_AM::ib; - else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) + } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) { Mode = ARM_AM::da; - else if (Offset == -4 * (int)NumRegs && isNotVFP) + } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) { // VLDM/VSTM do not support DB mode without also updating the base reg. Mode = ARM_AM::db; - else if (Offset != 0) { - // Check if this is a supported opcode before we insert instructions to + } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) { + // Check if this is a supported opcode before inserting instructions to // calculate a new base register. - if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false; + if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr; // If starting offset isn't zero, insert a MI to materialize a new base. // But only do so if it is cost effective, i.e. merging more than two // loads / stores. if (NumRegs <= 2) - return false; + return nullptr; + + // On Thumb1, it's not worth materializing a new base register without + // clobbering the CPSR (i.e. not using ADDS/SUBS). + if (!SafeToClobberCPSR) + return nullptr; unsigned NewBase; - if (isi32Load(Opcode)) - // If it is a load, then just use one of the destination register to - // use as the new base. + if (isi32Load(Opcode)) { + // If it is a load, then just use one of the destination registers + // as the new base. Will no longer be writeback in Thumb1. NewBase = Regs[NumRegs-1].first; - else { - // Use the scratch register to use as a new base. - NewBase = Scratch; + Writeback = false; + } else { + // Find a free register that we can use as scratch register. + moveLiveRegsBefore(MBB, InsertBefore); + // The merged instruction does not exist yet but will use several Regs if + // it is a Store. + if (!isLoadSingle(Opcode)) + for (const std::pair &R : Regs) + LiveRegs.addReg(R.first); + + NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass); if (NewBase == 0) - return false; + return nullptr; } - int BaseOpc = !isThumb2 ? ARM::ADDri : ARM::t2ADDri; + + int BaseOpc = + isThumb2 ? ARM::t2ADDri : + (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi : + (isThumb1 && Offset < 8) ? ARM::tADDi3 : + isThumb1 ? ARM::tADDi8 : ARM::ADDri; + if (Offset < 0) { - BaseOpc = !isThumb2 ? ARM::SUBri : ARM::t2SUBri; Offset = - Offset; + BaseOpc = + isThumb2 ? ARM::t2SUBri : + (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 : + isThumb1 ? ARM::tSUBi8 : ARM::SUBri; + } + + if (!TL->isLegalAddImmediate(Offset)) + // FIXME: Try add with register operand? + return nullptr; // Probably not worth it then. + + // We can only append a kill flag to the add/sub input if the value is not + // used in the register list of the stm as well. + bool KillOldBase = BaseKill && + (!isi32Store(Opcode) || !ContainsReg(Regs, Base)); + + if (isThumb1) { + // Thumb1: depending on immediate size, use either + // ADDS NewBase, Base, #imm3 + // or + // MOV NewBase, Base + // ADDS NewBase, #imm8. + if (Base != NewBase && + (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) { + // Need to insert a MOV to the new base first. + if (isARMLowRegister(NewBase) && isARMLowRegister(Base) && + !STI->hasV6Ops()) { + // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr + if (Pred != ARMCC::AL) + return nullptr; + BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase) + .addReg(Base, getKillRegState(KillOldBase)); + } else + BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase) + .addReg(Base, getKillRegState(KillOldBase)) + .addImm(Pred).addReg(PredReg); + + // The following ADDS/SUBS becomes an update. + Base = NewBase; + KillOldBase = true; + } + if (BaseOpc == ARM::tADDrSPi) { + assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4"); + BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase) + .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4) + .addImm(Pred).addReg(PredReg); + } else + AddDefaultT1CC( + BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true) + .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset) + .addImm(Pred).addReg(PredReg); + } else { + BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase) + .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset) + .addImm(Pred).addReg(PredReg).addReg(0); } - int ImmedOffset = isThumb2 - ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset); - if (ImmedOffset == -1) - // FIXME: Try t2ADDri12 or t2SUBri12? - return false; // Probably not worth it then. - - BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase) - .addReg(Base, getKillRegState(BaseKill)).addImm(Offset) - .addImm(Pred).addReg(PredReg).addReg(0); Base = NewBase; - BaseKill = true; // New base is always killed right its use. + BaseKill = true; // New base is always killed straight away. } - bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS || - Opcode == ARM::VLDRD); + bool isDef = isLoadSingle(Opcode); + + // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with + // base register writeback. Opcode = getLoadStoreMultipleOpcode(Opcode, Mode); - if (!Opcode) return false; - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)) - .addReg(Base, getKillRegState(BaseKill)) - .addImm(Pred).addReg(PredReg); - for (unsigned i = 0; i != NumRegs; ++i) - MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef) - | getKillRegState(Regs[i].second)); + if (!Opcode) + return nullptr; + + // Check if a Thumb1 LDM/STM merge is safe. This is the case if: + // - There is no writeback (LDM of base register), + // - the base register is killed by the merged instruction, + // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS + // to reset the base register. + // Otherwise, don't merge. + // It's safe to return here since the code to materialize a new base register + // above is also conditional on SafeToClobberCPSR. + if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill) + return nullptr; + + MachineInstrBuilder MIB; + + if (Writeback) { + assert(isThumb1 && "expected Writeback only inThumb1"); + if (Opcode == ARM::tLDMIA) { + assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs"); + // Update tLDMIA with writeback if necessary. + Opcode = ARM::tLDMIA_UPD; + } - // Add implicit defs for super-registers. - for (unsigned i = 0, e = ImpDefs.size(); i != e; ++i) - MIB.addReg(ImpDefs[i], RegState::ImplicitDefine); + MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode)); - return true; -} + // Thumb1: we might need to set base writeback when building the MI. + MIB.addReg(Base, getDefRegState(true)) + .addReg(Base, getKillRegState(BaseKill)); -// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on -// success. -void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, - MemOpQueue &memOps, - unsigned memOpsBegin, unsigned memOpsEnd, - unsigned insertAfter, int Offset, - unsigned Base, bool BaseKill, - int Opcode, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned Scratch, - DebugLoc dl, - SmallVector &Merges) { - // First calculate which of the registers should be killed by the merged - // instruction. - const unsigned insertPos = memOps[insertAfter].Position; - SmallSet KilledRegs; - DenseMap Killer; - for (unsigned i = 0, e = memOps.size(); i != e; ++i) { - if (i == memOpsBegin) { - i = memOpsEnd; - if (i == e) - break; - } - if (memOps[i].Position < insertPos && memOps[i].isKill) { - unsigned Reg = memOps[i].Reg; - KilledRegs.insert(Reg); - Killer[Reg] = i; - } - } + // The base isn't dead after a merged instruction with writeback. + // Insert a sub instruction after the newly formed instruction to reset. + if (!BaseKill) + UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg); - SmallVector, 8> Regs; - SmallVector ImpDefs; - for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { - unsigned Reg = memOps[i].Reg; - // If we are inserting the merged operation after an operation that - // uses the same register, make sure to transfer any kill flag. - bool isKill = memOps[i].isKill || KilledRegs.count(Reg); - Regs.push_back(std::make_pair(Reg, isKill)); - - // Collect any implicit defs of super-registers. They must be preserved. - for (MIOperands MO(memOps[i].MBBI); MO.isValid(); ++MO) { - if (!MO->isReg() || !MO->isDef() || !MO->isImplicit() || MO->isDead()) - continue; - unsigned DefReg = MO->getReg(); - if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end()) - ImpDefs.push_back(DefReg); - } + } else { + // No writeback, simply build the MachineInstr. + MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode)); + MIB.addReg(Base, getKillRegState(BaseKill)); } - // Try to do the merge. - MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI; - ++Loc; - if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode, - Pred, PredReg, Scratch, dl, Regs, ImpDefs)) - return; - - // Merge succeeded, update records. - Merges.push_back(prior(Loc)); - for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { - // Remove kill flags from any memops that come before insertPos. - if (Regs[i-memOpsBegin].second) { - unsigned Reg = Regs[i-memOpsBegin].first; - if (KilledRegs.count(Reg)) { - unsigned j = Killer[Reg]; - int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true); - assert(Idx >= 0 && "Cannot find killing operand"); - memOps[j].MBBI->getOperand(Idx).setIsKill(false); - memOps[j].isKill = false; - } - memOps[i].isKill = true; - } - MBB.erase(memOps[i].MBBI); - // Update this memop to refer to the merged instruction. - // We may need to move kill flags again. - memOps[i].Merged = true; - memOps[i].MBBI = Merges.back(); - memOps[i].Position = insertPos; - } -} + MIB.addImm(Pred).addReg(PredReg); -/// MergeLDR_STR - Merge a number of load / store instructions into one or more -/// load / store multiple instructions. -void -ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, - unsigned Base, int Opcode, unsigned Size, - ARMCC::CondCodes Pred, unsigned PredReg, - unsigned Scratch, MemOpQueue &MemOps, - SmallVector &Merges) { - bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); - int Offset = MemOps[SIndex].Offset; - int SOffset = Offset; - unsigned insertAfter = SIndex; - MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI; - DebugLoc dl = Loc->getDebugLoc(); - const MachineOperand &PMO = Loc->getOperand(0); - unsigned PReg = PMO.getReg(); - unsigned PRegNum = PMO.isUndef() ? UINT_MAX - : getARMRegisterNumbering(PReg); - unsigned Count = 1; - unsigned Limit = ~0U; - - // vldm / vstm limit are 32 for S variants, 16 for D variants. + for (const std::pair &R : Regs) + MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second)); - switch (Opcode) { - default: break; - case ARM::VSTRS: - Limit = 32; - break; - case ARM::VSTRD: - Limit = 16; - break; - case ARM::VLDRD: - Limit = 16; - break; - case ARM::VLDRS: - Limit = 32; - break; + return MIB.getInstr(); +} + +MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base, + bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg, + DebugLoc DL, ArrayRef> Regs) const { + bool IsLoad = isi32Load(Opcode); + assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store"); + unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8; + + assert(Regs.size() == 2); + MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL, + TII->get(LoadStoreOpcode)); + if (IsLoad) { + MIB.addReg(Regs[0].first, RegState::Define) + .addReg(Regs[1].first, RegState::Define); + } else { + MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second)) + .addReg(Regs[1].first, getKillRegState(Regs[1].second)); } + MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); + return MIB.getInstr(); +} - for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) { - int NewOffset = MemOps[i].Offset; - const MachineOperand &MO = MemOps[i].MBBI->getOperand(0); +/// Call MergeOps and update MemOps and merges accordingly on success. +MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { + const MachineInstr *First = Cand.Instrs.front(); + unsigned Opcode = First->getOpcode(); + bool IsLoad = isLoadSingle(Opcode); + SmallVector, 8> Regs; + SmallVector ImpDefs; + DenseSet KilledRegs; + DenseSet UsedRegs; + // Determine list of registers and list of implicit super-register defs. + for (const MachineInstr *MI : Cand.Instrs) { + const MachineOperand &MO = getLoadStoreRegOp(*MI); unsigned Reg = MO.getReg(); - unsigned RegNum = MO.isUndef() ? UINT_MAX - : getARMRegisterNumbering(Reg); - // Register numbers must be in ascending order. For VFP / NEON load and - // store multiples, the registers must also be consecutive and within the - // limit on the number of registers per instruction. - if (Reg != ARM::SP && - NewOffset == Offset + (int)Size && - ((isNotVFP && RegNum > PRegNum) || - ((Count < Limit) && RegNum == PRegNum+1))) { - Offset += Size; - PRegNum = RegNum; - ++Count; - } else { - // Can't merge this in. Try merge the earlier ones first. - MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, - Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges); - MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch, - MemOps, Merges); - return; + bool IsKill = MO.isKill(); + if (IsKill) + KilledRegs.insert(Reg); + Regs.push_back(std::make_pair(Reg, IsKill)); + UsedRegs.insert(Reg); + + if (IsLoad) { + // Collect any implicit defs of super-registers, after merging we can't + // be sure anymore that we properly preserved these live ranges and must + // removed these implicit operands. + for (const MachineOperand &MO : MI->implicit_operands()) { + if (!MO.isReg() || !MO.isDef() || MO.isDead()) + continue; + assert(MO.isImplicit()); + unsigned DefReg = MO.getReg(); + + if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end()) + continue; + // We can ignore cases where the super-reg is read and written. + if (MI->readsRegister(DefReg)) + continue; + ImpDefs.push_back(DefReg); + } } - - if (MemOps[i].Position > MemOps[insertAfter].Position) - insertAfter = i; } - bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1; - MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset, - Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges); - return; -} - -static bool definesCPSR(MachineInstr *MI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) - continue; - if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead()) - // If the instruction has live CPSR def, then it's not safe to fold it - // into load / store. - return true; + // Attempt the merge. + typedef MachineBasicBlock::iterator iterator; + MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx]; + iterator InsertBefore = std::next(iterator(LatestMI)); + MachineBasicBlock &MBB = *LatestMI->getParent(); + unsigned Offset = getMemoryOpOffset(First); + unsigned Base = getLoadStoreBaseOp(*First).getReg(); + bool BaseKill = LatestMI->killsRegister(Base); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg); + DebugLoc DL = First->getDebugLoc(); + MachineInstr *Merged = nullptr; + if (Cand.CanMergeToLSDouble) + Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill, + Opcode, Pred, PredReg, DL, Regs); + if (!Merged && Cand.CanMergeToLSMulti) + Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill, + Opcode, Pred, PredReg, DL, Regs); + if (!Merged) + return nullptr; + + // Determine earliest instruction that will get removed. We then keep an + // iterator just above it so the following erases don't invalidated it. + iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]); + bool EarliestAtBegin = false; + if (EarliestI == MBB.begin()) { + EarliestAtBegin = true; + } else { + EarliestI = std::prev(EarliestI); } - return false; -} + // Remove instructions which have been merged. + for (MachineInstr *MI : Cand.Instrs) + MBB.erase(MI); -static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, unsigned Limit, - ARMCC::CondCodes Pred, unsigned PredReg) { - unsigned MyPredReg = 0; - if (!MI) - return false; + // Determine range between the earliest removed instruction and the new one. + if (EarliestAtBegin) + EarliestI = MBB.begin(); + else + EarliestI = std::next(EarliestI); + auto FixupRange = make_range(EarliestI, iterator(Merged)); + + if (isLoadSingle(Opcode)) { + // If the previous loads defined a super-reg, then we have to mark earlier + // operands undef; Replicate the super-reg def on the merged instruction. + for (MachineInstr &MI : FixupRange) { + for (unsigned &ImpDefReg : ImpDefs) { + for (MachineOperand &MO : MI.implicit_operands()) { + if (!MO.isReg() || MO.getReg() != ImpDefReg) + continue; + if (MO.readsReg()) + MO.setIsUndef(); + else if (MO.isDef()) + ImpDefReg = 0; + } + } + } - bool CheckCPSRDef = false; - switch (MI->getOpcode()) { - default: return false; - case ARM::t2SUBri: - case ARM::SUBri: - CheckCPSRDef = true; - // fallthrough - case ARM::tSUBspi: - break; + MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged); + for (unsigned ImpDef : ImpDefs) + MIB.addReg(ImpDef, RegState::ImplicitDefine); + } else { + // Remove kill flags: We are possibly storing the values later now. + assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD); + for (MachineInstr &MI : FixupRange) { + for (MachineOperand &MO : MI.uses()) { + if (!MO.isReg() || !MO.isKill()) + continue; + if (UsedRegs.count(MO.getReg())) + MO.setIsKill(false); + } + } + assert(ImpDefs.empty()); } - // Make sure the offset fits in 8 bits. - if (Bytes == 0 || (Limit && Bytes >= Limit)) - return false; - - unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME - if (!(MI->getOperand(0).getReg() == Base && - MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm()*Scale) == Bytes && - getInstrPredicate(MI, MyPredReg) == Pred && - MyPredReg == PredReg)) - return false; - - return CheckCPSRDef ? !definesCPSR(MI) : true; + return Merged; } -static bool isMatchingIncrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, unsigned Limit, - ARMCC::CondCodes Pred, unsigned PredReg) { - unsigned MyPredReg = 0; - if (!MI) - return false; - - bool CheckCPSRDef = false; - switch (MI->getOpcode()) { - default: return false; - case ARM::t2ADDri: - case ARM::ADDri: - CheckCPSRDef = true; - // fallthrough - case ARM::tADDspi: - break; - } +static bool isValidLSDoubleOffset(int Offset) { + unsigned Value = abs(Offset); + // t2LDRDi8/t2STRDi8 supports an 8 bit immediate which is internally + // multiplied by 4. + return (Value % 4) == 0 && Value < 1024; +} - if (Bytes == 0 || (Limit && Bytes >= Limit)) - // Make sure the offset fits in 8 bits. - return false; +/// Find candidates for load/store multiple merge in list of MemOpQueueEntries. +void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { + const MachineInstr *FirstMI = MemOps[0].MI; + unsigned Opcode = FirstMI->getOpcode(); + bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); + unsigned Size = getLSMultipleTransferSize(FirstMI); + + unsigned SIndex = 0; + unsigned EIndex = MemOps.size(); + do { + // Look at the first instruction. + const MachineInstr *MI = MemOps[SIndex].MI; + int Offset = MemOps[SIndex].Offset; + const MachineOperand &PMO = getLoadStoreRegOp(*MI); + unsigned PReg = PMO.getReg(); + unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg); + unsigned Latest = SIndex; + unsigned Earliest = SIndex; + unsigned Count = 1; + bool CanMergeToLSDouble = + STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset); + // ARM errata 602117: LDRD with base in list may result in incorrect base + // register when interrupted or faulted. + if (STI->isCortexM3() && isi32Load(Opcode) && + PReg == getLoadStoreBaseOp(*MI).getReg()) + CanMergeToLSDouble = false; + + bool CanMergeToLSMulti = true; + // On swift vldm/vstm starting with an odd register number as that needs + // more uops than single vldrs. + if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1) + CanMergeToLSMulti = false; + + // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it + // deprecated; LDM to PC is fine but cannot happen here. + if (PReg == ARM::SP || PReg == ARM::PC) + CanMergeToLSMulti = CanMergeToLSDouble = false; + + // Merge following instructions where possible. + for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) { + int NewOffset = MemOps[I].Offset; + if (NewOffset != Offset + (int)Size) + break; + const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI); + unsigned Reg = MO.getReg(); + if (Reg == ARM::SP || Reg == ARM::PC) + break; - unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME - if (!(MI->getOperand(0).getReg() == Base && - MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm()*Scale) == Bytes && - getInstrPredicate(MI, MyPredReg) == Pred && - MyPredReg == PredReg)) - return false; + // See if the current load/store may be part of a multi load/store. + unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg); + bool PartOfLSMulti = CanMergeToLSMulti; + if (PartOfLSMulti) { + // Register numbers must be in ascending order. + if (RegNum <= PRegNum) + PartOfLSMulti = false; + // For VFP / NEON load/store multiples, the registers must be + // consecutive and within the limit on the number of registers per + // instruction. + else if (!isNotVFP && RegNum != PRegNum+1) + PartOfLSMulti = false; + } + // See if the current load/store may be part of a double load/store. + bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1; - return CheckCPSRDef ? !definesCPSR(MI) : true; -} + if (!PartOfLSMulti && !PartOfLSDouble) + break; + CanMergeToLSMulti &= PartOfLSMulti; + CanMergeToLSDouble &= PartOfLSDouble; + // Track MemOp with latest and earliest position (Positions are + // counted in reverse). + unsigned Position = MemOps[I].Position; + if (Position < MemOps[Latest].Position) + Latest = I; + else if (Position > MemOps[Earliest].Position) + Earliest = I; + // Prepare for next MemOp. + Offset += Size; + PRegNum = RegNum; + } -static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) { - switch (MI->getOpcode()) { - default: return 0; - case ARM::LDRi12: - case ARM::STRi12: - case ARM::t2LDRi8: - case ARM::t2LDRi12: - case ARM::t2STRi8: - case ARM::t2STRi12: - case ARM::VLDRS: - case ARM::VSTRS: - return 4; - case ARM::VLDRD: - case ARM::VSTRD: - return 8; - case ARM::LDMIA: - case ARM::LDMDA: - case ARM::LDMDB: - case ARM::LDMIB: - case ARM::STMIA: - case ARM::STMDA: - case ARM::STMDB: - case ARM::STMIB: - case ARM::t2LDMIA: - case ARM::t2LDMDB: - case ARM::t2STMIA: - case ARM::t2STMDB: - case ARM::VLDMSIA: - case ARM::VSTMSIA: - return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4; - case ARM::VLDMDIA: - case ARM::VSTMDIA: - return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8; - } + // Form a candidate from the Ops collected so far. + MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate; + for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C) + Candidate->Instrs.push_back(MemOps[C].MI); + Candidate->LatestMIIdx = Latest - SIndex; + Candidate->EarliestMIIdx = Earliest - SIndex; + Candidate->InsertPos = MemOps[Latest].Position; + if (Count == 1) + CanMergeToLSMulti = CanMergeToLSDouble = false; + Candidate->CanMergeToLSMulti = CanMergeToLSMulti; + Candidate->CanMergeToLSDouble = CanMergeToLSDouble; + Candidates.push_back(Candidate); + // Continue after the chain. + SIndex += Count; + } while (SIndex < EIndex); } static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, @@ -697,8 +1080,77 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } } -/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base -/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: +/// Check if the given instruction increments or decrements a register and +/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags +/// generated by the instruction are possibly read as well. +static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg) { + bool CheckCPSRDef; + int Scale; + switch (MI.getOpcode()) { + case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break; + case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break; + case ARM::t2SUBri: + case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break; + case ARM::t2ADDri: + case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break; + case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break; + case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break; + default: return 0; + } + + unsigned MIPredReg; + if (MI.getOperand(0).getReg() != Reg || + MI.getOperand(1).getReg() != Reg || + getInstrPredicate(&MI, MIPredReg) != Pred || + MIPredReg != PredReg) + return 0; + + if (CheckCPSRDef && definesCPSR(&MI)) + return 0; + return MI.getOperand(2).getImm() * Scale; +} + +/// Searches for an increment or decrement of \p Reg before \p MBBI. +static MachineBasicBlock::iterator +findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (MBBI == BeginMBBI) + return EndMBBI; + + // Skip debug values. + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI) + --PrevMBBI; + + Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg); + return Offset == 0 ? EndMBBI : PrevMBBI; +} + +/// Searches for a increment or decrement of \p Reg after \p MBBI. +static MachineBasicBlock::iterator +findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + // Skip debug values. + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if (NextMBBI == EndMBBI) + return EndMBBI; + + Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg); + return Offset == 0 ? EndMBBI : NextMBBI; +} + +/// Fold proceeding/trailing inc/dec of base register into the +/// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: /// /// stmia rn, /// rn := rn + 4 * 3; @@ -709,18 +1161,17 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, /// ldmia rn, /// => /// ldmdb rn!, -bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - bool &Advance, - MachineBasicBlock::iterator &I) { - MachineInstr *MI = MBBI; - unsigned Base = MI->getOperand(0).getReg(); - bool BaseKill = MI->getOperand(0).isKill(); - unsigned Bytes = getLSMultipleTransferSize(MI); +bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { + // Thumb1 is already using updating loads/stores. + if (isThumb1) return false; + + const MachineOperand &BaseOP = MI->getOperand(0); + unsigned Base = BaseOP.getReg(); + bool BaseKill = BaseOP.isKill(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - int Opcode = MI->getOpcode(); - DebugLoc dl = MI->getDebugLoc(); + unsigned Opcode = MI->getOpcode(); + DebugLoc DL = MI->getDebugLoc(); // Can't use an updating ld/st if the base register is also a dest // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined. @@ -728,55 +1179,27 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, if (MI->getOperand(i).getReg() == Base) return false; - bool DoMerge = false; - ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode); - - // Try merging with the previous instruction. - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); - if (MBBI != BeginMBBI) { - MachineBasicBlock::iterator PrevMBBI = prior(MBBI); - while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) - --PrevMBBI; - if (Mode == ARM_AM::ia && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - Mode = ARM_AM::db; - DoMerge = true; - } else if (Mode == ARM_AM::ib && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - Mode = ARM_AM::da; - DoMerge = true; - } - if (DoMerge) - MBB.erase(PrevMBBI); - } - - // Try merging with the next instruction. - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (!DoMerge && MBBI != EndMBBI) { - MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && - isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && - isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } - if (DoMerge) { - if (NextMBBI == I) { - Advance = true; - ++I; - } - MBB.erase(NextMBBI); - } + int Bytes = getLSMultipleTransferSize(MI); + MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock::iterator MBBI(MI); + int Offset; + MachineBasicBlock::iterator MergeInstr + = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); + ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); + if (Mode == ARM_AM::ia && Offset == -Bytes) { + Mode = ARM_AM::db; + } else if (Mode == ARM_AM::ib && Offset == -Bytes) { + Mode = ARM_AM::da; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) && + ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) + return false; } - - if (!DoMerge) - return false; + MBB.erase(MergeInstr); unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) .addReg(Base, getDefRegState(true)) // WB base register .addReg(Base, getKillRegState(BaseKill)) .addImm(Pred).addReg(PredReg); @@ -842,19 +1265,17 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc, } } -/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base -/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible: -bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const TargetInstrInfo *TII, - bool &Advance, - MachineBasicBlock::iterator &I) { - MachineInstr *MI = MBBI; - unsigned Base = MI->getOperand(1).getReg(); - bool BaseKill = MI->getOperand(1).isKill(); - unsigned Bytes = getLSMultipleTransferSize(MI); - int Opcode = MI->getOpcode(); - DebugLoc dl = MI->getDebugLoc(); +/// Fold proceeding/trailing inc/dec of base register into the +/// LDR/STR/FLD{D|S}/FST{D|S} op when possible: +bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { + // Thumb1 doesn't have updating LDR/STR. + // FIXME: Use LDM/STM with single register instead. + if (isThumb1) return false; + + unsigned Base = getLoadStoreBaseOp(*MI).getReg(); + bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); + unsigned Opcode = MI->getOpcode(); + DebugLoc DL = MI->getDebugLoc(); bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || Opcode == ARM::VSTRD || Opcode == ARM::VSTRS); bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12); @@ -864,72 +1285,45 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) return false; - bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD; // Can't do the merge if the destination register is the same as the would-be // writeback register. - if (isLd && MI->getOperand(0).getReg() == Base) + if (MI->getOperand(0).getReg() == Base) return false; unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - bool DoMerge = false; - ARM_AM::AddrOpc AddSub = ARM_AM::add; - unsigned NewOpc = 0; - // AM2 - 12 bits, thumb2 - 8 bits. - unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); - - // Try merging with the previous instruction. - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); - if (MBBI != BeginMBBI) { - MachineBasicBlock::iterator PrevMBBI = prior(MBBI); - while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) - --PrevMBBI; - if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { - DoMerge = true; - AddSub = ARM_AM::sub; - } else if (!isAM5 && - isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) { - DoMerge = true; - } - if (DoMerge) { - NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub); - MBB.erase(PrevMBBI); - } - } - - // Try merging with the next instruction. - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (!DoMerge && MBBI != EndMBBI) { - MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI); - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if (!isAM5 && - isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { - DoMerge = true; - AddSub = ARM_AM::sub; - } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) { - DoMerge = true; - } - if (DoMerge) { - NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub); - if (NextMBBI == I) { - Advance = true; - ++I; - } - MBB.erase(NextMBBI); - } + int Bytes = getLSMultipleTransferSize(MI); + MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock::iterator MBBI(MI); + int Offset; + MachineBasicBlock::iterator MergeInstr + = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); + unsigned NewOpc; + if (!isAM5 && Offset == Bytes) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add); + } else if (Offset == -Bytes) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == Bytes) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add); + } else if (!isAM5 && Offset == -Bytes) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); + } else + return false; } + MBB.erase(MergeInstr); - if (!DoMerge) - return false; + ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add; + bool isLd = isLoadSingle(Opcode); if (isAM5) { - // VLDM[SD}_UPD, VSTM[SD]_UPD + // VLDM[SD]_UPD, VSTM[SD]_UPD // (There are no base-updating versions of VLDR/VSTR instructions, but the // updating load/store-multiple instructions can be used with only one // register.) MachineOperand &MO = MI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII->get(NewOpc)) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) .addReg(Base, getDefRegState(true)) // WB base register .addReg(Base, getKillRegState(isLd ? BaseKill : false)) .addImm(Pred).addReg(PredReg) @@ -939,20 +1333,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, if (isAM2) { // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } else { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2LDR_PRE, t2LDR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg()) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } @@ -962,15 +1354,14 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, // the vestigal zero-reg offset register. When that's fixed, this clause // can be removed entirely. if (isAM2 && NewOpc == ARM::STR_POST_IMM) { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2STR_PRE, t2STR_POST - BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base) + BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } @@ -980,101 +1371,125 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, return true; } -/// isMemoryOp - Returns true if instruction is a memory operation that this -/// pass is capable of operating on. -static bool isMemoryOp(const MachineInstr *MI) { - // When no memory operands are present, conservatively assume unaligned, - // volatile, unfoldable. - if (!MI->hasOneMemOperand()) +bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) && + "Must have t2STRDi8 or t2LDRDi8"); + if (MI.getOperand(3).getImm() != 0) return false; - const MachineMemOperand *MMO = *MI->memoperands_begin(); - - // Don't touch volatile memory accesses - we may be changing their order. - if (MMO->isVolatile()) + // Behaviour for writeback is undefined if base register is the same as one + // of the others. + const MachineOperand &BaseOp = MI.getOperand(2); + unsigned Base = BaseOp.getReg(); + const MachineOperand &Reg0Op = MI.getOperand(0); + const MachineOperand &Reg1Op = MI.getOperand(1); + if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) return false; - // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is - // not. - if (MMO->getAlignment() < 4) - return false; + unsigned PredReg; + ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); + MachineBasicBlock::iterator MBBI(MI); + MachineBasicBlock &MBB = *MI.getParent(); + int Offset; + MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred, + PredReg, Offset); + unsigned NewOpc; + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST; + } else + return false; + } + MBB.erase(MergeInstr); - // str could probably be eliminated entirely, but for now we just want - // to avoid making a mess of it. - // FIXME: Use str as a wildcard to enable better stm folding. - if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && - MI->getOperand(0).isUndef()) - return false; + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) { + MIB.addOperand(Reg0Op).addOperand(Reg1Op) + .addReg(BaseOp.getReg(), RegState::Define); + } else { + assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST); + MIB.addReg(BaseOp.getReg(), RegState::Define) + .addOperand(Reg0Op).addOperand(Reg1Op); + } + MIB.addReg(BaseOp.getReg(), RegState::Kill) + .addImm(Offset).addImm(Pred).addReg(PredReg); + assert(TII->get(Opcode).getNumOperands() == 6 && + TII->get(NewOpc).getNumOperands() == 7 && + "Unexpected number of operands in Opcode specification."); - // Likewise don't mess with references to undefined addresses. - if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() && - MI->getOperand(1).isUndef()) - return false; + // Transfer implicit operands. + for (const MachineOperand &MO : MI.implicit_operands()) + MIB.addOperand(MO); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + + MBB.erase(MBBI); + return true; +} - int Opcode = MI->getOpcode(); +/// Returns true if instruction is a memory operation that this pass is capable +/// of operating on. +static bool isMemoryOp(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); switch (Opcode) { - default: break; case ARM::VLDRS: case ARM::VSTRS: - return MI->getOperand(1).isReg(); case ARM::VLDRD: case ARM::VSTRD: - return MI->getOperand(1).isReg(); case ARM::LDRi12: case ARM::STRi12: + case ARM::tLDRi: + case ARM::tSTRi: + case ARM::tLDRspi: + case ARM::tSTRspi: case ARM::t2LDRi8: case ARM::t2LDRi12: case ARM::t2STRi8: case ARM::t2STRi12: - return MI->getOperand(1).isReg(); + break; + default: + return false; } - return false; -} + if (!MI.getOperand(1).isReg()) + return false; -/// AdvanceRS - Advance register scavenger to just before the earliest memory -/// op that is being merged. -void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { - MachineBasicBlock::iterator Loc = MemOps[0].MBBI; - unsigned Position = MemOps[0].Position; - for (unsigned i = 1, e = MemOps.size(); i != e; ++i) { - if (MemOps[i].Position < Position) { - Position = MemOps[i].Position; - Loc = MemOps[i].MBBI; - } - } + // When no memory operands are present, conservatively assume unaligned, + // volatile, unfoldable. + if (!MI.hasOneMemOperand()) + return false; - if (Loc != MBB.begin()) - RS->forward(prior(Loc)); -} + const MachineMemOperand &MMO = **MI.memoperands_begin(); -static int getMemoryOpOffset(const MachineInstr *MI) { - int Opcode = MI->getOpcode(); - bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; - unsigned NumOperands = MI->getDesc().getNumOperands(); - unsigned OffField = MI->getOperand(NumOperands-3).getImm(); + // Don't touch volatile memory accesses - we may be changing their order. + if (MMO.isVolatile()) + return false; - if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || - Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || - Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 || - Opcode == ARM::LDRi12 || Opcode == ARM::STRi12) - return OffField; + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO.getAlignment() < 4) + return false; - int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField) - : ARM_AM::getAM5Offset(OffField) * 4; - if (isAM3) { - if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub) - Offset = -Offset; - } else { - if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub) - Offset = -Offset; - } - return Offset; + // str could probably be eliminated entirely, but for now we just want + // to avoid making a mess of it. + // FIXME: Use str as a wildcard to enable better stm folding. + if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef()) + return false; + + // Likewise don't mess with references to undefined addresses. + if (MI.getOperand(1).isUndef()) + return false; + + return true; } static void InsertLDR_STR(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, int Offset, bool isDef, - DebugLoc dl, unsigned NewOpc, + DebugLoc DL, unsigned NewOpc, unsigned Reg, bool RegDeadKill, bool RegUndef, unsigned BaseReg, bool BaseKill, bool BaseUndef, bool OffKill, bool OffUndef, @@ -1099,289 +1514,276 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) { MachineInstr *MI = &*MBBI; unsigned Opcode = MI->getOpcode(); - if (Opcode == ARM::LDRD || Opcode == ARM::STRD || - Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) { - const MachineOperand &BaseOp = MI->getOperand(2); - unsigned BaseReg = BaseOp.getReg(); - unsigned EvenReg = MI->getOperand(0).getReg(); - unsigned OddReg = MI->getOperand(1).getReg(); - unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false); - unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false); - // ARM errata 602117: LDRD with base in list may result in incorrect base - // register when interrupted or faulted. - bool Errata602117 = EvenReg == BaseReg && STI->isCortexM3(); - if (!Errata602117 && - ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)) - return false; + if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8) + return false; - MachineBasicBlock::iterator NewBBI = MBBI; - bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; - bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; - bool EvenDeadKill = isLd ? - MI->getOperand(0).isDead() : MI->getOperand(0).isKill(); - bool EvenUndef = MI->getOperand(0).isUndef(); - bool OddDeadKill = isLd ? - MI->getOperand(1).isDead() : MI->getOperand(1).isKill(); - bool OddUndef = MI->getOperand(1).isUndef(); - bool BaseKill = BaseOp.isKill(); - bool BaseUndef = BaseOp.isUndef(); - bool OffKill = isT2 ? false : MI->getOperand(3).isKill(); - bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef(); - int OffImm = getMemoryOpOffset(MI); - unsigned PredReg = 0; - ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - - if (OddRegNum > EvenRegNum && OffImm == 0) { - // Ascending register numbers and no offset. It's safe to change it to a - // ldm or stm. - unsigned NewOpc = (isLd) - ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA) - : (isT2 ? ARM::t2STMIA : ARM::STMIA); - if (isLd) { - BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) - .addReg(BaseReg, getKillRegState(BaseKill)) - .addImm(Pred).addReg(PredReg) - .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill)) - .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)); - ++NumLDRD2LDM; - } else { - BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) - .addReg(BaseReg, getKillRegState(BaseKill)) - .addImm(Pred).addReg(PredReg) - .addReg(EvenReg, - getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef)) - .addReg(OddReg, - getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); - ++NumSTRD2STM; - } - NewBBI = llvm::prior(MBBI); + const MachineOperand &BaseOp = MI->getOperand(2); + unsigned BaseReg = BaseOp.getReg(); + unsigned EvenReg = MI->getOperand(0).getReg(); + unsigned OddReg = MI->getOperand(1).getReg(); + unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false); + unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false); + + // ARM errata 602117: LDRD with base in list may result in incorrect base + // register when interrupted or faulted. + bool Errata602117 = EvenReg == BaseReg && + (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3(); + // ARM LDRD/STRD needs consecutive registers. + bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) && + (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum); + + if (!Errata602117 && !NonConsecutiveRegs) + return false; + + bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8; + bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8; + bool EvenDeadKill = isLd ? + MI->getOperand(0).isDead() : MI->getOperand(0).isKill(); + bool EvenUndef = MI->getOperand(0).isUndef(); + bool OddDeadKill = isLd ? + MI->getOperand(1).isDead() : MI->getOperand(1).isKill(); + bool OddUndef = MI->getOperand(1).isUndef(); + bool BaseKill = BaseOp.isKill(); + bool BaseUndef = BaseOp.isUndef(); + bool OffKill = isT2 ? false : MI->getOperand(3).isKill(); + bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef(); + int OffImm = getMemoryOpOffset(MI); + unsigned PredReg = 0; + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); + + if (OddRegNum > EvenRegNum && OffImm == 0) { + // Ascending register numbers and no offset. It's safe to change it to a + // ldm or stm. + unsigned NewOpc = (isLd) + ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA) + : (isT2 ? ARM::t2STMIA : ARM::STMIA); + if (isLd) { + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) + .addReg(BaseReg, getKillRegState(BaseKill)) + .addImm(Pred).addReg(PredReg) + .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill)) + .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill)); + ++NumLDRD2LDM; } else { - // Split into two instructions. - unsigned NewOpc = (isLd) - ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) - : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); - // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset, - // so adjust and use t2LDRi12 here for that. - unsigned NewOpc2 = (isLd) - ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) - : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); - DebugLoc dl = MBBI->getDebugLoc(); - // If this is a load and base register is killed, it may have been - // re-defed by the load, make sure the first load does not clobber it. - if (isLd && - (BaseKill || OffKill) && - (TRI->regsOverlap(EvenReg, BaseReg))) { - assert(!TRI->regsOverlap(OddReg, BaseReg)); - InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, - OddReg, OddDeadKill, false, - BaseReg, false, BaseUndef, false, OffUndef, - Pred, PredReg, TII, isT2); - NewBBI = llvm::prior(MBBI); - InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, - EvenReg, EvenDeadKill, false, - BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, - Pred, PredReg, TII, isT2); - } else { - if (OddReg == EvenReg && EvenDeadKill) { - // If the two source operands are the same, the kill marker is - // probably on the first one. e.g. - // t2STRDi8 %R5, %R5, %R9, 0, 14, %reg0 - EvenDeadKill = false; - OddDeadKill = true; - } - // Never kill the base register in the first instruction. - // - if (EvenReg == BaseReg) - EvenDeadKill = false; - InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, - EvenReg, EvenDeadKill, EvenUndef, - BaseReg, false, BaseUndef, false, OffUndef, - Pred, PredReg, TII, isT2); - NewBBI = llvm::prior(MBBI); - InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, - OddReg, OddDeadKill, OddUndef, - BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, - Pred, PredReg, TII, isT2); + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc)) + .addReg(BaseReg, getKillRegState(BaseKill)) + .addImm(Pred).addReg(PredReg) + .addReg(EvenReg, + getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef)) + .addReg(OddReg, + getKillRegState(OddDeadKill) | getUndefRegState(OddUndef)); + ++NumSTRD2STM; + } + } else { + // Split into two instructions. + unsigned NewOpc = (isLd) + ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) + : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); + // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset, + // so adjust and use t2LDRi12 here for that. + unsigned NewOpc2 = (isLd) + ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) + : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); + DebugLoc dl = MBBI->getDebugLoc(); + // If this is a load and base register is killed, it may have been + // re-defed by the load, make sure the first load does not clobber it. + if (isLd && + (BaseKill || OffKill) && + (TRI->regsOverlap(EvenReg, BaseReg))) { + assert(!TRI->regsOverlap(OddReg, BaseReg)); + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, + OddReg, OddDeadKill, false, + BaseReg, false, BaseUndef, false, OffUndef, + Pred, PredReg, TII, isT2); + InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, + EvenReg, EvenDeadKill, false, + BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, + Pred, PredReg, TII, isT2); + } else { + if (OddReg == EvenReg && EvenDeadKill) { + // If the two source operands are the same, the kill marker is + // probably on the first one. e.g. + // t2STRDi8 %R5, %R5, %R9, 0, 14, %reg0 + EvenDeadKill = false; + OddDeadKill = true; } - if (isLd) - ++NumLDRD2LDR; - else - ++NumSTRD2STR; + // Never kill the base register in the first instruction. + if (EvenReg == BaseReg) + EvenDeadKill = false; + InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, + EvenReg, EvenDeadKill, EvenUndef, + BaseReg, false, BaseUndef, false, OffUndef, + Pred, PredReg, TII, isT2); + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, + OddReg, OddDeadKill, OddUndef, + BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, + Pred, PredReg, TII, isT2); } - - MBB.erase(MI); - MBBI = NewBBI; - return true; + if (isLd) + ++NumLDRD2LDR; + else + ++NumSTRD2STR; } - return false; + + MBBI = MBB.erase(MBBI); + return true; } -/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR -/// ops of the same base and incrementing offset into LDM / STM ops. +/// An optimization pass to turn multiple LDR / STR ops of the same base and +/// incrementing offset into LDM / STM ops. bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { - unsigned NumMerges = 0; - unsigned NumMemOps = 0; MemOpQueue MemOps; unsigned CurrBase = 0; - int CurrOpc = -1; - unsigned CurrSize = 0; + unsigned CurrOpc = ~0u; ARMCC::CondCodes CurrPred = ARMCC::AL; - unsigned CurrPredReg = 0; unsigned Position = 0; - SmallVector Merges; - - RS->enterBasicBlock(&MBB); - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { + assert(Candidates.size() == 0); + assert(MergeBaseCandidates.size() == 0); + LiveRegsValid = false; + + for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); + I = MBBI) { + // The instruction in front of the iterator is the one we look at. + MBBI = std::prev(I); if (FixInvalidRegPairOp(MBB, MBBI)) continue; + ++Position; - bool Advance = false; - bool TryMerge = false; - bool Clobber = false; - - bool isMemOp = isMemoryOp(MBBI); - if (isMemOp) { - int Opcode = MBBI->getOpcode(); - unsigned Size = getLSMultipleTransferSize(MBBI); + if (isMemoryOp(*MBBI)) { + unsigned Opcode = MBBI->getOpcode(); const MachineOperand &MO = MBBI->getOperand(0); unsigned Reg = MO.getReg(); - bool isKill = MO.isDef() ? false : MO.isKill(); - unsigned Base = MBBI->getOperand(1).getReg(); + unsigned Base = getLoadStoreBaseOp(*MBBI).getReg(); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg); int Offset = getMemoryOpOffset(MBBI); - // Watch out for: - // r4 := ldr [r5] - // r5 := ldr [r5, #4] - // r6 := ldr [r5, #8] - // - // The second ldr has effectively broken the chain even though it - // looks like the later ldr(s) use the same base register. Try to - // merge the ldr's so far, including this one. But don't try to - // combine the following ldr(s). - Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg()); - if (CurrBase == 0 && !Clobber) { + if (CurrBase == 0) { // Start of a new chain. CurrBase = Base; CurrOpc = Opcode; - CurrSize = Size; CurrPred = Pred; - CurrPredReg = PredReg; - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); - ++NumMemOps; - Advance = true; - } else { - if (Clobber) { - TryMerge = true; - Advance = true; + MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position)); + continue; + } + // Note: No need to match PredReg in the next if. + if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { + // Watch out for: + // r4 := ldr [r0, #8] + // r4 := ldr [r0, #4] + // or + // r0 := ldr [r0] + // If a load overrides the base register or a register loaded by + // another load in our chain, we cannot take this instruction. + bool Overlap = false; + if (isLoadSingle(Opcode)) { + Overlap = (Base == Reg); + if (!Overlap) { + for (const MemOpQueueEntry &E : MemOps) { + if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) { + Overlap = true; + break; + } + } + } } - if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) { - // No need to match PredReg. - // Continue adding to the queue. + if (!Overlap) { + // Check offset and sort memory operation into the current chain. if (Offset > MemOps.back().Offset) { - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; - Advance = true; + MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position)); + continue; } else { - for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); - I != E; ++I) { - if (Offset < I->Offset) { - MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; - Advance = true; + MemOpQueue::iterator MI, ME; + for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) { + if (Offset < MI->Offset) { + // Found a place to insert. break; - } else if (Offset == I->Offset) { - // Collision! This can't be merged! + } + if (Offset == MI->Offset) { + // Collision, abort. + MI = ME; break; } } + if (MI != MemOps.end()) { + MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position)); + continue; + } } } } + + // Don't advance the iterator; The op will start a new chain next. + MBBI = I; + --Position; + // Fallthrough to look into existing chain. + } else if (MBBI->isDebugValue()) { + continue; + } else if (MBBI->getOpcode() == ARM::t2LDRDi8 || + MBBI->getOpcode() == ARM::t2STRDi8) { + // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions + // remember them because we may still be able to merge add/sub into them. + MergeBaseCandidates.push_back(MBBI); } - if (MBBI->isDebugValue()) { - ++MBBI; - if (MBBI == E) - // Reach the end of the block, try merging the memory instructions. - TryMerge = true; - } else if (Advance) { - ++Position; - ++MBBI; - if (MBBI == E) - // Reach the end of the block, try merging the memory instructions. - TryMerge = true; - } else - TryMerge = true; - - if (TryMerge) { - if (NumMemOps > 1) { - // Try to find a free register to use as a new base in case it's needed. - // First advance to the instruction just before the start of the chain. - AdvanceRS(MBB, MemOps); - // Find a scratch register. - unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass); - // Process the load / store instructions. - RS->forward(prior(MBBI)); - - // Merge ops. - Merges.clear(); - MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize, - CurrPred, CurrPredReg, Scratch, MemOps, Merges); - - // Try folding preceding/trailing base inc/dec into the generated - // LDM/STM ops. - for (unsigned i = 0, e = Merges.size(); i < e; ++i) - if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI)) - ++NumMerges; - NumMerges += Merges.size(); - - // Try folding preceding/trailing base inc/dec into those load/store - // that were not merged to form LDM/STM ops. - for (unsigned i = 0; i != NumMemOps; ++i) - if (!MemOps[i].Merged) - if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI)) - ++NumMerges; - - // RS may be pointing to an instruction that's deleted. - RS->skipTo(prior(MBBI)); - } else if (NumMemOps == 1) { - // Try folding preceding/trailing base inc/dec into the single - // load/store. - if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) { - ++NumMerges; - RS->forward(prior(MBBI)); - } - } + // If we are here then the chain is broken; Extract candidates for a merge. + if (MemOps.size() > 0) { + FormCandidates(MemOps); + // Reset for the next chain. CurrBase = 0; - CurrOpc = -1; - CurrSize = 0; + CurrOpc = ~0u; CurrPred = ARMCC::AL; - CurrPredReg = 0; - if (NumMemOps) { - MemOps.clear(); - NumMemOps = 0; - } + MemOps.clear(); + } + } + if (MemOps.size() > 0) + FormCandidates(MemOps); - // If iterator hasn't been advanced and this is not a memory op, skip it. - // It can't start a new chain anyway. - if (!Advance && !isMemOp && MBBI != E) { - ++Position; - ++MBBI; + // Sort candidates so they get processed from end to begin of the basic + // block later; This is necessary for liveness calculation. + auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) { + return M0->InsertPos < M1->InsertPos; + }; + std::sort(Candidates.begin(), Candidates.end(), LessThan); + + // Go through list of candidates and merge. + bool Changed = false; + for (const MergeCandidate *Candidate : Candidates) { + if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) { + MachineInstr *Merged = MergeOpsUpdate(*Candidate); + // Merge preceding/trailing base inc/dec into the merged op. + if (Merged) { + Changed = true; + unsigned Opcode = Merged->getOpcode(); + if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8) + MergeBaseUpdateLSDouble(*Merged); + else + MergeBaseUpdateLSMultiple(Merged); + } else { + for (MachineInstr *MI : Candidate->Instrs) { + if (MergeBaseUpdateLoadStore(MI)) + Changed = true; + } } + } else { + assert(Candidate->Instrs.size() == 1); + if (MergeBaseUpdateLoadStore(Candidate->Instrs.front())) + Changed = true; } } - return NumMerges > 0; + Candidates.clear(); + // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt. + for (MachineInstr *MI : MergeBaseCandidates) + MergeBaseUpdateLSDouble(*MI); + MergeBaseCandidates.clear(); + + return Changed; } -/// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops -/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it -/// directly restore the value of LR into pc. +/// If this is a exit BB, try merging the return ops ("bx lr" and "mov pc, lr") +/// into the preceding stack restore so it directly restore the value of LR +/// into pc. /// ldmfd sp!, {..., lr} /// bx lr /// or @@ -1390,6 +1792,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { /// => /// ldmfd sp!, {..., pc} bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { + // Thumb1 LDM doesn't allow high registers. + if (isThumb1) return false; if (MBB.empty()) return false; MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); @@ -1397,7 +1801,11 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET || MBBI->getOpcode() == ARM::MOVPCLR)) { - MachineInstr *PrevMI = prior(MBBI); + MachineBasicBlock::iterator PrevI = std::prev(MBBI); + // Ignore any DBG_VALUE instructions. + while (PrevI->isDebugValue() && PrevI != MBB.begin()) + --PrevI; + MachineInstr *PrevMI = PrevI; unsigned Opcode = PrevMI->getOpcode(); if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD || Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD || @@ -1410,7 +1818,7 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!"); PrevMI->setDesc(TII->get(NewOpc)); MO.setReg(ARM::PC); - PrevMI->copyImplicitOps(&*MBBI); + PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI); MBB.erase(MBBI); return true; } @@ -1418,49 +1826,84 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { return false; } +bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + if (MBBI == MBB.begin() || MBBI == MBB.end() || + MBBI->getOpcode() != ARM::tBX_RET) + return false; + + MachineBasicBlock::iterator Prev = MBBI; + --Prev; + if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR)) + return false; + + for (auto Use : Prev->uses()) + if (Use.isKill()) { + AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX)) + .addReg(Use.getReg(), RegState::Kill)) + .copyImplicitOps(&*MBBI); + MBB.erase(MBBI); + MBB.erase(Prev); + return true; + } + + llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?"); +} + bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - const TargetMachine &TM = Fn.getTarget(); + MF = &Fn; + STI = &static_cast(Fn.getSubtarget()); + TL = STI->getTargetLowering(); AFI = Fn.getInfo(); - TII = TM.getInstrInfo(); - TRI = TM.getRegisterInfo(); - STI = &TM.getSubtarget(); - RS = new RegScavenger(); + TII = STI->getInstrInfo(); + TRI = STI->getRegisterInfo(); + + RegClassInfoValid = false; isThumb2 = AFI->isThumb2Function(); + isThumb1 = AFI->isThumbFunction() && !isThumb2; bool Modified = false; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { MachineBasicBlock &MBB = *MFI; Modified |= LoadStoreMultipleOpti(MBB); - if (TM.getSubtarget().hasV5TOps()) + if (STI->hasV5TOps()) Modified |= MergeReturnIntoLDM(MBB); + if (isThumb1) + Modified |= CombineMovBx(MBB); } - delete RS; + Allocator.DestroyAll(); return Modified; } +namespace llvm { +void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); +} -/// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move -/// load / stores from consecutive locations close to make it more -/// likely they will be combined later. +#define ARM_PREALLOC_LOAD_STORE_OPT_NAME \ + "ARM pre- register allocation load / store optimization pass" namespace { + /// Pre- register allocation pass that move load / stores from consecutive + /// locations close to make it more likely they will be combined later. struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ static char ID; - ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {} + ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) { + initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry()); + } - const TargetData *TD; + const DataLayout *TD; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const ARMSubtarget *STI; MachineRegisterInfo *MRI; MachineFunction *MF; - virtual bool runOnMachineFunction(MachineFunction &Fn); + bool runOnMachineFunction(MachineFunction &Fn) override; - virtual const char *getPassName() const { - return "ARM pre- register allocation load / store optimization pass"; + const char *getPassName() const override { + return ARM_PREALLOC_LOAD_STORE_OPT_NAME; } private: @@ -1471,7 +1914,7 @@ namespace { unsigned &PredReg, ARMCC::CondCodes &Pred, bool &isT2); bool RescheduleOps(MachineBasicBlock *MBB, - SmallVector &Ops, + SmallVectorImpl &Ops, unsigned Base, bool isLd, DenseMap &MI2LocMap); bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB); @@ -1479,18 +1922,20 @@ namespace { char ARMPreAllocLoadStoreOpt::ID = 0; } +INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt", + ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) + bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - TD = Fn.getTarget().getTargetData(); - TII = Fn.getTarget().getInstrInfo(); - TRI = Fn.getTarget().getRegisterInfo(); - STI = &Fn.getTarget().getSubtarget(); + TD = &Fn.getDataLayout(); + STI = &static_cast(Fn.getSubtarget()); + TII = STI->getInstrInfo(); + TRI = STI->getRegisterInfo(); MRI = &Fn.getRegInfo(); MF = &Fn; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) - Modified |= RescheduleLoadStoreInstrs(MFI); + for (MachineBasicBlock &MFI : Fn) + Modified |= RescheduleLoadStoreInstrs(&MFI); return Modified; } @@ -1498,7 +1943,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, MachineBasicBlock::iterator I, MachineBasicBlock::iterator E, - SmallPtrSet &MemOps, + SmallPtrSetImpl &MemOps, SmallSet &MemRegs, const TargetRegisterInfo *TRI) { // Are there stores / loads / calls between them? @@ -1541,29 +1986,13 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, return AddedRegPressure.size() <= MemRegs.size() * 2; } - -/// Copy Op0 and Op1 operands into a new array assigned to MI. -static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0, - MachineInstr *Op1) { - assert(MI->memoperands_empty() && "expected a new machineinstr"); - size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) - + (Op1->memoperands_end() - Op1->memoperands_begin()); - - MachineFunction *MF = MI->getParent()->getParent(); - MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs); - MachineSDNode::mmo_iterator MemEnd = - std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin); - MemEnd = - std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd); - MI->setMemRefs(MemBegin, MemEnd); -} - bool ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, - DebugLoc &dl, - unsigned &NewOpc, unsigned &EvenReg, - unsigned &OddReg, unsigned &BaseReg, - int &Offset, unsigned &PredReg, + DebugLoc &dl, unsigned &NewOpc, + unsigned &FirstReg, + unsigned &SecondReg, + unsigned &BaseReg, int &Offset, + unsigned &PredReg, ARMCC::CondCodes &Pred, bool &isT2) { // Make sure we're allowed to generate LDRD/STRD. @@ -1573,11 +2002,11 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD unsigned Scale = 1; unsigned Opcode = Op0->getOpcode(); - if (Opcode == ARM::LDRi12) + if (Opcode == ARM::LDRi12) { NewOpc = ARM::LDRD; - else if (Opcode == ARM::STRi12) + } else if (Opcode == ARM::STRi12) { NewOpc = ARM::STRD; - else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) { + } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) { NewOpc = ARM::t2LDRDi8; Scale = 4; isT2 = true; @@ -1585,12 +2014,14 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, NewOpc = ARM::t2STRDi8; Scale = 4; isT2 = true; - } else + } else { return false; + } // Make sure the base address satisfies i64 ld / st alignment requirement. + // At the moment, we ignore the memoryoperand's value. + // If we want to use AliasAnalysis, we should check it accordingly. if (!Op0->hasOneMemOperand() || - !(*Op0->memoperands_begin())->getValue() || (*Op0->memoperands_begin())->isVolatile()) return false; @@ -1620,9 +2051,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, return false; Offset = ARM_AM::getAM3Opc(AddSub, OffImm); } - EvenReg = Op0->getOperand(0).getReg(); - OddReg = Op1->getOperand(0).getReg(); - if (EvenReg == OddReg) + FirstReg = Op0->getOperand(0).getReg(); + SecondReg = Op1->getOperand(0).getReg(); + if (FirstReg == SecondReg) return false; BaseReg = Op0->getOperand(1).getReg(); Pred = getInstrPredicate(Op0, PredReg); @@ -1630,25 +2061,20 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, return true; } -namespace { - struct OffsetCompare { - bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const { - int LOffset = getMemoryOpOffset(LHS); - int ROffset = getMemoryOpOffset(RHS); - assert(LHS == RHS || LOffset != ROffset); - return LOffset > ROffset; - } - }; -} - bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, - SmallVector &Ops, + SmallVectorImpl &Ops, unsigned Base, bool isLd, DenseMap &MI2LocMap) { bool RetVal = false; // Sort by offset (in reverse order). - std::sort(Ops.begin(), Ops.end(), OffsetCompare()); + std::sort(Ops.begin(), Ops.end(), + [](const MachineInstr *LHS, const MachineInstr *RHS) { + int LOffset = getMemoryOpOffset(LHS); + int ROffset = getMemoryOpOffset(RHS); + assert(LHS == RHS || LOffset != ROffset); + return LOffset > ROffset; + }); // The loads / stores of the same base are in order. Scan them from first to // last and check for the following: @@ -1657,8 +2083,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, while (Ops.size() > 1) { unsigned FirstLoc = ~0U; unsigned LastLoc = 0; - MachineInstr *FirstOp = 0; - MachineInstr *LastOp = 0; + MachineInstr *FirstOp = nullptr; + MachineInstr *LastOp = nullptr; int LastOffset = 0; unsigned LastOpcode = 0; unsigned LastBytes = 0; @@ -1723,7 +2149,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, // to try to allocate a pair of registers that can form register pairs. MachineInstr *Op0 = Ops.back(); MachineInstr *Op1 = Ops[Ops.size()-2]; - unsigned EvenReg = 0, OddReg = 0; + unsigned FirstReg = 0, SecondReg = 0; unsigned BaseReg = 0, PredReg = 0; ARMCC::CondCodes Pred = ARMCC::AL; bool isT2 = false; @@ -1731,21 +2157,21 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, int Offset = 0; DebugLoc dl; if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc, - EvenReg, OddReg, BaseReg, + FirstReg, SecondReg, BaseReg, Offset, PredReg, Pred, isT2)) { Ops.pop_back(); Ops.pop_back(); const MCInstrDesc &MCID = TII->get(NewOpc); const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF); - MRI->constrainRegClass(EvenReg, TRC); - MRI->constrainRegClass(OddReg, TRC); + MRI->constrainRegClass(FirstReg, TRC); + MRI->constrainRegClass(SecondReg, TRC); // Form the pair instruction. if (isLd) { MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID) - .addReg(EvenReg, RegState::Define) - .addReg(OddReg, RegState::Define) + .addReg(FirstReg, RegState::Define) + .addReg(SecondReg, RegState::Define) .addReg(BaseReg); // FIXME: We're converting from LDRi12 to an insn that still // uses addrmode2, so we need an explicit offset reg. It should @@ -1753,13 +2179,13 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - concatenateMemOperands(MIB, Op0, Op1); + MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumLDRDFormed; } else { MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID) - .addReg(EvenReg) - .addReg(OddReg) + .addReg(FirstReg) + .addReg(SecondReg) .addReg(BaseReg); // FIXME: We're converting from LDRi12 to an insn that still // uses addrmode2, so we need an explicit offset reg. It should @@ -1767,16 +2193,18 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, if (!isT2) MIB.addReg(0); MIB.addImm(Offset).addImm(Pred).addReg(PredReg); - concatenateMemOperands(MIB, Op0, Op1); + MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1)); DEBUG(dbgs() << "Formed " << *MIB << "\n"); ++NumSTRDFormed; } MBB->erase(Op0); MBB->erase(Op1); - // Add register allocation hints to form register pairs. - MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg); - MRI->setRegAllocationHint(OddReg, ARMRI::RegPairOdd, EvenReg); + if (!isT2) { + // Add register allocation hints to form register pairs. + MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg); + MRI->setRegAllocationHint(SecondReg, ARMRI::RegPairOdd, FirstReg); + } } else { for (unsigned i = 0; i != NumMove; ++i) { MachineInstr *Op = Ops.back(); @@ -1819,14 +2247,14 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { if (!MI->isDebugValue()) MI2LocMap[MI] = ++Loc; - if (!isMemoryOp(MI)) + if (!isMemoryOp(*MI)) continue; unsigned PredReg = 0; if (getInstrPredicate(MI, PredReg) != ARMCC::AL) continue; int Opc = MI->getOpcode(); - bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD; + bool isLd = isLoadSingle(Opc); unsigned Base = MI->getOperand(1).getReg(); int Offset = getMemoryOpOffset(MI); @@ -1844,9 +2272,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { if (!StopHere) BI->second.push_back(MI); } else { - SmallVector MIs; - MIs.push_back(MI); - Base2LdsMap[Base] = MIs; + Base2LdsMap[Base].push_back(MI); LdBases.push_back(Base); } } else { @@ -1862,9 +2288,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { if (!StopHere) BI->second.push_back(MI); } else { - SmallVector MIs; - MIs.push_back(MI); - Base2StsMap[Base] = MIs; + Base2StsMap[Base].push_back(MI); StBases.push_back(Base); } } @@ -1880,7 +2304,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { // Re-schedule loads. for (unsigned i = 0, e = LdBases.size(); i != e; ++i) { unsigned Base = LdBases[i]; - SmallVector &Lds = Base2LdsMap[Base]; + SmallVectorImpl &Lds = Base2LdsMap[Base]; if (Lds.size() > 1) RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap); } @@ -1888,7 +2312,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { // Re-schedule stores. for (unsigned i = 0, e = StBases.size(); i != e; ++i) { unsigned Base = StBases[i]; - SmallVector &Sts = Base2StsMap[Base]; + SmallVectorImpl &Sts = Base2StsMap[Base]; if (Sts.size() > 1) RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap); } @@ -1905,10 +2329,10 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { } -/// createARMLoadStoreOptimizationPass - returns an instance of the load / store -/// optimization pass. +/// Returns an instance of the load / store optimization pass. FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { if (PreAlloc) return new ARMPreAllocLoadStoreOpt(); return new ARMLoadStoreOpt(); } +