1 //===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 /// \file This file contains a pass that performs load / store related peephole
11 /// optimizations. This pass should be run after register allocation.
13 //===----------------------------------------------------------------------===//
16 #include "ARMBaseInstrInfo.h"
17 #include "ARMBaseRegisterInfo.h"
18 #include "ARMISelLowering.h"
19 #include "ARMMachineFunctionInfo.h"
20 #include "ARMSubtarget.h"
21 #include "MCTargetDesc/ARMAddressingModes.h"
22 #include "ThumbRegisterInfo.h"
23 #include "llvm/ADT/DenseMap.h"
24 #include "llvm/ADT/STLExtras.h"
25 #include "llvm/ADT/SmallPtrSet.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/CodeGen/MachineBasicBlock.h"
30 #include "llvm/CodeGen/MachineFunctionPass.h"
31 #include "llvm/CodeGen/MachineInstr.h"
32 #include "llvm/CodeGen/MachineInstrBuilder.h"
33 #include "llvm/CodeGen/MachineRegisterInfo.h"
34 #include "llvm/CodeGen/RegisterClassInfo.h"
35 #include "llvm/CodeGen/SelectionDAGNodes.h"
36 #include "llvm/CodeGen/LivePhysRegs.h"
37 #include "llvm/IR/DataLayout.h"
38 #include "llvm/IR/DerivedTypes.h"
39 #include "llvm/IR/Function.h"
40 #include "llvm/Support/Allocator.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/ErrorHandling.h"
43 #include "llvm/Support/raw_ostream.h"
44 #include "llvm/Target/TargetInstrInfo.h"
45 #include "llvm/Target/TargetMachine.h"
46 #include "llvm/Target/TargetRegisterInfo.h"
49 #define DEBUG_TYPE "arm-ldst-opt"
51 STATISTIC(NumLDMGened , "Number of ldm instructions generated");
52 STATISTIC(NumSTMGened , "Number of stm instructions generated");
53 STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
54 STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
55 STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
56 STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
57 STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
58 STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm");
59 STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
60 STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
61 STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");
64 void initializeARMLoadStoreOptPass(PassRegistry &);
67 #define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass"
/// Post- register allocation pass the combine load / store instructions to
/// form ldm / stm instructions.
///
/// NOTE(review): several lines of this class appear to be missing from this
/// copy (e.g. 'static char ID;' referenced by the ID initializer below, the
/// 'MachineInstr *MI;' member of MemOpQueueEntry, the 'unsigned InsertPos;'
/// member of MergeCandidate, and various closing braces) — restore from the
/// original file before building.
struct ARMLoadStoreOpt : public MachineFunctionPass {
  ARMLoadStoreOpt() : MachineFunctionPass(ID) {
    initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry());

  // Target and analysis info cached by runOnMachineFunction.
  const MachineFunction *MF;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const MachineRegisterInfo *MRI;
  const ARMSubtarget *STI;
  const TargetLowering *TL;

  LivePhysRegs LiveRegs;           // scratch liveness state; see moveLiveRegsBefore
  RegisterClassInfo RegClassInfo;  // lazily initialized; see RegClassInfoValid
  MachineBasicBlock::const_iterator LiveRegPos; // position LiveRegs is valid at

  bool RegClassInfoValid;
  bool isThumb1, isThumb2;         // subtarget mode flags for current function

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return ARM_LOAD_STORE_OPT_NAME;

  /// A set of load/store MachineInstrs with same base register sorted by
  /// offset.
  struct MemOpQueueEntry {
    int Offset; ///< Load/Store offset.
    unsigned Position; ///< Position as counted from end of basic block.
    MemOpQueueEntry(MachineInstr *MI, int Offset, unsigned Position)
        : MI(MI), Offset(Offset), Position(Position) {}

  typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;

  /// A set of MachineInstrs that fulfill (nearly all) conditions to get
  /// merged into a LDM/STM.
  struct MergeCandidate {
    /// List of instructions ordered by load/store offset.
    SmallVector<MachineInstr*, 4> Instrs;
    /// Index in Instrs of the instruction being latest in the schedule.
    unsigned LatestMIIdx;
    /// Index in Instrs of the instruction being earliest in the schedule.
    unsigned EarliestMIIdx;
    /// Index into the basic block where the merged instruction will be
    /// inserted. (See MemOpQueueEntry.Position)
    /// Whether the instructions can be merged into a ldm/stm instruction.
    bool CanMergeToLSMulti;
    /// Whether the instructions can be merged into a ldrd/strd instruction.
    bool CanMergeToLSDouble;

  SpecificBumpPtrAllocator<MergeCandidate> Allocator;
  SmallVector<const MergeCandidate*,4> Candidates;
  SmallVector<MachineInstr*,4> MergeBaseCandidates;

  // Worker routines; definitions follow later in the file.
  void moveLiveRegsBefore(const MachineBasicBlock &MBB,
                          MachineBasicBlock::const_iterator Before);
  unsigned findFreeReg(const TargetRegisterClass &RegClass);
  void UpdateBaseRegUses(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI,
                         DebugLoc DL, unsigned Base, unsigned WordOffset,
                         ARMCC::CondCodes Pred, unsigned PredReg);
  MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
      MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
      bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
      DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
  MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
      MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
      bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
      DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
  void FormCandidates(const MemOpQueue &MemOps);
  MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
  bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator &MBBI);
  bool MergeBaseUpdateLoadStore(MachineInstr *MI);
  bool MergeBaseUpdateLSMultiple(MachineInstr *MI);
  bool MergeBaseUpdateLSDouble(MachineInstr &MI) const;
  bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
  bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
// Pass identification: the address of ID is the unique pass identifier.
char ARMLoadStoreOpt::ID = 0;

// Register the pass with the PassRegistry under "arm-load-store-opt".
INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false)
161 static bool definesCPSR(const MachineInstr *MI) {
162 for (const auto &MO : MI->operands()) {
165 if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
166 // If the instruction has live CPSR def, then it's not safe to fold it
167 // into load / store.
174 static int getMemoryOpOffset(const MachineInstr *MI) {
175 unsigned Opcode = MI->getOpcode();
176 bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
177 unsigned NumOperands = MI->getDesc().getNumOperands();
178 unsigned OffField = MI->getOperand(NumOperands-3).getImm();
180 if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
181 Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
182 Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
183 Opcode == ARM::LDRi12 || Opcode == ARM::STRi12)
186 // Thumb1 immediate offsets are scaled by 4
187 if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi ||
188 Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
191 int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
192 : ARM_AM::getAM5Offset(OffField) * 4;
193 ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField)
194 : ARM_AM::getAM5Op(OffField);
196 if (Op == ARM_AM::sub)
202 static const MachineOperand &getLoadStoreBaseOp(const MachineInstr &MI) {
203 return MI.getOperand(1);
206 static const MachineOperand &getLoadStoreRegOp(const MachineInstr &MI) {
207 return MI.getOperand(0);
/// Map a single load/store opcode plus an addressing submode to the
/// corresponding load/store-multiple opcode, or 0 where only the base
/// writeback ("_UPD") form exists.
///
/// NOTE(review): the outer 'switch (Opcode)', its 'case' labels and the
/// statistics updates that select each inner submode switch are missing from
/// this copy — only the inner switches survive; restore from the original.
static int getLoadStoreMultipleOpcode(unsigned Opcode, ARM_AM::AMSubMode Mode) {
  default: llvm_unreachable("Unhandled opcode!");
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
  // tLDMIA is writeback-only - unless the base register is in the input
  // reglist.
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tLDMIA;
  // There is no non-writeback tSTMIA either.
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tSTMIA_UPD;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
/// Map a load/store-multiple opcode back to its addressing submode.
///
/// NOTE(review): the enclosing 'switch' brace, most non-writeback case labels
/// and the 'return ARM_AM::ia;' / 'return ARM_AM::db;' result statements are
/// missing from this copy — only case labels survive; restore from original.
static ARM_AM::AMSubMode getLoadStoreMultipleSubMode(unsigned Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA_UPD:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB_UPD:
349 static bool isT1i32Load(unsigned Opc) {
350 return Opc == ARM::tLDRi || Opc == ARM::tLDRspi;
353 static bool isT2i32Load(unsigned Opc) {
354 return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
357 static bool isi32Load(unsigned Opc) {
358 return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc) ;
361 static bool isT1i32Store(unsigned Opc) {
362 return Opc == ARM::tSTRi || Opc == ARM::tSTRspi;
365 static bool isT2i32Store(unsigned Opc) {
366 return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
369 static bool isi32Store(unsigned Opc) {
370 return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
373 static bool isLoadSingle(unsigned Opc) {
374 return isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
377 static unsigned getImmScale(unsigned Opc) {
379 default: llvm_unreachable("Unhandled opcode!");
/// Return the number of bytes a load/store-multiple transfers: the length of
/// the variadic register list times the per-register size.
///
/// NOTE(review): most case labels of this switch (and the closing braces) are
/// missing from this copy; only the two size computations survive.
static unsigned getLSMultipleTransferSize(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
    // Core-register lists: 4 bytes per listed register.
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
    // D-register lists: 8 bytes per listed register.
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
/// Update future uses of the base register with the offset introduced
/// due to writeback. This function only works on Thumb1.
///
/// NOTE(review): several statements are missing from this copy (the leading
/// 'void', the 'unsigned WordOffset' parameter, the 'bool IsLoad/IsStore ='
/// prefixes, 'int Offset;' and 'MachineOperand &MO' declarations, plus
/// assorted braces and control statements). Comments below document only the
/// surviving logic; restore from the original file.
ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   DebugLoc DL, unsigned Base,
                                   ARMCC::CondCodes Pred, unsigned PredReg) {
  assert(isThumb1 && "Can only update base register uses for Thumb1!");
  // Start updating any instructions with immediate offsets. Insert a SUB before
  // the first non-updateable instruction (if any).
  for (; MBBI != MBB.end(); ++MBBI) {
    bool InsertSub = false;
    unsigned Opc = MBBI->getOpcode();

    if (MBBI->readsRegister(Base)) {
      // (missing 'bool IsLoad =' here) — immediate-offset Thumb1 loads.
        Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi;
      // (missing 'bool IsStore =' here) — immediate-offset Thumb1 stores.
        Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi;

      if (IsLoad || IsStore) {
        // Loads and stores with immediate offsets can be updated, but only if
        // the new offset isn't negative.
        // The MachineOperand containing the offset immediate is the last one
        // before predicates.
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        // The offsets are scaled by 1, 2 or 4 depending on the Opcode.
        Offset = MO.getImm() - WordOffset * getImmScale(Opc);

        // If storing the base register, it needs to be reset first.
        unsigned InstrSrcReg = getLoadStoreRegOp(*MBBI).getReg();

        if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
      } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
                 !definesCPSR(MBBI)) {
        // SUBS/ADDS using this register, with a dead def of the CPSR.
        // Merge it with the update; if the merged offset is too large,
        // insert a new sub instead.
          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
        Offset = (Opc == ARM::tSUBi8) ?
          MO.getImm() + WordOffset * 4 :
          MO.getImm() - WordOffset * 4 ;
        if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) {
          // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if
          // Offset == 0.

          // The base register has now been reset, so exit early.

        // Can't update the instruction.
    } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) {
      // Since SUBS sets the condition flags, we can't place the base reset
      // after an instruction that has a live CPSR def.
      // The base register might also contain an argument for a function call.

      // An instruction above couldn't be updated, so insert a sub.
      AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
        .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);

    if (MBBI->killsRegister(Base) || MBBI->definesRegister(Base))
      // Register got killed. Stop updating.

  // End of block was reached.
  if (MBB.succ_size() > 0) {
    // FIXME: Because of a bug, live registers are sometimes missing from
    // the successor blocks' live-in sets. This means we can't trust that
    // information and *always* have to reset at the end of a block.
    if (MBBI != MBB.end()) --MBBI;
      BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
      .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
534 /// Return the first register of class \p RegClass that is not in \p Regs.
535 unsigned ARMLoadStoreOpt::findFreeReg(const TargetRegisterClass &RegClass) {
536 if (!RegClassInfoValid) {
537 RegClassInfo.runOnMachineFunction(*MF);
538 RegClassInfoValid = true;
541 for (unsigned Reg : RegClassInfo.getOrder(&RegClass))
542 if (!LiveRegs.contains(Reg))
547 /// Compute live registers just before instruction \p Before (in normal schedule
548 /// direction). Computes backwards so multiple queries in the same block must
549 /// come in reverse order.
550 void ARMLoadStoreOpt::moveLiveRegsBefore(const MachineBasicBlock &MBB,
551 MachineBasicBlock::const_iterator Before) {
552 // Initialize if we never queried in this block.
553 if (!LiveRegsValid) {
555 LiveRegs.addLiveOuts(&MBB, true);
556 LiveRegPos = MBB.end();
557 LiveRegsValid = true;
559 // Move backward just before the "Before" position.
560 while (LiveRegPos != Before) {
562 LiveRegs.stepBackward(*LiveRegPos);
566 static bool ContainsReg(const ArrayRef<std::pair<unsigned, bool>> &Regs,
568 for (const std::pair<unsigned, bool> &R : Regs)
/// Create and insert a LDM or STM with Base as base register and registers in
/// Regs as the register operands that would be loaded / stored. It returns
/// true if the transformation is done.
///
/// NOTE(review): many statements are missing from this copy (the declarations
/// of 'NewBase' and 'BaseOpc', 'Mode =' assignments, several early
/// 'return nullptr;' lines, 'if (Writeback)' branches and closing braces).
/// Comments document the surviving logic only; restore from the original.
MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
  unsigned NumRegs = Regs.size();

  // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
  // Compute liveness information for that register to make the decision.
  bool SafeToClobberCPSR = !isThumb1 ||
    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, InsertBefore, 20) ==
     MachineBasicBlock::LQR_Dead);

  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.

  // Exception: If the base register is in the input reglist, Thumb1 LDM is
  // non-writeback.
  // It's also not possible to merge an STR of the base register in Thumb1.
  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
    assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
    if (Opcode == ARM::tLDRi) {
    } else if (Opcode == ARM::tSTRi) {

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;

  if (Offset == 4 && haveIBAndDA) {
  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
    // VLDM/VSTM do not support DB mode without also updating the base reg.
  } else if (Offset != 0 || Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
    // Check if this is a supported opcode before inserting instructions to
    // calculate a new base register.
    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return nullptr;

    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.

    // On Thumb1, it's not worth materializing a new base register without
    // clobbering the CPSR (i.e. not using ADDS/SUBS).
    if (!SafeToClobberCPSR)

    if (isi32Load(Opcode)) {
      // If it is a load, then just use one of the destination register to
      // use as the new base.
      NewBase = Regs[NumRegs-1].first;
      // Find a free register that we can use as scratch register.
      moveLiveRegsBefore(MBB, InsertBefore);
      // The merged instruction does not exist yet but will use several Regs if
      // it is a store.
      if (!isLoadSingle(Opcode))
        for (const std::pair<unsigned, bool> &R : Regs)
          LiveRegs.addReg(R.first);

      NewBase = findFreeReg(isThumb1 ? ARM::tGPRRegClass : ARM::GPRRegClass);

      // (missing 'BaseOpc =' LHS) — pick the add opcode per target mode.
      isThumb2 ? ARM::t2ADDri :
      (isThumb1 && Base == ARM::SP) ? ARM::tADDrSPi :
      (isThumb1 && Offset < 8) ? ARM::tADDi3 :
      isThumb1 ? ARM::tADDi8 : ARM::ADDri;

      // (missing 'BaseOpc =' LHS) — pick the sub opcode per target mode.
      isThumb2 ? ARM::t2SUBri :
      (isThumb1 && Offset < 8 && Base != ARM::SP) ? ARM::tSUBi3 :
      isThumb1 ? ARM::tSUBi8 : ARM::SUBri;

    if (!TL->isLegalAddImmediate(Offset))
      // FIXME: Try add with register operand?
      return nullptr; // Probably not worth it then.

    // We can only append a kill flag to the add/sub input if the value is not
    // used in the register list of the stm as well.
    bool KillOldBase = BaseKill &&
      (!isi32Store(Opcode) || !ContainsReg(Regs, Base));

      // Thumb1: depending on immediate size, use either
      //   ADDS NewBase, Base, #imm3
      // or
      //   ADDS NewBase, #imm8.
      if (Base != NewBase &&
          (BaseOpc == ARM::tADDi8 || BaseOpc == ARM::tSUBi8)) {
        // Need to insert a MOV to the new base first.
        if (isARMLowRegister(NewBase) && isARMLowRegister(Base) &&
          // thumbv4t doesn't have lo->lo copies, and we can't predicate tMOVSr
          if (Pred != ARMCC::AL)
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVSr), NewBase)
            .addReg(Base, getKillRegState(KillOldBase));
          BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
            .addReg(Base, getKillRegState(KillOldBase))
            .addImm(Pred).addReg(PredReg);
        // The following ADDS/SUBS becomes an update.

      if (BaseOpc == ARM::tADDrSPi) {
        assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
          .addImm(Pred).addReg(PredReg);
        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
          .addImm(Pred).addReg(PredReg);
        BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
          .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
          .addImm(Pred).addReg(PredReg).addReg(0);

    BaseKill = true; // New base is always killed straight away.

  bool isDef = isLoadSingle(Opcode);

  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
  // base register writeback.
  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);

  // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
  // - There is no writeback (LDM of base register),
  // - the base register is killed by the merged instruction,
  // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
  //   to reset the base register.
  // Otherwise, don't merge.
  // It's safe to return here since the code to materialize a new base register
  // above is also conditional on SafeToClobberCPSR.
  if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)

  MachineInstrBuilder MIB;

    if (Opcode == ARM::tLDMIA)
      // Update tLDMIA with writeback if necessary.
      Opcode = ARM::tLDMIA_UPD;

    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));

    // Thumb1: we might need to set base writeback when building the MI.
    MIB.addReg(Base, getDefRegState(true))
       .addReg(Base, getKillRegState(BaseKill));

    // The base isn't dead after a merged instruction with writeback.
    // Insert a sub instruction after the newly formed instruction to reset.
      UpdateBaseRegUses(MBB, InsertBefore, DL, Base, NumRegs, Pred, PredReg);

    // No writeback, simply build the MachineInstr.
    MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode));
    MIB.addReg(Base, getKillRegState(BaseKill));

  MIB.addImm(Pred).addReg(PredReg);

  for (const std::pair<unsigned, bool> &R : Regs)
    MIB.addReg(R.first, getDefRegState(isDef) | getKillRegState(R.second));

  return MIB.getInstr();
769 MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
770 MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
771 bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
772 DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
773 bool IsLoad = isi32Load(Opcode);
774 assert((IsLoad || isi32Store(Opcode)) && "Must have integer load or store");
775 unsigned LoadStoreOpcode = IsLoad ? ARM::t2LDRDi8 : ARM::t2STRDi8;
777 assert(Regs.size() == 2);
778 MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
779 TII->get(LoadStoreOpcode));
781 MIB.addReg(Regs[0].first, RegState::Define)
782 .addReg(Regs[1].first, RegState::Define);
784 MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
785 .addReg(Regs[1].first, getKillRegState(Regs[1].second));
787 MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
788 return MIB.getInstr();
/// Call MergeOps and update MemOps and merges accordingly on success.
///
/// NOTE(review): several statements are missing from this copy ('if' bodies,
/// 'continue;' / 'return nullptr;' lines, the erase of the merged
/// instructions, declarations such as 'int Offset' cast, and closing braces).
/// Comments document only the surviving logic; restore from the original.
MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
  const MachineInstr *First = Cand.Instrs.front();
  unsigned Opcode = First->getOpcode();
  bool IsLoad = isLoadSingle(Opcode);
  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  SmallVector<unsigned, 4> ImpDefs;
  DenseSet<unsigned> KilledRegs;
  DenseSet<unsigned> UsedRegs;
  // Determine list of registers and list of implicit super-register defs.
  for (const MachineInstr *MI : Cand.Instrs) {
    const MachineOperand &MO = getLoadStoreRegOp(*MI);
    unsigned Reg = MO.getReg();
    bool IsKill = MO.isKill();
      KilledRegs.insert(Reg);
    Regs.push_back(std::make_pair(Reg, IsKill));
    UsedRegs.insert(Reg);

    // Collect any implicit defs of super-registers, after merging we can't
    // be sure anymore that we properly preserved these live ranges and must
    // removed these implicit operands.
    for (const MachineOperand &MO : MI->implicit_operands()) {
      if (!MO.isReg() || !MO.isDef() || MO.isDead())
      assert(MO.isImplicit());
      unsigned DefReg = MO.getReg();

      if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) != ImpDefs.end())
      // We can ignore cases where the super-reg is read and written.
      if (MI->readsRegister(DefReg))
      ImpDefs.push_back(DefReg);

  // Attempt the merge.
  typedef MachineBasicBlock::iterator iterator;
  MachineInstr *LatestMI = Cand.Instrs[Cand.LatestMIIdx];
  iterator InsertBefore = std::next(iterator(LatestMI));
  MachineBasicBlock &MBB = *LatestMI->getParent();
  unsigned Offset = getMemoryOpOffset(First);
  unsigned Base = getLoadStoreBaseOp(*First).getReg();
  bool BaseKill = LatestMI->killsRegister(Base);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
  DebugLoc DL = First->getDebugLoc();
  MachineInstr *Merged = nullptr;
  if (Cand.CanMergeToLSDouble)
    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
                                   Opcode, Pred, PredReg, DL, Regs);
  if (!Merged && Cand.CanMergeToLSMulti)
    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
                                  Opcode, Pred, PredReg, DL, Regs);

  // Determine earliest instruction that will get removed. We then keep an
  // iterator just above it so the following erases don't invalidated it.
  iterator EarliestI(Cand.Instrs[Cand.EarliestMIIdx]);
  bool EarliestAtBegin = false;
  if (EarliestI == MBB.begin()) {
    EarliestAtBegin = true;
    EarliestI = std::prev(EarliestI);

  // Remove instructions which have been merged.
  for (MachineInstr *MI : Cand.Instrs)

  // Determine range between the earliest removed instruction and the new one.
    EarliestI = MBB.begin();
    EarliestI = std::next(EarliestI);
  auto FixupRange = make_range(EarliestI, iterator(Merged));

  if (isLoadSingle(Opcode)) {
    // If the previous loads defined a super-reg, then we have to mark earlier
    // operands undef; Replicate the super-reg def on the merged instruction.
    for (MachineInstr &MI : FixupRange) {
      for (unsigned &ImpDefReg : ImpDefs) {
        for (MachineOperand &MO : MI.implicit_operands()) {
          if (!MO.isReg() || MO.getReg() != ImpDefReg)

    MachineInstrBuilder MIB(*Merged->getParent()->getParent(), Merged);
    for (unsigned ImpDef : ImpDefs)
      MIB.addReg(ImpDef, RegState::ImplicitDefine);

    // Remove kill flags: We are possibly storing the values later now.
    assert(isi32Store(Opcode) || Opcode == ARM::VSTRS || Opcode == ARM::VSTRD);
    for (MachineInstr &MI : FixupRange) {
      for (MachineOperand &MO : MI.uses()) {
        if (!MO.isReg() || !MO.isKill())
        if (UsedRegs.count(MO.getReg()))

    assert(ImpDefs.empty());
/// Return true if \p Offset can be encoded by t2LDRDi8/t2STRDi8: the
/// instruction carries an 8-bit immediate which is internally multiplied by
/// 4, so the magnitude must be a multiple of 4 strictly below 1024.
static bool isValidLSDoubleOffset(int Offset) {
  unsigned Value = abs(Offset);
  return (Value % 4) == 0 && Value < 1024;
}
/// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
///
/// NOTE(review): the enclosing 'do { ... }' scaffolding with the 'SIndex' and
/// 'Count' declarations, several 'break;' lines and some assignments are
/// missing from this copy. Comments document only the surviving logic.
void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
  const MachineInstr *FirstMI = MemOps[0].MI;
  unsigned Opcode = FirstMI->getOpcode();
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  unsigned Size = getLSMultipleTransferSize(FirstMI);
  unsigned EIndex = MemOps.size();
    // Look at the first instruction.
    const MachineInstr *MI = MemOps[SIndex].MI;
    int Offset = MemOps[SIndex].Offset;
    const MachineOperand &PMO = getLoadStoreRegOp(*MI);
    unsigned PReg = PMO.getReg();
    unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
    unsigned Latest = SIndex;
    unsigned Earliest = SIndex;

    bool CanMergeToLSDouble =
      STI->isThumb2() && isNotVFP && isValidLSDoubleOffset(Offset);
    // ARM errata 602117: LDRD with base in list may result in incorrect base
    // register when interrupted or faulted.
    if (STI->isCortexM3() && isi32Load(Opcode) &&
        PReg == getLoadStoreBaseOp(*MI).getReg())
      CanMergeToLSDouble = false;

    bool CanMergeToLSMulti = true;
    // On swift vldm/vstm starting with an odd register number as that needs
    // more uops than single vldrs.
    if (STI->isSwift() && !isNotVFP && (PRegNum % 2) == 1)
      CanMergeToLSMulti = false;

    // LDRD/STRD do not allow SP/PC. LDM/STM do not support it or have it
    // deprecated; LDM to PC is fine but cannot happen here.
    if (PReg == ARM::SP || PReg == ARM::PC)
      CanMergeToLSMulti = CanMergeToLSDouble = false;

    // Merge following instructions where possible.
    for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
      int NewOffset = MemOps[I].Offset;
      if (NewOffset != Offset + (int)Size)
      const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
      unsigned Reg = MO.getReg();
      if (Reg == ARM::SP || Reg == ARM::PC)

      // See if the current load/store may be part of a multi load/store.
      unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
      bool PartOfLSMulti = CanMergeToLSMulti;

        // Register numbers must be in ascending order.
        if (RegNum <= PRegNum)
          PartOfLSMulti = false;
        // For VFP / NEON load/store multiples, the registers must be
        // consecutive and within the limit on the number of registers per
        // instruction.
        else if (!isNotVFP && RegNum != PRegNum+1)
          PartOfLSMulti = false;

      // See if the current load/store may be part of a double load/store.
      bool PartOfLSDouble = CanMergeToLSDouble && Count <= 1;

      if (!PartOfLSMulti && !PartOfLSDouble)
      CanMergeToLSMulti &= PartOfLSMulti;
      CanMergeToLSDouble &= PartOfLSDouble;
      // Track MemOp with latest and earliest position (Positions are
      // counted in reverse).
      unsigned Position = MemOps[I].Position;
      if (Position < MemOps[Latest].Position)
      else if (Position > MemOps[Earliest].Position)
      // Prepare for next MemOp.

    // Form a candidate from the Ops collected so far.
    MergeCandidate *Candidate = new(Allocator.Allocate()) MergeCandidate;
    for (unsigned C = SIndex, CE = SIndex + Count; C < CE; ++C)
      Candidate->Instrs.push_back(MemOps[C].MI);
    Candidate->LatestMIIdx = Latest - SIndex;
    Candidate->EarliestMIIdx = Earliest - SIndex;
    Candidate->InsertPos = MemOps[Latest].Position;
      CanMergeToLSMulti = CanMergeToLSDouble = false;
    Candidate->CanMergeToLSMulti = CanMergeToLSMulti;
    Candidate->CanMergeToLSDouble = CanMergeToLSDouble;
    Candidates.push_back(Candidate);
    // Continue after the chain.
  } while (SIndex < EIndex);
/// Map a load/store-multiple opcode plus addressing submode to the
/// corresponding base-writeback ("_UPD") opcode.
///
/// NOTE(review): the outer 'switch (Opc)' and the case labels selecting each
/// inner submode switch are missing from this copy; restore from original.
static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                            ARM_AM::AMSubMode Mode) {
  default: llvm_unreachable("Unhandled opcode!");
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA_UPD;
    case ARM_AM::ib: return ARM::LDMIB_UPD;
    case ARM_AM::da: return ARM::LDMDA_UPD;
    case ARM_AM::db: return ARM::LDMDB_UPD;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA_UPD;
    case ARM_AM::ib: return ARM::STMIB_UPD;
    case ARM_AM::da: return ARM::STMDA_UPD;
    case ARM_AM::db: return ARM::STMDB_UPD;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
    case ARM_AM::db: return ARM::t2LDMDB_UPD;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA_UPD;
    case ARM_AM::db: return ARM::t2STMDB_UPD;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
    case ARM_AM::db: return ARM::VLDMSDB_UPD;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
    case ARM_AM::db: return ARM::VLDMDDB_UPD;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
    case ARM_AM::db: return ARM::VSTMSDB_UPD;
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
    case ARM_AM::db: return ARM::VSTMDDB_UPD;
1079 /// Check if the given instruction increments or decrements a register and
1080 /// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
1081 /// generated by the instruction are possibly read as well.
// Returns the signed byte delta applied to \p Reg (positive = increment,
// negative = decrement), or 0 when MI is not a matching simple
// "Reg = Reg +/- imm" under the predicate (\p Pred, \p PredReg).
1082 static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
1083 ARMCC::CondCodes Pred, unsigned PredReg) {
// Scale converts the instruction's immediate operand into a byte offset;
// its sign encodes add vs. subtract. CheckCPSRDef marks opcodes whose
// flag-setting variants must be rejected below.
// NOTE(review): the +/-4 scale on the Thumb opcodes presumably reflects a
// word-scaled immediate encoding — confirm against the instruction defs.
1086 switch (MI.getOpcode()) {
1087 case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break;
1088 case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break;
1090 case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break;
1092 case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break;
1093 case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break;
1094 case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break;
// Reject unless the instruction reads and writes exactly \p Reg and runs
// under the same predicate. (The early-return body is on an elided line.)
1099 if (MI.getOperand(0).getReg() != Reg ||
1100 MI.getOperand(1).getReg() != Reg ||
1101 getInstrPredicate(&MI, MIPredReg) != Pred ||
1102 MIPredReg != PredReg)
// A flag-setting add/sub may have its CPSR result consumed elsewhere, so
// it cannot be folded away as a plain pointer update.
1105 if (CheckCPSRDef && definesCPSR(&MI))
1107 return MI.getOperand(2).getImm() * Scale;
1110 /// Searches for an increment or decrement of \p Reg before \p MBBI.
// On success returns an iterator to the inc/dec instruction and sets
// \p Offset to its signed byte delta; otherwise returns MBB.end() with
// \p Offset left at 0.
1111 static MachineBasicBlock::iterator
1112 findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
1113 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
1115 MachineBasicBlock &MBB = *MBBI->getParent();
1116 MachineBasicBlock::iterator BeginMBBI = MBB.begin();
1117 MachineBasicBlock::iterator EndMBBI = MBB.end();
// Nothing precedes MBBI; the (elided) early exit presumably returns EndMBBI.
1118 if (MBBI == BeginMBBI)
1121 // Skip debug values.
1122 MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
// NOTE(review): the loop tests isDebugValue() before the begin() bound, so
// the conjunct order matters; the decrement step is on an elided line.
1123 while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI)
// A zero delta means "no inc/dec found"; EndMBBI serves as the sentinel.
1126 Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg);
1127 return Offset == 0 ? EndMBBI : PrevMBBI;
1130 /// Searches for a increment or decrement of \p Reg after \p MBBI.
// Mirror of findIncDecBefore: scans forward past debug values; on success
// returns the inc/dec instruction and sets \p Offset to its signed byte
// delta, otherwise returns MBB.end().
1131 static MachineBasicBlock::iterator
1132 findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
1133 ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
1135 MachineBasicBlock &MBB = *MBBI->getParent();
1136 MachineBasicBlock::iterator EndMBBI = MBB.end();
1137 MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
1138 // Skip debug values.
// NOTE(review): the loop's increment step is on an elided line.
1139 while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
// Ran off the end of the block without finding a real instruction.
1141 if (NextMBBI == EndMBBI)
// Zero delta means "no inc/dec found"; EndMBBI serves as the sentinel.
1144 Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg);
1145 return Offset == 0 ? EndMBBI : NextMBBI;
1148 /// Fold proceeding/trailing inc/dec of base register into the
1149 /// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
1151 /// stmia rn, <ra, rb, rc>
1152 /// rn := rn + 4 * 3;
1154 /// stmia rn!, <ra, rb, rc>
1156 /// rn := rn - 4 * 3;
1157 /// ldmia rn, <ra, rb, rc>
1159 /// ldmdb rn!, <ra, rb, rc>
1160 bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
1161 // Thumb1 is already using updating loads/stores.
1162 if (isThumb1) return false;
// For the LSM instructions handled here operand 0 is the base register.
1164 const MachineOperand &BaseOP = MI->getOperand(0);
1165 unsigned Base = BaseOP.getReg();
1166 bool BaseKill = BaseOP.isKill();
1167 unsigned PredReg = 0;
1168 ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
1169 unsigned Opcode = MI->getOpcode();
1170 DebugLoc DL = MI->getDebugLoc();
1172 // Can't use an updating ld/st if the base register is also a dest
1173 // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
// Scan the register list (operands from index 2 on) for the base register.
// (The bail-out return is on an elided line.)
1174 for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
1175 if (MI->getOperand(i).getReg() == Base)
1178 int Bytes = getLSMultipleTransferSize(MI);
1179 MachineBasicBlock &MBB = *MI->getParent();
1180 MachineBasicBlock::iterator MBBI(MI);
// First try to fold a preceding inc/dec (pre-decrement forms); the mode
// adjustments taken in these branches are on elided lines.
1182 MachineBasicBlock::iterator MergeInstr
1183 = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
1184 ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode);
1185 if (Mode == ARM_AM::ia && Offset == -Bytes) {
1187 } else if (Mode == ARM_AM::ib && Offset == -Bytes) {
// Otherwise try a trailing inc/dec (post-increment forms). The update must
// exactly match the transfer size and the submode's direction.
1190 MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
1191 if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) &&
1192 ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes))
// The add/sub is now redundant; fold it into the writeback form.
1195 MBB.erase(MergeInstr);
1197 unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
// The _UPD form defines the base (writeback) in addition to reading it.
1198 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
1199 .addReg(Base, getDefRegState(true)) // WB base register
1200 .addReg(Base, getKillRegState(BaseKill))
1201 .addImm(Pred).addReg(PredReg);
1203 // Transfer the rest of operands.
1204 for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
1205 MIB.addOperand(MI->getOperand(OpNum));
1207 // Transfer memoperands.
1208 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
// Map a single load/store opcode to its pre-indexed (writeback-before)
// counterpart for the given add/sub direction.
// NOTE(review): the switch statement and its case labels are on lines
// elided from this listing — confirm the opcode-to-case pairing against
// the full source.
1214 static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
1215 ARM_AM::AddrOpc Mode) {
1218 return ARM::LDR_PRE_IMM;
1220 return ARM::STR_PRE_IMM;
// VLDR/VSTR have no pre-indexed forms; the updating load/store-multiple
// opcodes are returned instead, with the direction selecting IA vs. DB.
1222 return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
1224 return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
1226 return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
1228 return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
1231 return ARM::t2LDR_PRE;
1234 return ARM::t2STR_PRE;
1235 default: llvm_unreachable("Unhandled opcode!");
// Map a single load/store opcode to its post-indexed (writeback-after)
// counterpart for the given add/sub direction.
// NOTE(review): the switch statement and its case labels are on lines
// elided from this listing — confirm the opcode-to-case pairing against
// the full source.
1239 static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
1240 ARM_AM::AddrOpc Mode) {
1243 return ARM::LDR_POST_IMM;
1245 return ARM::STR_POST_IMM;
// As in the pre-indexed helper: VLDR/VSTR lack indexed forms, so the
// updating load/store-multiple opcodes are used instead.
1247 return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
1249 return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
1251 return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
1253 return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
1256 return ARM::t2LDR_POST;
1259 return ARM::t2STR_POST;
1260 default: llvm_unreachable("Unhandled opcode!");
1264 /// Fold proceeding/trailing inc/dec of base register into the
1265 /// LDR/STR/FLD{D|S}/FST{D|S} op when possible:
// Returns true if the single load/store was rewritten into a pre/post-
// indexed form (or, for VFP, a single-register updating VLDM/VSTM) and the
// neighboring add/sub of the base register was erased.
1266 bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
1267 // Thumb1 doesn't have updating LDR/STR.
1268 // FIXME: Use LDM/STM with single register instead.
1269 if (isThumb1) return false;
1271 unsigned Base = getLoadStoreBaseOp(*MI).getReg();
1272 bool BaseKill = getLoadStoreBaseOp(*MI).isKill();
1273 unsigned Opcode = MI->getOpcode();
1274 DebugLoc DL = MI->getDebugLoc();
// isAM5: VFP load/store (addressing mode 5); isAM2: ARM-mode LDR/STR
// immediate (addressing mode 2).
1275 bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
1276 Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
1277 bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
// Only fold when the existing immediate offset is zero; otherwise the
// combined update would not equal the original address. (The early
// returns are on elided lines.)
1278 if (isi32Load(Opcode) || isi32Store(Opcode))
1279 if (MI->getOperand(2).getImm() != 0)
1281 if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
1284 // Can't do the merge if the destination register is the same as the would-be
1285 // writeback register.
1286 if (MI->getOperand(0).getReg() == Base)
1289 unsigned PredReg = 0;
1290 ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
1291 int Bytes = getLSMultipleTransferSize(MI);
1292 MachineBasicBlock &MBB = *MI->getParent();
1293 MachineBasicBlock::iterator MBBI(MI);
// Prefer folding a preceding add/sub into a pre-indexed form; fall back to
// a trailing add/sub folded into a post-indexed form. AM5 (VFP) only
// supports the directions checked here.
1295 MachineBasicBlock::iterator MergeInstr
1296 = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset);
1298 if (!isAM5 && Offset == Bytes) {
1299 NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
1300 } else if (Offset == -Bytes) {
1301 NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
1303 MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
1304 if (Offset == Bytes) {
1305 NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add);
1306 } else if (!isAM5 && Offset == -Bytes) {
1307 NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub);
// The matched add/sub is now redundant; remove it.
1311 MBB.erase(MergeInstr);
1313 ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add;
1315 bool isLd = isLoadSingle(Opcode);
1317 // VLDM[SD]_UPD, VSTM[SD]_UPD
1318 // (There are no base-updating versions of VLDR/VSTR instructions, but the
1319 // updating load/store-multiple instructions can be used with only one
// register.)
1321 MachineOperand &MO = MI->getOperand(0);
1322 BuildMI(MBB, MBBI, DL, TII->get(NewOpc))
1323 .addReg(Base, getDefRegState(true)) // WB base register
1324 .addReg(Base, getKillRegState(isLd ? BaseKill : false))
1325 .addImm(Pred).addReg(PredReg)
1326 .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
1327 getKillRegState(MO.isKill())));
1330 // LDR_PRE, LDR_POST
// ARM-mode pre-indexed loads use a plain immediate offset operand.
1331 if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
1332 BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
1333 .addReg(Base, RegState::Define)
1334 .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
// Post-indexed AM2 encodes direction+amount in a packed AM2 immediate and
// still carries a vestigial (zero) offset register operand.
1336 int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
1337 BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
1338 .addReg(Base, RegState::Define)
1339 .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
1342 // t2LDR_PRE, t2LDR_POST
1343 BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
1344 .addReg(Base, RegState::Define)
1345 .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
1348 MachineOperand &MO = MI->getOperand(0);
1349 // FIXME: post-indexed stores use am2offset_imm, which still encodes
1350 // the vestigal zero-reg offset register. When that's fixed, this clause
1351 // can be removed entirely.
1352 if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
1353 int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
1354 // STR_PRE, STR_POST
// Stores define the updated base as the result register (first operand).
1355 BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
1356 .addReg(MO.getReg(), getKillRegState(MO.isKill()))
1357 .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
1359 // t2STR_PRE, t2STR_POST
1360 BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
1361 .addReg(MO.getReg(), getKillRegState(MO.isKill()))
1362 .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
// Fold a neighboring +/-8 update of the base register into a t2LDRDi8 /
// t2STRDi8, producing the pre- or post-indexed doubleword form.
1370 bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
1371 unsigned Opcode = MI.getOpcode();
1372 assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) &&
1373 "Must have t2STRDi8 or t2LDRDi8");
// Only a zero immediate offset can absorb the base update. (The early
// return is on an elided line.)
1374 if (MI.getOperand(3).getImm() != 0)
1377 // Behaviour for writeback is undefined if base register is the same as one
// of the data registers.
1379 const MachineOperand &BaseOp = MI.getOperand(2);
1380 unsigned Base = BaseOp.getReg();
1381 const MachineOperand &Reg0Op = MI.getOperand(0);
1382 const MachineOperand &Reg1Op = MI.getOperand(1);
1383 if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
1387 ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
1388 MachineBasicBlock::iterator MBBI(MI);
1389 MachineBasicBlock &MBB = *MI.getParent();
// A doubleword transfers 8 bytes, so only +/-8 updates can be folded:
// a preceding update becomes _PRE, a trailing one becomes _POST.
1391 MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred,
1394 if (Offset == 8 || Offset == -8) {
1395 NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE;
1397 MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset);
1398 if (Offset == 8 || Offset == -8) {
1399 NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST;
// The matched add/sub is now redundant; remove it.
1403 MBB.erase(MergeInstr);
1405 DebugLoc DL = MI.getDebugLoc();
1406 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
// Loads define the data registers first, then the written-back base;
// stores define the written-back base first, then read the data registers.
1407 if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
1408 MIB.addOperand(Reg0Op).addOperand(Reg1Op)
1409 .addReg(BaseOp.getReg(), RegState::Define);
1411 assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
1412 MIB.addReg(BaseOp.getReg(), RegState::Define)
1413 .addOperand(Reg0Op).addOperand(Reg1Op);
1415 MIB.addReg(BaseOp.getReg(), RegState::Kill)
1416 .addImm(Offset).addImm(Pred).addReg(PredReg);
// Sanity-check the operand layout assumed above against the descriptors.
1417 assert(TII->get(Opcode).getNumOperands() == 6 &&
1418 TII->get(NewOpc).getNumOperands() == 7 &&
1419 "Unexpected number of operands in Opcode specification.");
1421 // Transfer implicit operands.
1422 for (const MachineOperand &MO : MI.implicit_operands())
1424 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
1430 /// Returns true if instruction is a memory operation that this pass is capable
1431 /// of operating on.
1432 static bool isMemoryOp(const MachineInstr *MI) {
1433 // When no memory operands are present, conservatively assume unaligned,
1434 // volatile, unfoldable.
1435 if (!MI->hasOneMemOperand())
1438 const MachineMemOperand *MMO = *MI->memoperands_begin();
1440 // Don't touch volatile memory accesses - we may be changing their order.
1441 if (MMO->isVolatile())
1444 // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
// not, so merging under-aligned accesses would be unsafe.
1446 if (MMO->getAlignment() < 4)
1449 // str <undef> could probably be eliminated entirely, but for now we just want
1450 // to avoid making a mess of it.
1451 // FIXME: Use str <undef> as a wildcard to enable better stm folding.
1452 if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
1453 MI->getOperand(0).isUndef())
1456 // Likewise don't mess with references to undefined addresses.
1457 if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
1458 MI->getOperand(1).isUndef())
// Finally, only register-base forms of the supported opcodes qualify.
// NOTE(review): the switch over Opcode and its case labels are on lines
// elided from this listing.
1461 unsigned Opcode = MI->getOpcode();
1466 return MI->getOperand(1).isReg();
1469 return MI->getOperand(1).isReg();
1480 return MI->getOperand(1).isReg();
// Emit a single LDR (when isDef) or STR with the given opcode, register
// flags (dead/kill/undef), offset, and predicate, inserting before MBBI.
// Used by FixInvalidRegPairOp when splitting an LDRD/STRD into two
// single-word accesses.
// NOTE(review): the DL parameter is unused by the visible BuildMI calls,
// which take MBBI->getDebugLoc() instead — confirm against full source.
1485 static void InsertLDR_STR(MachineBasicBlock &MBB,
1486 MachineBasicBlock::iterator &MBBI,
1487 int Offset, bool isDef,
1488 DebugLoc DL, unsigned NewOpc,
1489 unsigned Reg, bool RegDeadKill, bool RegUndef,
1490 unsigned BaseReg, bool BaseKill, bool BaseUndef,
1491 bool OffKill, bool OffUndef,
1492 ARMCC::CondCodes Pred, unsigned PredReg,
1493 const TargetInstrInfo *TII, bool isT2) {
// Load: Reg is a def (possibly dead).
1495 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
1497 .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
1498 .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
1499 MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
// Store: Reg is a use (possibly killed/undef).
1501 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
1503 .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
1504 .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
1505 MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
// Rewrite LDRD/STRD instructions whose register pair is illegal (non-
// consecutive or odd-aligned registers) or that trip Cortex-M3 errata
// 602117, turning them into an LDM/STM or two single-word accesses.
// Returns true and erases the original instruction if a rewrite happened.
1509 bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
1510 MachineBasicBlock::iterator &MBBI) {
1511 MachineInstr *MI = &*MBBI;
1512 unsigned Opcode = MI->getOpcode();
1513 if (Opcode != ARM::LDRD && Opcode != ARM::STRD && Opcode != ARM::t2LDRDi8)
1516 const MachineOperand &BaseOp = MI->getOperand(2);
1517 unsigned BaseReg = BaseOp.getReg();
1518 unsigned EvenReg = MI->getOperand(0).getReg();
1519 unsigned OddReg = MI->getOperand(1).getReg();
// Dwarf register numbers are used to test even/odd and adjacency of the
// physical register pair.
1520 unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
1521 unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
1523 // ARM errata 602117: LDRD with base in list may result in incorrect base
1524 // register when interrupted or faulted.
1525 bool Errata602117 = EvenReg == BaseReg &&
1526 (Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8) && STI->isCortexM3();
1527 // ARM LDRD/STRD needs consecutive registers.
1528 bool NonConsecutiveRegs = (Opcode == ARM::LDRD || Opcode == ARM::STRD) &&
1529 (EvenRegNum % 2 != 0 || EvenRegNum + 1 != OddRegNum);
// Nothing to fix: the instruction is legal as-is.
1531 if (!Errata602117 && !NonConsecutiveRegs)
1534 bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
1535 bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
// For loads the pair registers are defs (dead matters); for stores they
// are uses (kill matters).
1536 bool EvenDeadKill = isLd ?
1537 MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
1538 bool EvenUndef = MI->getOperand(0).isUndef();
1539 bool OddDeadKill = isLd ?
1540 MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
1541 bool OddUndef = MI->getOperand(1).isUndef();
1542 bool BaseKill = BaseOp.isKill();
1543 bool BaseUndef = BaseOp.isUndef();
// ARM-mode LDRD/STRD carries an extra offset operand that t2 forms lack.
1544 bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
1545 bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
1546 int OffImm = getMemoryOpOffset(MI);
1547 unsigned PredReg = 0;
1548 ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
1550 if (OddRegNum > EvenRegNum && OffImm == 0) {
1551 // Ascending register numbers and no offset. It's safe to change it to a
// ldm/stm of the two registers.
1553 unsigned NewOpc = (isLd)
1554 ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
1555 : (isT2 ? ARM::t2STMIA : ARM::STMIA);
// Load variant: pair registers become defs in the LDM register list.
1557 BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
1558 .addReg(BaseReg, getKillRegState(BaseKill))
1559 .addImm(Pred).addReg(PredReg)
1560 .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
1561 .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
// Store variant: pair registers are uses in the STM register list.
1564 BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
1565 .addReg(BaseReg, getKillRegState(BaseKill))
1566 .addImm(Pred).addReg(PredReg)
1568 getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
1570 getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
1574 // Split into two instructions.
1575 unsigned NewOpc = (isLd)
1576 ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
1577 : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
1578 // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
1579 // so adjust and use t2LDRi12 here for that.
1580 unsigned NewOpc2 = (isLd)
1581 ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
1582 : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
1583 DebugLoc dl = MBBI->getDebugLoc();
1584 // If this is a load and base register is killed, it may have been
1585 // re-defed by the load, make sure the first load does not clobber it.
// In that case emit the odd-register access first so the base survives.
1587 (BaseKill || OffKill) &&
1588 (TRI->regsOverlap(EvenReg, BaseReg))) {
1589 assert(!TRI->regsOverlap(OddReg, BaseReg));
1590 InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
1591 OddReg, OddDeadKill, false,
1592 BaseReg, false, BaseUndef, false, OffUndef,
1593 Pred, PredReg, TII, isT2);
1594 InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
1595 EvenReg, EvenDeadKill, false,
1596 BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
1597 Pred, PredReg, TII, isT2);
1599 if (OddReg == EvenReg && EvenDeadKill) {
1600 // If the two source operands are the same, the kill marker is
1601 // probably on the first one. e.g.
1602 // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
1603 EvenDeadKill = false;
1606 // Never kill the base register in the first instruction.
1607 if (EvenReg == BaseReg)
1608 EvenDeadKill = false;
1609 InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
1610 EvenReg, EvenDeadKill, EvenUndef,
1611 BaseReg, false, BaseUndef, false, OffUndef,
1612 Pred, PredReg, TII, isT2);
1613 InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
1614 OddReg, OddDeadKill, OddUndef,
1615 BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
1616 Pred, PredReg, TII, isT2);
// Remove the original LDRD/STRD and advance the caller's iterator.
1624 MBBI = MBB.erase(MBBI);
1628 /// An optimization pass to turn multiple LDR / STR ops of the same base and
1629 /// incrementing offset into LDM / STM ops.
// Walks the block backwards collecting chains of same-base/same-opcode/
// same-predicate memory ops into MemOps, forms MergeCandidates from each
// chain, then merges the candidates and folds base updates. Returns true
// if anything changed.
1630 bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
1632 unsigned CurrBase = 0;
1633 unsigned CurrOpc = ~0u;
1634 ARMCC::CondCodes CurrPred = ARMCC::AL;
// Monotonically tracks the instruction position within the block so
// candidates can later be sorted by insert position.
1635 unsigned Position = 0;
1636 assert(Candidates.size() == 0);
1637 assert(MergeBaseCandidates.size() == 0);
1638 LiveRegsValid = false;
1640 for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin();
1642 // The instruction in front of the iterator is the one we look at.
1643 MBBI = std::prev(I);
// Illegal LDRD/STRD pairs are rewritten first; the rewritten forms are
// then re-examined on the next iteration.
1644 if (FixInvalidRegPairOp(MBB, MBBI))
1648 if (isMemoryOp(MBBI)) {
1649 unsigned Opcode = MBBI->getOpcode();
1650 const MachineOperand &MO = MBBI->getOperand(0);
1651 unsigned Reg = MO.getReg();
1652 unsigned Base = getLoadStoreBaseOp(*MBBI).getReg();
1653 unsigned PredReg = 0;
1654 ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
1655 int Offset = getMemoryOpOffset(MBBI);
1656 if (CurrBase == 0) {
1657 // Start of a new chain.
// (The CurrBase/CurrOpc/CurrPred assignments are on elided lines.)
1661 MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
1664 // Note: No need to match PredReg in the next if.
1665 if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
// Watch for loads whose result overlaps the chain, e.g.
1667 // r4 := ldr [r0, #8]
1668 // r4 := ldr [r0, #4]
1671 // If a load overrides the base register or a register loaded by
1672 // another load in our chain, we cannot take this instruction.
1673 bool Overlap = false;
1674 if (isLoadSingle(Opcode)) {
1675 Overlap = (Base == Reg);
1677 for (const MemOpQueueEntry &E : MemOps) {
1678 if (TRI->regsOverlap(Reg, E.MI->getOperand(0).getReg())) {
// (Overlap is set and the scan broken out of on elided lines.)
1687 // Check offset and sort memory operation into the current chain.
1688 if (Offset > MemOps.back().Offset) {
1689 MemOps.push_back(MemOpQueueEntry(MBBI, Offset, Position));
// Otherwise find the sorted insertion point; equal offsets collide and
// abort the insertion.
1692 MemOpQueue::iterator MI, ME;
1693 for (MI = MemOps.begin(), ME = MemOps.end(); MI != ME; ++MI) {
1694 if (Offset < MI->Offset) {
1695 // Found a place to insert.
1698 if (Offset == MI->Offset) {
1699 // Collision, abort.
1704 if (MI != MemOps.end()) {
1705 MemOps.insert(MI, MemOpQueueEntry(MBBI, Offset, Position));
1712 // Don't advance the iterator; The op will start a new chain next.
1715 // Fallthrough to look into existing chain.
1716 } else if (MBBI->isDebugValue()) {
// Debug values are transparent to chain formation.
1718 } else if (MBBI->getOpcode() == ARM::t2LDRDi8 ||
1719 MBBI->getOpcode() == ARM::t2STRDi8) {
1720 // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions
1721 // remember them because we may still be able to merge add/sub into them.
1722 MergeBaseCandidates.push_back(MBBI);
1726 // If we are here then the chain is broken; Extract candidates for a merge.
1727 if (MemOps.size() > 0) {
1728 FormCandidates(MemOps);
1729 // Reset for the next chain.
1732 CurrPred = ARMCC::AL;
// Flush any chain still open at the top of the block.
1736 if (MemOps.size() > 0)
1737 FormCandidates(MemOps);
1739 // Sort candidates so they get processed from end to begin of the basic
1740 // block later; This is necessary for liveness calculation.
1741 auto LessThan = [](const MergeCandidate* M0, const MergeCandidate *M1) {
1742 return M0->InsertPos < M1->InsertPos;
1744 std::sort(Candidates.begin(), Candidates.end(), LessThan);
1746 // Go through list of candidates and merge.
1747 bool Changed = false;
1748 for (const MergeCandidate *Candidate : Candidates) {
1749 if (Candidate->CanMergeToLSMulti || Candidate->CanMergeToLSDouble) {
1750 MachineInstr *Merged = MergeOpsUpdate(*Candidate);
1751 // Merge preceding/trailing base inc/dec into the merged op.
1754 unsigned Opcode = Merged->getOpcode();
1755 if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
1756 MergeBaseUpdateLSDouble(*Merged);
1758 MergeBaseUpdateLSMultiple(Merged);
// If the merge failed, still try folding base updates into the
// individual loads/stores of the candidate.
1760 for (MachineInstr *MI : Candidate->Instrs) {
1761 if (MergeBaseUpdateLoadStore(MI))
// Unmergeable single-instruction candidate: fold a base update if any.
1766 assert(Candidate->Instrs.size() == 1);
1767 if (MergeBaseUpdateLoadStore(Candidate->Instrs.front()))
1772 // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt.
1773 for (MachineInstr *MI : MergeBaseCandidates)
1774 MergeBaseUpdateLSDouble(*MI);
1775 MergeBaseCandidates.clear();
1780 /// If this is a exit BB, try merging the return ops ("bx lr" and "mov pc, lr")
1781 /// into the preceding stack restore so it directly restore the value of LR
// into pc.
1783 /// ldmfd sp!, {..., lr}
// bx lr   or
1786 /// ldmfd sp!, {..., lr}
// mov pc, lr
// =>
1789 /// ldmfd sp!, {..., pc}
1790 bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
1791 // Thumb1 LDM doesn't allow high registers.
1792 if (isThumb1) return false;
1793 if (MBB.empty()) return false;
1795 MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
// Only act when the block ends in a return and is preceded by an updating
// pop-style load-multiple whose last transferred register is LR.
1796 if (MBBI != MBB.begin() &&
1797 (MBBI->getOpcode() == ARM::BX_RET ||
1798 MBBI->getOpcode() == ARM::tBX_RET ||
1799 MBBI->getOpcode() == ARM::MOVPCLR)) {
1800 MachineInstr *PrevMI = std::prev(MBBI);
1801 unsigned Opcode = PrevMI->getOpcode();
1802 if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
1803 Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
1804 Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
1805 MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
1806 if (MO.getReg() != ARM::LR)
// Rewrite the pop into the _RET form; the LR operand is changed to PC
// and the return instruction removed (on elided lines).
1808 unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
1809 assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
1810 Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
1811 PrevMI->setDesc(TII->get(NewOpc));
// Keep implicit uses/defs (e.g. return-value registers) of the return.
1813 PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
// Pass entry point: cache subtarget/function info, then run the load/store
// multiple optimization (and, on v5T+, return merging) on every block.
1821 bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1823 STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
1824 TL = STI->getTargetLowering();
1825 AFI = Fn.getInfo<ARMFunctionInfo>();
1826 TII = STI->getInstrInfo();
1827 TRI = STI->getRegisterInfo();
1828 MRI = &Fn.getRegInfo();
// Lazily recomputed by users when needed.
1829 RegClassInfoValid = false;
1830 isThumb2 = AFI->isThumb2Function();
1831 isThumb1 = AFI->isThumbFunction() && !isThumb2;
1833 bool Modified = false;
1834 for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1836 MachineBasicBlock &MBB = *MFI;
1837 Modified |= LoadStoreMultipleOpti(MBB);
// "pop {..., pc}" requires ARMv5T (interworking pop); gate on it.
1838 if (STI->hasV5TOps())
1839 Modified |= MergeReturnIntoLDM(MBB);
// Release all MergeCandidates allocated during this function.
1842 Allocator.DestroyAll();
1847 /// Pre- register allocation pass that move load / stores from consecutive
1848 /// locations close to make it more likely they will be combined later.
1849 struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
1851 ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
// Cached per-function state, (re)initialized in runOnMachineFunction.
1853 const DataLayout *TD;
1854 const TargetInstrInfo *TII;
1855 const TargetRegisterInfo *TRI;
1856 const ARMSubtarget *STI;
1857 MachineRegisterInfo *MRI;
1858 MachineFunction *MF;
1860 bool runOnMachineFunction(MachineFunction &Fn) override;
1862 const char *getPassName() const override {
1863 return "ARM pre- register allocation load / store optimization pass";
// Checks whether Op0/Op1 can be combined into one LDRD/STRD; outputs the
// new opcode, registers, base, and predicate via the reference params.
1867 bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
1868 unsigned &NewOpc, unsigned &EvenReg,
1869 unsigned &OddReg, unsigned &BaseReg,
1871 unsigned &PredReg, ARMCC::CondCodes &Pred,
// Moves same-base loads/stores next to each other within a block.
1873 bool RescheduleOps(MachineBasicBlock *MBB,
1874 SmallVectorImpl<MachineInstr *> &Ops,
1875 unsigned Base, bool isLd,
1876 DenseMap<MachineInstr*, unsigned> &MI2LocMap);
1877 bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
1879 char ARMPreAllocLoadStoreOpt::ID = 0;
// Pre-RA pass entry point: cache target info, then reschedule loads/stores
// in every basic block of the function.
1882 bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1883 TD = &Fn.getDataLayout();
1884 STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
1885 TII = STI->getInstrInfo();
1886 TRI = STI->getRegisterInfo();
1887 MRI = &Fn.getRegInfo();
1890 bool Modified = false;
1891 for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
1893 Modified |= RescheduleLoadStoreInstrs(MFI);
// Decide whether the memory ops in \p MemOps (all using base \p Base) can
// be moved together across the instructions in [I, E) without changing
// behavior, and whether doing so is worth the register-pressure increase.
1898 static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
1899 MachineBasicBlock::iterator I,
1900 MachineBasicBlock::iterator E,
1901 SmallPtrSetImpl<MachineInstr*> &MemOps,
1902 SmallSet<unsigned, 4> &MemRegs,
1903 const TargetRegisterInfo *TRI) {
1904 // Are there stores / loads / calls between them?
1905 // FIXME: This is overly conservative. We should make use of alias information
// where available.
1907 SmallSet<unsigned, 4> AddedRegPressure;
// Instructions that are themselves part of the group are transparent.
1909 if (I->isDebugValue() || MemOps.count(&*I))
// Calls, terminators and side-effecting instructions block any motion.
1911 if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
// Moving loads across an intervening store could reorder aliasing accesses.
1913 if (isLd && I->mayStore())
1918 // It's not safe to move the first 'str' down.
// e.g. str r1, [r0]
1921 // str r4, [r0, #+4]
// (store-specific blocking cases are on elided lines)
1925 for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
1926 MachineOperand &MO = I->getOperand(j);
1929 unsigned Reg = MO.getReg();
// Any redefinition of the base register invalidates the group's address.
1930 if (MO.isDef() && TRI->regsOverlap(Reg, Base))
// Registers live across the moved region add to register pressure.
1932 if (Reg != Base && !MemRegs.count(Reg))
1933 AddedRegPressure.insert(Reg);
1937 // Estimate register pressure increase due to the transformation.
1938 if (MemRegs.size() <= 4)
1939 // Ok if we are moving small number of instructions.
1941 return AddedRegPressure.size() <= MemRegs.size() * 2;
1945 /// Copy \p Op0 and \p Op1 operands into a new array assigned to MI.
// Used when two loads/stores are fused into one LDRD/STRD so the combined
// instruction carries both original memory operands for later alias info.
1946 static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
1947 MachineInstr *Op1) {
1948 assert(MI->memoperands_empty() && "expected a new machineinstr");
1949 size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
1950 + (Op1->memoperands_end() - Op1->memoperands_begin());
// Allocate the combined array from the function's allocator and copy both
// operand ranges into it back to back.
1952 MachineFunction *MF = MI->getParent()->getParent();
1953 MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
1954 MachineSDNode::mmo_iterator MemEnd =
1955 std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
1957 std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
1958 MI->setMemRefs(MemBegin, MemEnd);
// Check whether the pair of single loads/stores Op0/Op1 can legally be
// combined into one LDRD/STRD. On success fills in the new opcode, the two
// data registers, the base register, the (AM3-encoded, for ARM mode)
// offset, the predicate, and the debug location.
1962 ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
1963 DebugLoc &dl, unsigned &NewOpc,
1965 unsigned &SecondReg,
1966 unsigned &BaseReg, int &Offset,
1968 ARMCC::CondCodes &Pred,
1970 // Make sure we're allowed to generate LDRD/STRD.
1971 if (!STI->hasV5TEOps())
1974 // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
// Pick the doubleword opcode matching Op0's opcode; the ARM-mode LDRD/
// STRD selections are on elided lines.
1976 unsigned Opcode = Op0->getOpcode();
1977 if (Opcode == ARM::LDRi12) {
1979 } else if (Opcode == ARM::STRi12) {
1981 } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
1982 NewOpc = ARM::t2LDRDi8;
1985 } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
1986 NewOpc = ARM::t2STRDi8;
1993 // Make sure the base address satisfies i64 ld / st alignment requirement.
1994 // At the moment, we ignore the memoryoperand's value.
1995 // If we want to use AliasAnalysis, we should check it accordingly.
1996 if (!Op0->hasOneMemOperand() ||
1997 (*Op0->memoperands_begin())->isVolatile())
2000 unsigned Align = (*Op0->memoperands_begin())->getAlignment();
2001 const Function *Func = MF->getFunction();
2002 unsigned ReqAlign = STI->hasV6Ops()
2003 ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
2004 : 8; // Pre-v6 need 8-byte align
2005 if (Align < ReqAlign)
2008 // Then make sure the immediate offset fits.
2009 int OffImm = getMemoryOpOffset(Op0);
// Thumb2 branch: signed 8-bit scaled range check.
2011 int Limit = (1 << 8) * Scale;
2012 if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
// ARM-mode branch: AM3 encodes a separate add/sub bit and magnitude.
2016 ARM_AM::AddrOpc AddSub = ARM_AM::add;
2018 AddSub = ARM_AM::sub;
2021 int Limit = (1 << 8) * Scale;
2022 if (OffImm >= Limit || (OffImm & (Scale-1)))
2024 Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
// LDRD/STRD must use two distinct data registers.
2026 FirstReg = Op0->getOperand(0).getReg();
2027 SecondReg = Op1->getOperand(0).getReg();
2028 if (FirstReg == SecondReg)
2030 BaseReg = Op0->getOperand(1).getReg();
2031 Pred = getInstrPredicate(Op0, PredReg);
2032 dl = Op0->getDebugLoc();
// Try to cluster the single loads (or stores) in Ops -- all addressing the
// same base register `Base` -- next to one another so later combining can form
// multi-register transfers, and fold exact pairs into LDRD/STRD when legal.
// Returns true if any instruction was moved or rewritten.
// NOTE(review): the embedded original line numbers below are NOT contiguous
// (e.g. 2048 -> 2051, 2067 -> 2071), so this extract is missing source lines
// (closing braces, declarations, branch bodies). The code is kept byte-identical
// and only annotated; claims about missing lines are hedged accordingly.
2036 bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
2037 SmallVectorImpl<MachineInstr *> &Ops,
2038 unsigned Base, bool isLd,
2039 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
2040 bool RetVal = false;
// Largest offset first; the assert documents that offsets are unique per base,
// which makes the strict-weak-ordering comparator safe.
2042 // Sort by offset (in reverse order).
2043 std::sort(Ops.begin(), Ops.end(),
2044 [](const MachineInstr *LHS, const MachineInstr *RHS) {
2045 int LOffset = getMemoryOpOffset(LHS);
2046 int ROffset = getMemoryOpOffset(RHS);
2047 assert(LHS == RHS || LOffset != ROffset);
2048 return LOffset > ROffset;
2051 // The loads / stores of the same base are in order. Scan them from first to
2052 // last and check for the following:
2053 // 1. Any def of base.
// Repeatedly peel one run of offset-contiguous, same-opcode ops off Ops.
2055 while (Ops.size() > 1) {
// FirstLoc/LastLoc record the program-order extent (per MI2LocMap) of the
// candidate run; FirstOp/LastOp the corresponding instructions (their
// assignments sit on lines missing from this extract -- confirm in full source).
2056 unsigned FirstLoc = ~0U;
2057 unsigned LastLoc = 0;
2058 MachineInstr *FirstOp = nullptr;
2059 MachineInstr *LastOp = nullptr;
2061 unsigned LastOpcode = 0;
2062 unsigned LastBytes = 0;
2063 unsigned NumMove = 0;
// Ops is sorted by descending offset, so walking from the back visits the
// candidates in ascending-offset order.
2064 for (int i = Ops.size() - 1; i >= 0; --i) {
2065 MachineInstr *Op = Ops[i];
2066 unsigned Loc = MI2LocMap[Op];
2067 if (Loc <= FirstLoc) {
2071 if (Loc >= LastLoc) {
// LSMOpcode is the would-be load/store-multiple opcode; a run must be
// homogeneous in it. (Its declaration line, original 2076, is missing here.)
2077 = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
2078 if (LastOpcode && LSMOpcode != LastOpcode)
2081 int Offset = getMemoryOpOffset(Op);
2082 unsigned Bytes = getLSMultipleTransferSize(Op);
// Offsets must be exactly contiguous: previous offset plus transfer size.
2084 if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
2087 LastOffset = Offset;
2089 LastOpcode = LSMOpcode;
// Cap the run length; ARM ldm/stm register lists are bounded anyway.
2090 if (++NumMove == 8) // FIXME: Tune this limit.
// Collect the chosen ops and their transfer registers so the safety check
// below can test for intervening defs/aliases.
2097 SmallPtrSet<MachineInstr*, 4> MemOps;
2098 SmallSet<unsigned, 4> MemRegs;
2099 for (int i = NumMove-1; i >= 0; --i) {
2100 MemOps.insert(Ops[i]);
2101 MemRegs.insert(Ops[i]->getOperand(0).getReg());
2104 // Be conservative, if the instructions are too far apart, don't
2105 // move them. We want to limit the increase of register pressure.
2106 bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
// Refine the distance heuristic with a def/alias safety check (presumably
// guarded by `if (DoMove)` on the missing line 2107 -- confirm).
2108 DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
2109 MemOps, MemRegs, TRI);
// The body of this loop (line 2112) is missing from the extract; presumably
// it discards the rejected run from Ops when !DoMove -- confirm in full source.
2111 for (unsigned i = 0; i != NumMove; ++i)
// Loads are hoisted up to the first op's slot; stores are sunk to the last's.
2114 // This is the new location for the loads / stores.
2115 MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
// Step past the ops being moved and any debug values to reach a stable
// insertion point.
2116 while (InsertPos != MBB->end()
2117 && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
2120 // If we are moving a pair of loads / stores, see if it makes sense
2121 // to try to allocate a pair of registers that can form register pairs.
2122 MachineInstr *Op0 = Ops.back();
2123 MachineInstr *Op1 = Ops[Ops.size()-2];
2124 unsigned FirstReg = 0, SecondReg = 0;
2125 unsigned BaseReg = 0, PredReg = 0;
2126 ARMCC::CondCodes Pred = ARMCC::AL;
2128 unsigned NewOpc = 0;
// Exactly-two runs may fold into one LDRD/STRD. The declarations of dl,
// Offset, and isT2 (original lines 2127-2130) are missing from this extract.
2131 if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
2132 FirstReg, SecondReg, BaseReg,
2133 Offset, PredReg, Pred, isT2)) {
// Both pair registers must satisfy the new opcode's register-class
// constraints (e.g. even/odd pairing for ARM-mode LDRD/STRD).
2137 const MCInstrDesc &MCID = TII->get(NewOpc);
2138 const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
2139 MRI->constrainRegClass(FirstReg, TRC);
2140 MRI->constrainRegClass(SecondReg, TRC);
// Load case: build the paired load with both registers as defs. (The branch
// on isLd, original lines 2134-2136, is missing here.)
2142 // Form the pair instruction.
2144 MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
2145 .addReg(FirstReg, RegState::Define)
2146 .addReg(SecondReg, RegState::Define)
2148 // FIXME: We're converting from LDRi12 to an insn that still
2149 // uses addrmode2, so we need an explicit offset reg. It should
2150 // always by reg0 since we're transforming LDRi12s.
2153 MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
// Preserve both original memory operands on the fused instruction.
2154 concatenateMemOperands(MIB, Op0, Op1);
2155 DEBUG(dbgs() << "Formed " << *MIB << "\n");
// Store case: same shape, but the pair registers are sources rather than
// defs (the source .addReg operands, original lines 2159-2161, are missing).
2158 MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
2162 // FIXME: We're converting from LDRi12 to an insn that still
2163 // uses addrmode2, so we need an explicit offset reg. It should
2164 // always by reg0 since we're transforming STRi12s.
2167 MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
2168 concatenateMemOperands(MIB, Op0, Op1);
2169 DEBUG(dbgs() << "Formed " << *MIB << "\n");
// Hint the register allocator toward an even/odd pair so a later pass can
// still form LDRD/STRD if it could not be done here.
2176 // Add register allocation hints to form register pairs.
2177 MRI->setRegAllocationHint(FirstReg, ARMRI::RegPairEven, SecondReg);
2178 MRI->setRegAllocationHint(SecondReg, ARMRI::RegPairOdd, FirstReg);
// Generic case: splice each op of the run to the common insertion point
// (the Ops.pop_back() on original line 2183 is missing from this extract).
2181 for (unsigned i = 0; i != NumMove; ++i) {
2182 MachineInstr *Op = Ops.back();
2184 MBB->splice(InsertPos, MBB, Op);
// Statistic counter declared at file top (STATISTIC block).
2188 NumLdStMoved += NumMove;
// Scan MBB, numbering instructions and bucketing unpredicated single
// loads/stores by base register, then hand each bucket to RescheduleOps.
// Returns true if anything changed.
// NOTE(review): the embedded original line numbers skip values (e.g.
// 2214 -> 2219, 2245 -> 2247), so lines are missing from this extract --
// including the `bool` return type (line 2197) and the declaration of the
// running counter `Loc`. Code kept byte-identical; only comments added.
2198 ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
2199 bool RetVal = false;
// Per-region bookkeeping: instruction order, per-base load/store buckets,
// and the list of distinct bases in first-seen order.
2201 DenseMap<MachineInstr*, unsigned> MI2LocMap;
2202 DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
2203 DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
2204 SmallVector<unsigned, 4> LdBases;
2205 SmallVector<unsigned, 4> StBases;
2208 MachineBasicBlock::iterator MBBI = MBB->begin();
2209 MachineBasicBlock::iterator E = MBB->end();
2211 for (; MBBI != E; ++MBBI) {
2212 MachineInstr *MI = MBBI;
// Calls/terminators end the current region; presumably the bodies on the
// missing lines (2215-2218) flush the buckets here -- confirm in full source.
2213 if (MI->isCall() || MI->isTerminator()) {
2214 // Stop at barriers.
// Number every non-debug instruction; this ordering feeds RescheduleOps's
// FirstLoc/LastLoc distance heuristic.
2219 if (!MI->isDebugValue())
2220 MI2LocMap[MI] = ++Loc;
// Only unpredicated single loads/stores are rescheduling candidates.
2222 if (!isMemoryOp(MI))
2224 unsigned PredReg = 0;
2225 if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
2228 int Opc = MI->getOpcode();
2229 bool isLd = isLoadSingle(Opc);
// Operand 1 is the base-address register for these single load/store forms.
2230 unsigned Base = MI->getOperand(1).getReg();
2231 int Offset = getMemoryOpOffset(MI);
// Set when this base+offset was already recorded (see "duplicate" below).
2233 bool StopHere = false;
// Load path: append MI to this base's bucket, registering the base on first
// sight. (The `if (isLd)` guard, original line 2234, is missing here.)
2235 DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
2236 Base2LdsMap.find(Base);
2237 if (BI != Base2LdsMap.end()) {
2238 for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
2239 if (Offset == getMemoryOpOffset(BI->second[i])) {
2245 BI->second.push_back(MI);
2247 Base2LdsMap[Base].push_back(MI);
2248 LdBases.push_back(Base);
// Store path mirrors the load path (the `} else {` on line 2250 is missing).
2251 DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
2252 Base2StsMap.find(Base);
2253 if (BI != Base2StsMap.end()) {
2254 for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
2255 if (Offset == getMemoryOpOffset(BI->second[i])) {
2261 BI->second.push_back(MI);
2263 Base2StsMap[Base].push_back(MI);
2264 StBases.push_back(Base);
2269 // Found a duplicate (a base+offset combination that's seen earlier).
2276 // Re-schedule loads.
2277 for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
2278 unsigned Base = LdBases[i];
// RescheduleOps only acts on buckets with at least two entries (a guard on
// Lds.size() sits on the missing line 2280 -- confirm in full source).
2279 SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
2281 RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
2284 // Re-schedule stores.
2285 for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
2286 unsigned Base = StBases[i];
2287 SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
2289 RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
// Reset per-region state before continuing past the barrier.
2293 Base2LdsMap.clear();
2294 Base2StsMap.clear();
2304 /// Returns an instance of the load / store optimization pass.
2305 FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
// NOTE(review): as written, PreAlloc is ignored and the second return is
// unreachable. The embedded numbering jumps 2305 -> 2307, so the guard line
// (`if (PreAlloc)`, original 2306) appears to have been lost in extraction --
// restore it from the full source rather than shipping this as-is. The
// closing brace of this function also lies beyond the visible extract.
2307 return new ARMPreAllocLoadStoreOpt();
2308 return new ARMLoadStoreOpt();