//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");

/// ARMLoadStoreOpt - Post-register allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
namespace {
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}

    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;
    bool isThumb2;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    struct MemOpQueueEntry {
      int Offset;
      unsigned Reg;
      bool isKill;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;
      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
                      MachineBasicBlock::iterator i)
        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
    void MergeOpsUpdate(MachineBasicBlock &MBB,
                        MemOpQueue &MemOps,
                        unsigned memOpsBegin,
                        unsigned memOpsEnd,
                        unsigned insertAfter,
                        int Offset,
                        unsigned Base,
                        bool BaseKill,
                        int Opcode,
                        ARMCC::CondCodes Pred,
                        unsigned PredReg,
                        unsigned Scratch,
                        DebugLoc dl,
                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);
    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    }
  case ARM::STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
    }
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    }
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    }
  case ARM::VLDRS:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return ARM::VLDMSDB;
    }
  case ARM::VSTRS:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return ARM::VSTMSDB;
    }
  case ARM::VLDRD:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return ARM::VLDMDDB;
    }
  case ARM::VSTRD:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return ARM::VSTMDDB;
    }
  }
}

namespace llvm {
  namespace ARM_AM {

AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA_RET:
  case ARM::LDMIA:
  case ARM::LDMIA_UPD:
  case ARM::STMIA:
  case ARM::STMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA:
  case ARM::VSTMDIA_UPD:
    return ARM_AM::ia;

  case ARM::LDMDA:
  case ARM::LDMDA_UPD:
  case ARM::STMDA:
  case ARM::STMDA_UPD:
    return ARM_AM::da;

  case ARM::LDMDB:
  case ARM::LDMDB_UPD:
  case ARM::STMDB:
  case ARM::STMDB_UPD:
  case ARM::t2LDMDB:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB:
  case ARM::VSTMDDB_UPD:
    return ARM_AM::db;

  case ARM::LDMIB:
  case ARM::LDMIB_UPD:
  case ARM::STMIB:
  case ARM::STMIB_UPD:
    return ARM_AM::ib;
  }

  return ARM_AM::bad_am_submode;
}

  } // end namespace ARM_AM
} // end namespace llvm

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDRi12 || isT2i32Load(Opc);
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STRi12 || isT2i32Store(Opc);
}

/// MergeOps - Create and insert a LDM or STM with Base as the base register
/// and the registers in Regs as the register operands that would be loaded /
/// stored. It returns true if the transformation is done.
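/// For illustration only (the register choices here are examples, not fixed
/// by this function): with Base == r0, Offset == 0, and Regs == {r4, r5, r6}
/// for a store, the merged instruction would be
///   stmia r0, {r4, r5, r6}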
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2;
  if (Offset == 4 && haveIBAndDA)
    Mode = ARM_AM::ib;
  else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA)
    Mode = ARM_AM::da;
  else if (Offset == -4 * (int)NumRegs && isNotVFP)
    // VLDM/VSTM do not support DB mode without also updating the base reg.
    Mode = ARM_AM::db;
  else if (Offset != 0) {
    // If the starting offset isn't zero, insert a MI to materialize a new
    // base. But only do so if it is cost effective, i.e. we are merging more
    // than two loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Use the scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = !isThumb2
      ? ARM::ADDri
      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
      BaseOpc = !isThumb2
        ? ARM::SUBri
        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false; // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }

  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
                Opcode == ARM::VLDRD);
  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode))
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}

// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
// success.
void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
                                     MemOpQueue &memOps,
                                     unsigned memOpsBegin, unsigned memOpsEnd,
                                     unsigned insertAfter, int Offset,
                                     unsigned Base, bool BaseKill,
                                     int Opcode,
                                     ARMCC::CondCodes Pred, unsigned PredReg,
                                     unsigned Scratch,
                                     DebugLoc dl,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  // First calculate which of the registers should be killed by the merged
  // instruction.
  const unsigned insertPos = memOps[insertAfter].Position;

  SmallSet<unsigned, 4> UnavailRegs;
  SmallSet<unsigned, 4> KilledRegs;
  DenseMap<unsigned, unsigned> Killer;
  for (unsigned i = 0; i < memOpsBegin; ++i) {
    if (memOps[i].Position < insertPos && memOps[i].isKill) {
      unsigned Reg = memOps[i].Reg;
      if (memOps[i].Merged)
        UnavailRegs.insert(Reg);
      else {
        KilledRegs.insert(Reg);
        Killer[Reg] = i;
      }
    }
  }
  for (unsigned i = memOpsEnd, e = memOps.size(); i != e; ++i) {
    if (memOps[i].Position < insertPos && memOps[i].isKill) {
      unsigned Reg = memOps[i].Reg;
      KilledRegs.insert(Reg);
      Killer[Reg] = i;
    }
  }

  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    unsigned Reg = memOps[i].Reg;
    if (UnavailRegs.count(Reg))
      // Register is killed before and it's not easy / possible to update the
      // kill marker on already merged instructions. Abort.
      return;

    // If we are inserting the merged operation after an unmerged operation
    // that uses the same register, make sure to transfer any kill flag.
    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
    Regs.push_back(std::make_pair(Reg, isKill));
  }

  // Try to do the merge.
  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
  ++Loc;
  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
                Pred, PredReg, Scratch, dl, Regs))
    return;

  // Merge succeeded, update records.
  Merges.push_back(prior(Loc));
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    // Remove kill flags from any unmerged memops that come before insertPos.
    if (Regs[i-memOpsBegin].second) {
      unsigned Reg = Regs[i-memOpsBegin].first;
      if (KilledRegs.count(Reg)) {
        unsigned j = Killer[Reg];
        memOps[j].MBBI->getOperand(0).setIsKill(false);
        memOps[j].isKill = false;
      }
    }
    MBB.erase(memOps[i].MBBI);
    memOps[i].Merged = true;
  }
}

/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
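/// For example (offsets are illustrative): memops at offsets 0, 4 and 8 off
/// the same base form one run and can be merged into a single ldm / stm,
/// while a memop at a non-consecutive offset such as 16 ends that run; the
/// offsets must increase by exactly the access size for ops to be merged
/// together, so the remainder is retried as a new, separate merge.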
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                              unsigned Base, int Opcode, unsigned Size,
                              ARMCC::CondCodes Pred, unsigned PredReg,
                              unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;
  unsigned insertAfter = SIndex;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  const MachineOperand &PMO = Loc->getOperand(0);
  unsigned PReg = PMO.getReg();
  unsigned PRegNum = PMO.isUndef() ? UINT_MAX
    : getARMRegisterNumbering(PReg);
  unsigned Count = 1;

  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    unsigned RegNum = MO.isUndef() ? UINT_MAX
      : getARMRegisterNumbering(Reg);
    // Register numbers must be in ascending order. For VFP, the registers
    // must also be consecutive and there is a limit of 16 double-word
    // registers per instruction.
    if (Reg != ARM::SP &&
        NewOffset == Offset + (int)Size &&
        ((isNotVFP && RegNum > PRegNum)
         || ((Size < 8 || Count < 16) && RegNum == PRegNum+1))) {
      Offset += Size;
      PRegNum = RegNum;
      ++Count;
    } else {
      // Can't merge this in. Try to merge the earlier ones first.
      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset,
                     Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges);
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    if (MemOps[i].Position > MemOps[insertAfter].Position)
      insertAfter = i;
  }

  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
}
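
/// isMatchingDecrement - Return true if the given instruction is an immediate
/// subtract of Bytes from Base into Base (e.g. "sub r1, r1, #4" when Base is
/// r1 and Bytes is 4; the register here is only an illustration) under the
/// same predicate, so the caller can fold it away as a base-register update.
/// A Limit of 0 means any number of bytes is acceptable.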
static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2SUBri &&
      MI->getOpcode() != ARM::t2SUBrSPi &&
      MI->getOpcode() != ARM::t2SUBrSPi12 &&
      MI->getOpcode() != ARM::tSUBspi &&
      MI->getOpcode() != ARM::SUBri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes == 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}

static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2ADDri &&
      MI->getOpcode() != ARM::t2ADDrSPi &&
      MI->getOpcode() != ARM::t2ADDrSPi12 &&
      MI->getOpcode() != ARM::tADDspi &&
      MI->getOpcode() != ARM::ADDri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes == 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}

static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return 0;
  case ARM::LDRi12:  case ARM::STRi12:
  case ARM::t2LDRi8: case ARM::t2LDRi12:
  case ARM::t2STRi8: case ARM::t2STRi12:
  case ARM::VLDRS:   case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:   case ARM::VSTRD:
    return 8;
  case ARM::LDMIA: case ARM::LDMDA: case ARM::LDMDB: case ARM::LDMIB:
  case ARM::STMIA: case ARM::STMDA: case ARM::STMDB: case ARM::STMIB:
  case ARM::t2LDMIA: case ARM::t2LDMDB:
  case ARM::t2STMIA: case ARM::t2STMDB:
  case ARM::VLDMSIA: case ARM::VLDMSDB:
  case ARM::VSTMSIA: case ARM::VSTMSDB:
    // The register list begins after the fixed operands described by the
    // instruction descriptor; each transferred register is 4 bytes.
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
  case ARM::VLDMDIA: case ARM::VLDMDDB:
  case ARM::VSTMDIA: case ARM::VSTMDDB:
    // Same as above, but each transferred register is a double (8 bytes).
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
  }
}

static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                            ARM_AM::AMSubMode Mode) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA_UPD;
    case ARM_AM::ib: return ARM::LDMIB_UPD;
    case ARM_AM::da: return ARM::LDMDA_UPD;
    case ARM_AM::db: return ARM::LDMDB_UPD;
    }
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA_UPD;
    case ARM_AM::ib: return ARM::STMIB_UPD;
    case ARM_AM::da: return ARM::STMDA_UPD;
    case ARM_AM::db: return ARM::STMDB_UPD;
    }
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
    case ARM_AM::db: return ARM::t2LDMDB_UPD;
    }
  case ARM::t2STMIA:
  case ARM::t2STMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA_UPD;
    case ARM_AM::db: return ARM::t2STMDB_UPD;
    }
  case ARM::VLDMSIA:
  case ARM::VLDMSDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
    case ARM_AM::db: return ARM::VLDMSDB_UPD;
    }
  case ARM::VLDMDIA:
  case ARM::VLDMDDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
    case ARM_AM::db: return ARM::VLDMDDB_UPD;
    }
  case ARM::VSTMSIA:
  case ARM::VSTMSDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
    case ARM_AM::db: return ARM::VSTMSDB_UPD;
    }
  case ARM::VSTMDIA:
  case ARM::VSTMDDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
    case ARM_AM::db: return ARM::VSTMDDB_UPD;
    }
  }
}

/// MergeBaseUpdateLSMultiple - Fold a preceding/trailing inc/dec of the base
/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                                bool &Advance,
                                                MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(0).getReg();
  bool BaseKill = MI->getOperand(0).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
    if (MI->getOperand(i).getReg() == Base)
      return false;

  bool DoMerge = false;
  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode);

  // Try merging with the previous instruction.
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (Mode == ARM_AM::ia &&
        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::db;
      DoMerge = true;
    } else if (Mode == ARM_AM::ib &&
               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::da;
      DoMerge = true;
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      // If the erased instruction is the one the caller's iterator points at,
      // advance the iterator past it first.
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);

  // Transfer the rest of operands.
  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
  (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  MBB.erase(MBBI);
  return true;
}

static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
                                             ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_PRE;
  case ARM::STRi12:
    return ARM::STR_PRE;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
                                              ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_POST;
  case ARM::STRi12:
    return ARM::STR_POST;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

/// MergeBaseUpdateLoadStore - Fold a preceding/trailing inc/dec of the base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
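///
/// ldr rd, [rn, #0]
/// rn := rn + 4
/// =>
/// ldr rd, [rn], #4
///
/// rn := rn - 4
/// str rd, [rn, #0]
/// =>
/// str rd, [rn, #-4]!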
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
  if (isi32Load(Opcode) || isi32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;
  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);

  // Try merging with the previous instruction.
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
      MBB.erase(PrevMBBI);
    }
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  unsigned Offset = 0;
  if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else if (!isAM5)
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;

  if (isAM5) {
    // VLDM[SD]_UPD, VSTM[SD]_UPD
    // (There are no base-updating versions of VLDR/VSTR instructions, but the
    // updating load/store-multiple instructions can be used with only one
    // register.)
    MachineOperand &MO = MI->getOperand(0);
    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
      .addReg(Base, getDefRegState(true)) // WB base register
      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
      .addImm(Pred).addReg(PredReg)
      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                            getKillRegState(MO.isKill())));
  } else if (isLd) {
    if (isAM2)
      // LDR_PRE, LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  MBB.erase(MBBI);

  return true;
}

/// isMemoryOp - Returns true if instruction is a memory operation (that this
/// pass is capable of operating on).
static bool isMemoryOp(const MachineInstr *MI) {
  // When no memory operands are present, conservatively assume unaligned,
  // volatile, unfoldable.
  if (!MI->hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI->memoperands_begin();

  // Don't touch volatile memory accesses - we may be changing their order.
  if (MMO->isVolatile())
    return false;

  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
  // not.
  if (MMO->getAlignment() < 4)
    return false;

  // str <undef> could probably be eliminated entirely, but for now we just
  // want to avoid making a mess of it.
  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
  if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
      MI->getOperand(0).isUndef())
    return false;

  // Likewise don't mess with references to undefined addresses.
  if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
      MI->getOperand(1).isUndef())
    return false;

  int Opcode = MI->getOpcode();
  switch (Opcode) {
  default: break;
  case ARM::VLDRS:
  case ARM::VSTRS:
    return MI->getOperand(1).isReg();
  case ARM::VLDRD:
  case ARM::VSTRD:
    return MI->getOperand(1).isReg();
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return MI->getOperand(1).isReg();
  }
  return false;
}

/// AdvanceRS - Advance register scavenger to just before the earliest memory
/// op that is being merged.
void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
  unsigned Position = MemOps[0].Position;
  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
    if (MemOps[i].Position < Position) {
      Position = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  if (Loc != MBB.begin())
    RS->forward(prior(Loc));
}
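
/// getMemoryOpOffset - Return the signed byte offset encoded in the
/// addressing mode of a load / store, e.g. -4 for "ldr r0, [r1, #-4]"
/// (registers here are only an illustration).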
static int getMemoryOpOffset(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
    return OffField;

  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
    : ARM_AM::getAM5Offset(OffField) * 4;
  if (isAM3) {
    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else {
    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  }
  return Offset;
}

static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int Offset, bool isDef,
                          DebugLoc dl, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  if (isDef) {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}
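
/// FixInvalidRegPairOp - On ARM, LDRD/STRD requires the two transfer registers
/// to be an even/odd pair (e.g. r0/r1). If a pre-allocation-formed LDRD/STRD
/// did not receive such a pair, rewrite it as an LDM/STM (when the offset is
/// zero and the register numbers ascend) or as two separate LDR/STR
/// instructions, e.g. (illustrative registers):
///   ldrd r0, r2, [r1]
/// =>
///   ldr r0, [r1]
///   ldr r2, [r1, #4]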
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    MachineBasicBlock::iterator NewBBI = MBBI;
    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
        : (isT2 ? ARM::t2STMIA : ARM::STMIA);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
      NewBBI = llvm::prior(MBBI);
    } else {
      // Split into two instructions.
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and the base register is killed, it may have been
      // re-defed by the load, make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = llvm::prior(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is
          // probably on the first one. e.g.
          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = llvm::prior(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    MBB.erase(MI);
    MBBI = NewBBI;
    return true;
  }
  return false;
}

/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
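/// Combined with the base-update folding above, e.g. (illustrative registers):
///   ldr r4, [r0]
///   ldr r5, [r0, #4]
///   add r0, r0, #8
/// =>
///   ldmia r0!, {r4, r5}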
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;
    bool TryMerge = false;
    bool Clobber  = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      const MachineOperand &MO = MBBI->getOperand(0);
      unsigned Reg = MO.getReg();
      bool isKill = MO.isDef() ? false : MO.isKill();
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
        ++NumMemOps;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill,
                                             Position, MBBI));
            ++NumMemOps;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill,
                                                 Position, MBBI));
                ++NumMemOps;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (MBBI->isDebugValue()) {
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block, try merging the memory instructions.
        TryMerge = true;
    } else if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block, try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's
        // needed. First advance to the instruction just before the start of
        // the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // ops that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load / store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}

namespace {
  struct OffsetCompare {
    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
      int LOffset = getMemoryOpOffset(LHS);
      int ROffset = getMemoryOpOffset(RHS);
      assert(LHS == RHS || LOffset != ROffset);
      return LOffset > ROffset;
    }
  };
}

/// MergeReturnIntoLDM - If this is an exit BB, try merging the return ops
/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it
/// directly restores the value of LR into pc.
///   ldmfd sp!, {..., lr}
///   bx lr
/// or
///   ldmfd sp!, {..., lr}
///   mov pc, lr
/// =>
///   ldmfd sp!, {..., pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = prior(MBB.end());
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET ||
       MBBI->getOpcode() == ARM::tBX_RET ||
       MBBI->getOpcode() == ARM::MOVPCLR)) {
    MachineInstr *PrevMI = prior(MBBI);
    unsigned Opcode = PrevMI->getOpcode();
    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      PrevMI->copyImplicitOps(&*MBBI);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}

bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = TM.getInstrInfo();
  TRI = TM.getRegisterInfo();
  RS = new RegScavenger();
  isThumb2 = AFI->isThumb2Function();

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    Modified |= MergeReturnIntoLDM(MBB);
  }

  delete RS;
  return Modified;
}

/// ARMPreAllocLoadStoreOpt - Pre-register allocation pass that moves
/// loads / stores from consecutive locations closer together to make it more
/// likely that they will be combined later.
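/// For example (virtual register names chosen for illustration), if
///   ldr r4, [r0]   ...unrelated instructions...   ldr r5, [r0, #4]
/// are rescheduled next to each other, they can later be combined into an
/// ldrd (see CanFormLdStDWord below) or into an ldm by the post-allocation
/// pass above.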
namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}

    const TargetData *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TD  = Fn.getTarget().getTargetData();
  TII = Fn.getTarget().getInstrInfo();
  TRI = Fn.getTarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
  MF  = &Fn;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI)
    Modified |= RescheduleLoadStoreInstrs(MFI);

  return Modified;
}

static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias
  // information if available.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (I->isDebugValue() || MemOps.count(&*I))
      continue;
    const TargetInstrDesc &TID = I->getDesc();
    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
      return false;
    if (isLd && TID.mayStore())
      return false;
    if (!isLd) {
      if (TID.mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (TID.mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // Ok if we are moving small number of instructions.
    return true;
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}

bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          int &Offset, unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDRi12)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STRi12)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;

  // Make sure the base address satisfies i64 ld / st alignment requirement.
  if (!Op0->hasOneMemOperand() ||
      !(*Op0->memoperands_begin())->getValue() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  const Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 needs 8-byte align
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    if (OffImm < 0) {
      if (OffImm < -255)
        // Can't fall back to t2LDRi8 / t2STRi8.
        return false;
    } else {
      int Limit = (1 << 8) * Scale;
      if (OffImm >= Limit || (OffImm & (Scale-1)))
        return false;
    }
    Offset = OffImm;
  } else {
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }

  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}

bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVector<MachineInstr*, 4> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end()
               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            // FIXME: We're converting from LDRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming LDRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            // FIXME: We're converting from STRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming STRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}

bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      if (!MI->isDebugValue())
        MI2LocMap[MI] = ++Loc;

      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}

/// createARMLoadStoreOptimizationPass - returns an instance of the load /
/// store optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}