1 //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file contains a pass that performs load / store related peephole
11 // optimizations. This pass should be run after register allocation.
13 //===----------------------------------------------------------------------===//
15 #include "AArch64InstrInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "MCTargetDesc/AArch64AddressingModes.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Target/TargetInstrInfo.h"
30 #include "llvm/Target/TargetMachine.h"
31 #include "llvm/Target/TargetRegisterInfo.h"
34 #define DEBUG_TYPE "aarch64-ldst-opt"
36 /// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine
37 /// load / store instructions to form ldp / stp instructions.
// Statistics reported via -stats; incremented by the merge routines below.
39 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
40 STATISTIC(NumPostFolded, "Number of post-index updates folded");
41 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
42 STATISTIC(NumUnscaledPairCreated,
43 "Number of load/store from unscaled generated");
// Upper bound on how many instructions the matching scans below will walk
// before giving up (passed as the Limit argument to findMatchingInsn).
45 static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
46 cl::init(20), cl::Hidden);
48 // Place holder while testing unscaled load/store combining
49 static cl::opt<bool> EnableAArch64UnscaledMemOp(
50 "aarch64-unscaled-mem-op", cl::Hidden,
51 cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));
54 void initializeAArch64LoadStoreOptPass(PassRegistry &);
57 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
// Result flags produced by findMatchingInsn and consumed by
// mergePairedInsns: which direction to merge and which result (if any)
// needs a sign extension.
// NOTE(review): the data member declarations (MergeForward, SExtIdx) and the
// closing of the struct appear to be elided from this listing — confirm
// against the full source before editing.
61 typedef struct LdStPairFlags {
62 // If a matching instruction is found, MergeForward is set to true if the
63 // merge is to remove the first instruction and replace the second with
64 // a pair-wise insn, and false if the reverse is true.
67 // SExtIdx gives the index of the result of the load pair that must be
68 // extended. The value of SExtIdx assumes that the paired load produces the
69 // value in this order: (I, returned iterator), i.e., -1 means no value has
70 // to be extended, 0 means I, and 1 means the returned iterator.
73 LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
75 void setMergeForward(bool V = true) { MergeForward = V; }
76 bool getMergeForward() const { return MergeForward; }
78 void setSExtIdx(int V) { SExtIdx = V; }
79 int getSExtIdx() const { return SExtIdx; }
// The pass itself: walks each basic block after register allocation,
// forming ldp/stp pairs and folding base-register updates into
// pre-/post-indexed forms (see optimizeBlock).
83 struct AArch64LoadStoreOpt : public MachineFunctionPass {
85 AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
86 initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
// Cached target hooks, set up in runOnMachineFunction (not visible here —
// TODO confirm in full source).
89 const AArch64InstrInfo *TII;
90 const TargetRegisterInfo *TRI;
92 // Scan the instructions looking for a load/store that can be combined
93 // with the current instruction into a load/store pair.
94 // Return the matching instruction if one is found, else MBB->end().
95 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
98 // Merge the two instructions indicated into a single pair-wise instruction.
99 // If MergeForward is true, erase the first instruction and fold its
100 // operation into the second. If false, the reverse. Return the instruction
101 // following the first instruction (which may change during processing).
102 MachineBasicBlock::iterator
103 mergePairedInsns(MachineBasicBlock::iterator I,
104 MachineBasicBlock::iterator Paired,
105 const LdStPairFlags &Flags);
107 // Scan the instruction list to find a base register update that can
108 // be combined with the current instruction (a load or store) using
109 // pre or post indexed addressing with writeback. Scan forwards.
110 MachineBasicBlock::iterator
111 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
114 // Scan the instruction list to find a base register update that can
115 // be combined with the current instruction (a load or store) using
116 // pre or post indexed addressing with writeback. Scan backwards.
117 MachineBasicBlock::iterator
118 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
120 // Find an instruction that updates the base register of the ld/st
122 bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
123 unsigned BaseReg, int Offset);
125 // Merge a pre- or post-index base register update into a ld/st instruction.
126 MachineBasicBlock::iterator
127 mergeUpdateInsn(MachineBasicBlock::iterator I,
128 MachineBasicBlock::iterator Update, bool IsPreIdx);
// Run both transformations (pairing, update folding) over one block.
130 bool optimizeBlock(MachineBasicBlock &MBB);
132 bool runOnMachineFunction(MachineFunction &Fn) override;
134 const char *getPassName() const override {
135 return AARCH64_LOAD_STORE_OPT_NAME;
// Pass identification, replacement for typeid.
138 char AArch64LoadStoreOpt::ID = 0;
141 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
142 AARCH64_LOAD_STORE_OPT_NAME, false, false)
// Classify an opcode as one of the "unscaled" load/store forms (STUR*/LDUR*,
// 9-bit signed byte offset) as opposed to the scaled STR*ui/LDR*ui forms.
// NOTE(review): the switch header, default case, and return statements are
// elided from this listing; presumably the listed cases return true and all
// others false — TODO confirm against the full source.
144 static bool isUnscaledLdSt(unsigned Opc) {
148 case AArch64::STURSi:
149 case AArch64::STURDi:
150 case AArch64::STURQi:
151 case AArch64::STURWi:
152 case AArch64::STURXi:
153 case AArch64::LDURSi:
154 case AArch64::LDURDi:
155 case AArch64::LDURQi:
156 case AArch64::LDURWi:
157 case AArch64::LDURXi:
158 case AArch64::LDURSWi:
// Convenience overload: classify by the instruction's opcode.
163 static bool isUnscaledLdSt(MachineInstr *MI) {
164 return isUnscaledLdSt(MI->getOpcode());
167 // Scaling factor for unscaled load or store.
// Returns the access size in bytes for the given memory opcode; used both to
// scale unscaled offsets and to scale immediates for paired instructions.
// NOTE(review): the return statements between the case groups are elided from
// this listing; the groups presumably yield 1, 2, 4, 8 and 16 bytes in that
// order — TODO confirm against the full source.
168 static int getMemScale(MachineInstr *MI) {
169 switch (MI->getOpcode()) {
171 llvm_unreachable("Opcode has unknown scale!");
172 case AArch64::LDRBBui:
173 case AArch64::STRBBui:
175 case AArch64::LDRHHui:
176 case AArch64::STRHHui:
178 case AArch64::LDRSui:
179 case AArch64::LDURSi:
180 case AArch64::LDRSWui:
181 case AArch64::LDURSWi:
182 case AArch64::LDRWui:
183 case AArch64::LDURWi:
184 case AArch64::STRSui:
185 case AArch64::STURSi:
186 case AArch64::STRWui:
187 case AArch64::STURWi:
189 case AArch64::LDPSWi:
194 case AArch64::LDRDui:
195 case AArch64::LDURDi:
196 case AArch64::LDRXui:
197 case AArch64::LDURXi:
198 case AArch64::STRDui:
199 case AArch64::STURDi:
200 case AArch64::STRXui:
201 case AArch64::STURXi:
207 case AArch64::LDRQui:
208 case AArch64::LDURQi:
209 case AArch64::STRQui:
210 case AArch64::STURQi:
// Map a sign-extending load opcode (LDRSW*) to its non-sign-extending
// equivalent (LDRW*); all other valid ld/st opcodes map to themselves
// (their return appears elided from this listing — TODO confirm).
// If IsValidLdStrOpc is non-null it is set to whether Opc is a ld/st
// opcode this pass understands at all.
217 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
218 bool *IsValidLdStrOpc = nullptr) {
220 *IsValidLdStrOpc = true;
224 *IsValidLdStrOpc = false;
226 case AArch64::STRDui:
227 case AArch64::STURDi:
228 case AArch64::STRQui:
229 case AArch64::STURQi:
230 case AArch64::STRWui:
231 case AArch64::STURWi:
232 case AArch64::STRXui:
233 case AArch64::STURXi:
234 case AArch64::LDRDui:
235 case AArch64::LDURDi:
236 case AArch64::LDRQui:
237 case AArch64::LDURQi:
238 case AArch64::LDRWui:
239 case AArch64::LDURWi:
240 case AArch64::LDRXui:
241 case AArch64::LDURXi:
242 case AArch64::STRSui:
243 case AArch64::STURSi:
244 case AArch64::LDRSui:
245 case AArch64::LDURSi:
247 case AArch64::LDRSWui:
248 return AArch64::LDRWui;
249 case AArch64::LDURSWi:
250 return AArch64::LDURWi;
// Map a single (scaled or unscaled) load/store opcode to the paired
// LDP/STP opcode that mergePairedInsns will emit. Note both scaled and
// unscaled variants map to the same (scaled) paired opcode.
254 static unsigned getMatchingPairOpcode(unsigned Opc) {
257 llvm_unreachable("Opcode has no pairwise equivalent!");
258 case AArch64::STRSui:
259 case AArch64::STURSi:
260 return AArch64::STPSi;
261 case AArch64::STRDui:
262 case AArch64::STURDi:
263 return AArch64::STPDi;
264 case AArch64::STRQui:
265 case AArch64::STURQi:
266 return AArch64::STPQi;
267 case AArch64::STRWui:
268 case AArch64::STURWi:
269 return AArch64::STPWi;
270 case AArch64::STRXui:
271 case AArch64::STURXi:
272 return AArch64::STPXi;
273 case AArch64::LDRSui:
274 case AArch64::LDURSi:
275 return AArch64::LDPSi;
276 case AArch64::LDRDui:
277 case AArch64::LDURDi:
278 return AArch64::LDPDi;
279 case AArch64::LDRQui:
280 case AArch64::LDURQi:
281 return AArch64::LDPQi;
282 case AArch64::LDRWui:
283 case AArch64::LDURWi:
284 return AArch64::LDPWi;
285 case AArch64::LDRXui:
286 case AArch64::LDURXi:
287 return AArch64::LDPXi;
288 case AArch64::LDRSWui:
289 case AArch64::LDURSWi:
290 return AArch64::LDPSWi;
// Map a load/store (or load/store-pair) opcode to its pre-indexed
// (writeback-before) form, used when folding a base-register add/sub
// into the memory instruction in mergeUpdateInsn.
// NOTE(review): the case labels for the paired (LDP*/STP*) entries near the
// bottom appear elided from this listing — only the return statements are
// visible. Confirm against the full source.
294 static unsigned getPreIndexedOpcode(unsigned Opc) {
297 llvm_unreachable("Opcode has no pre-indexed equivalent!");
298 case AArch64::STRSui:
299 return AArch64::STRSpre;
300 case AArch64::STRDui:
301 return AArch64::STRDpre;
302 case AArch64::STRQui:
303 return AArch64::STRQpre;
304 case AArch64::STRBBui:
305 return AArch64::STRBBpre;
306 case AArch64::STRHHui:
307 return AArch64::STRHHpre;
308 case AArch64::STRWui:
309 return AArch64::STRWpre;
310 case AArch64::STRXui:
311 return AArch64::STRXpre;
312 case AArch64::LDRSui:
313 return AArch64::LDRSpre;
314 case AArch64::LDRDui:
315 return AArch64::LDRDpre;
316 case AArch64::LDRQui:
317 return AArch64::LDRQpre;
318 case AArch64::LDRBBui:
319 return AArch64::LDRBBpre;
320 case AArch64::LDRHHui:
321 return AArch64::LDRHHpre;
322 case AArch64::LDRWui:
323 return AArch64::LDRWpre;
324 case AArch64::LDRXui:
325 return AArch64::LDRXpre;
326 case AArch64::LDRSWui:
327 return AArch64::LDRSWpre;
329 return AArch64::LDPSpre;
330 case AArch64::LDPSWi:
331 return AArch64::LDPSWpre;
333 return AArch64::LDPDpre;
335 return AArch64::LDPQpre;
337 return AArch64::LDPWpre;
339 return AArch64::LDPXpre;
341 return AArch64::STPSpre;
343 return AArch64::STPDpre;
345 return AArch64::STPQpre;
347 return AArch64::STPWpre;
349 return AArch64::STPXpre;
// Map a load/store (or load/store-pair) opcode to its post-indexed
// (writeback-after) form; the mirror of getPreIndexedOpcode above.
// NOTE(review): as with getPreIndexedOpcode, the paired LDP*/STP* case labels
// appear elided from this listing — confirm against the full source.
353 static unsigned getPostIndexedOpcode(unsigned Opc) {
356 llvm_unreachable("Opcode has no post-indexed wise equivalent!");
357 case AArch64::STRSui:
358 return AArch64::STRSpost;
359 case AArch64::STRDui:
360 return AArch64::STRDpost;
361 case AArch64::STRQui:
362 return AArch64::STRQpost;
363 case AArch64::STRBBui:
364 return AArch64::STRBBpost;
365 case AArch64::STRHHui:
366 return AArch64::STRHHpost;
367 case AArch64::STRWui:
368 return AArch64::STRWpost;
369 case AArch64::STRXui:
370 return AArch64::STRXpost;
371 case AArch64::LDRSui:
372 return AArch64::LDRSpost;
373 case AArch64::LDRDui:
374 return AArch64::LDRDpost;
375 case AArch64::LDRQui:
376 return AArch64::LDRQpost;
377 case AArch64::LDRBBui:
378 return AArch64::LDRBBpost;
379 case AArch64::LDRHHui:
380 return AArch64::LDRHHpost;
381 case AArch64::LDRWui:
382 return AArch64::LDRWpost;
383 case AArch64::LDRXui:
384 return AArch64::LDRXpost;
385 case AArch64::LDRSWui:
386 return AArch64::LDRSWpost;
388 return AArch64::LDPSpost;
389 case AArch64::LDPSWi:
390 return AArch64::LDPSWpost;
392 return AArch64::LDPDpost;
394 return AArch64::LDPQpost;
396 return AArch64::LDPWpost;
398 return AArch64::LDPXpost;
400 return AArch64::STPSpost;
402 return AArch64::STPDpost;
404 return AArch64::STPQpost;
406 return AArch64::STPWpost;
408 return AArch64::STPXpost;
// True when MI is already a paired (LDP*/STP*) instruction; such
// instructions have operands (Rt, Rt2, base, offset) rather than
// (Rt, base, offset) — see the operand-index helpers below.
// NOTE(review): most case labels and the returns are elided from this
// listing; only LDPSWi is visible — confirm against the full source.
412 static bool isPairedLdSt(const MachineInstr *MI) {
413 switch (MI->getOpcode()) {
417 case AArch64::LDPSWi:
// Return the transfer-register operand of a load/store: operand 0 for a
// single instruction, operand PairedRegOp (0 = Rt, 1 = Rt2) for a pair.
431 static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
432 unsigned PairedRegOp = 0) {
433 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
434 unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
435 return MI->getOperand(Idx);
// Return the base-register operand: index 2 for paired instructions
// (after Rt/Rt2), index 1 for single ones (after Rt).
438 static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
439 unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
440 return MI->getOperand(Idx);
// Return the immediate-offset operand: index 3 for paired instructions,
// index 2 for single ones (always the operand after the base register).
443 static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
444 unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
445 return MI->getOperand(Idx);
// Replace the two single load/stores I and Paired with one LDP/STP,
// inserting at whichever of the two MergeForward selects, and (for the
// sign-extending case) patching in a KILL + SBFMXri to materialize the
// sign extension of the affected result. Returns the iterator after I.
// NOTE(review): several lines (Rt/Rt2 selection branches, the offset
// operand append, early-return paths) are elided from this listing —
// code below is kept verbatim; confirm against the full source.
448 MachineBasicBlock::iterator
449 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
450 MachineBasicBlock::iterator Paired,
451 const LdStPairFlags &Flags) {
452 MachineBasicBlock::iterator NextI = I;
454 // If NextI is the second of the two instructions to be merged, we need
455 // to skip one further. Either way we merge will invalidate the iterator,
456 // and we don't need to scan the new instruction, as it's a pairwise
457 // instruction, which we're not considering for further action anyway.
461 int SExtIdx = Flags.getSExtIdx();
463 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
464 bool IsUnscaled = isUnscaledLdSt(Opc);
466 IsUnscaled && EnableAArch64UnscaledMemOp ? getMemScale(I) : 1;
468 bool MergeForward = Flags.getMergeForward();
469 unsigned NewOpc = getMatchingPairOpcode(Opc);
470 // Insert our new paired instruction after whichever of the paired
471 // instructions MergeForward indicates.
472 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
473 // Also based on MergeForward is from where we copy the base register operand
474 // so we get the flags compatible with the input code.
475 const MachineOperand &BaseRegOp =
476 MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);
478 // Which register is Rt and which is Rt2 depends on the offset order.
479 MachineInstr *RtMI, *Rt2MI;
480 if (getLdStOffsetOp(I).getImm() ==
481 getLdStOffsetOp(Paired).getImm() + OffsetStride) {
484 // Here we swapped the assumption made for SExtIdx.
485 // I.e., we turn ldp I, Paired into ldp Paired, I.
486 // Update the index accordingly.
488 SExtIdx = (SExtIdx + 1) % 2;
// Scale the byte offset back down for the paired (element-scaled) form.
494 int OffsetImm = getLdStOffsetOp(RtMI).getImm();
495 if (IsUnscaled && EnableAArch64UnscaledMemOp)
496 OffsetImm /= OffsetStride;
498 // Construct the new instruction.
499 MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
500 I->getDebugLoc(), TII->get(NewOpc))
501 .addOperand(getLdStRegOp(RtMI))
502 .addOperand(getLdStRegOp(Rt2MI))
503 .addOperand(BaseRegOp)
507 // FIXME: Do we need/want to copy the mem operands from the source
508 // instructions? Probably. What uses them after this?
510 DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
511 DEBUG(I->print(dbgs()));
512 DEBUG(dbgs() << " ");
513 DEBUG(Paired->print(dbgs()));
514 DEBUG(dbgs() << " with instruction:\n ");
517 // Generate the sign extension for the proper result of the ldp.
518 // I.e., with X1, that would be:
519 // %W1<def> = KILL %W1, %X1<imp-def>
520 // %X1<def> = SBFMXri %X1<kill>, 0, 31
521 MachineOperand &DstMO = MIB->getOperand(SExtIdx);
522 // Right now, DstMO has the extended register, since it comes from an
524 unsigned DstRegX = DstMO.getReg();
525 // Get the W variant of that register.
526 unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
527 // Update the result of LDP to use the W instead of the X variant.
528 DstMO.setReg(DstRegW);
529 DEBUG(((MachineInstr *)MIB)->print(dbgs()));
530 DEBUG(dbgs() << "\n");
531 // Make the machine verifier happy by providing a definition for
533 // Insert this definition right after the generated LDP, i.e., before
535 MachineInstrBuilder MIBKill =
536 BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
537 TII->get(TargetOpcode::KILL), DstRegW)
539 .addReg(DstRegX, RegState::Define);
540 MIBKill->getOperand(2).setImplicit();
541 // Create the sign extension.
542 MachineInstrBuilder MIBSXTW =
543 BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
544 TII->get(AArch64::SBFMXri), DstRegX)
549 DEBUG(dbgs() << " Extend operand:\n ");
550 DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
551 DEBUG(dbgs() << "\n");
553 DEBUG(((MachineInstr *)MIB)->print(dbgs()));
554 DEBUG(dbgs() << "\n");
557 // Erase the old instructions.
558 I->eraseFromParent();
559 Paired->eraseFromParent();
564 /// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies: defs (and regmask clobbers) are accumulated into
/// ModifiedRegs, uses into UsedRegs, including all alias registers.
/// NOTE(review): the UsedRegs parameter, the regmask/isReg guards, and the
/// final UsedRegs.set line are elided from this listing — confirm against
/// the full source.
566 static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
568 const TargetRegisterInfo *TRI) {
569 for (const MachineOperand &MO : MI->operands()) {
571 ModifiedRegs.setBitsNotInMask(MO.getRegMask());
575 unsigned Reg = MO.getReg();
// A def clobbers the register and every register aliasing it.
577 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
578 ModifiedRegs.set(*AI);
580 assert(MO.isUse() && "Reg operand not a def and not a use?!?");
581 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
// Check whether Offset fits the 7-bit signed element-scaled immediate of a
// paired load/store ([-64, 63] after scaling).
// NOTE(review): the `if (IsUnscaled)` guard before the division appears
// elided from this listing — TODO confirm against the full source.
587 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
588 // Convert the byte-offset used by unscaled into an "element" offset used
589 // by the scaled pair load/store instructions.
591 Offset /= OffsetStride;
593 return Offset <= 63 && Offset >= -64;
596 // Do alignment, specialized to power of 2 and for signed ints,
597 // avoiding having to do a C-style cast from uint_64t to int when
598 // using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
599 // FIXME: Move this function to include/MathExtras.h?
// Rounds Num up to the next multiple of PowOf2; PowOf2 must be a power of
// two for the mask trick to be valid.
600 static int alignTo(int Num, int PowOf2) {
601 return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
// Conservatively decide whether two instructions may access overlapping
// memory: only pairs where at least one stores and both touch memory can
// alias, and the target hook can prove trivial disjointness.
// NOTE(review): the `return false;` bodies of the two early-outs are elided
// from this listing — confirm against the full source.
604 static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
605 const AArch64InstrInfo *TII) {
606 // One of the instructions must modify memory.
607 if (!MIa->mayStore() && !MIb->mayStore())
610 // Both instructions must be memory operations.
611 if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
614 return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
// True if MIa may alias any of the remembered memory instructions in
// MemInsns (the accesses seen between the two pairing candidates).
617 static bool mayAlias(MachineInstr *MIa,
618 SmallVectorImpl<MachineInstr *> &MemInsns,
619 const AArch64InstrInfo *TII) {
620 for (auto &MIb : MemInsns)
621 if (mayAlias(MIa, MIb, TII))
627 /// findMatchingInsn - Scan the instructions looking for a load/store that can
628 /// be combined with the current instruction into a load/store pair.
/// On success Flags records the merge direction and sign-extend index; the
/// scan stops at the Limit parameter, a modified base register, or a call.
/// NOTE(review): a number of lines (early `return E;` paths, `continue`
/// statements after the bail-out cases, loop-count increments) are elided
/// from this listing — code below is kept verbatim; confirm against the
/// full source.
629 MachineBasicBlock::iterator
630 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
631 LdStPairFlags &Flags,
633 MachineBasicBlock::iterator E = I->getParent()->end();
634 MachineBasicBlock::iterator MBBI = I;
635 MachineInstr *FirstMI = I;
638 unsigned Opc = FirstMI->getOpcode();
639 bool MayLoad = FirstMI->mayLoad();
640 bool IsUnscaled = isUnscaledLdSt(FirstMI);
641 unsigned Reg = getLdStRegOp(FirstMI).getReg();
642 unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
643 int Offset = getLdStOffsetOp(FirstMI).getImm();
645 // Early exit if the first instruction modifies the base register.
646 // e.g., ldr x0, [x0]
647 if (FirstMI->modifiesRegister(BaseReg, TRI))
650 // Early exit if the offset if not possible to match. (6 bits of positive
651 // range, plus allow an extra one in case we find a later insn that matches
654 IsUnscaled && EnableAArch64UnscaledMemOp ? getMemScale(FirstMI) : 1;
655 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
658 // Track which registers have been modified and used between the first insn
659 // (inclusive) and the second insn.
660 BitVector ModifiedRegs, UsedRegs;
661 ModifiedRegs.resize(TRI->getNumRegs());
662 UsedRegs.resize(TRI->getNumRegs());
664 // Remember any instructions that read/write memory between FirstMI and MI.
665 SmallVector<MachineInstr *, 4> MemInsns;
667 for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
668 MachineInstr *MI = MBBI;
669 // Skip DBG_VALUE instructions. Otherwise debug info can affect the
670 // optimization by changing how far we scan.
671 if (MI->isDebugValue())
674 // Now that we know this is a real instruction, count it.
677 bool CanMergeOpc = Opc == MI->getOpcode();
678 Flags.setSExtIdx(-1);
680 bool IsValidLdStrOpc;
681 unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
682 assert(IsValidLdStrOpc &&
683 "Given Opc should be a Load or Store with an immediate");
684 // Opc will be the first instruction in the pair.
685 Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
686 CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
689 if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
690 assert(MI->mayLoadOrStore() && "Expected memory operation.");
691 // If we've found another instruction with the same opcode, check to see
692 // if the base and offset are compatible with our starting instruction.
693 // These instructions all have scaled immediate operands, so we just
694 // check for +1/-1. Make sure to check the new instruction offset is
695 // actually an immediate and not a symbolic reference destined for
698 // Pairwise instructions have a 7-bit signed offset field. Single insns
699 // have a 12-bit unsigned offset field. To be a valid combine, the
700 // final offset must be in range.
701 unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
702 int MIOffset = getLdStOffsetOp(MI).getImm();
703 if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
704 (Offset + OffsetStride == MIOffset))) {
705 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
706 // If this is a volatile load/store that otherwise matched, stop looking
707 // as something is going on that we don't have enough information to
708 // safely transform. Similarly, stop if we see a hint to avoid pairs.
709 if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
711 // If the resultant immediate offset of merging these instructions
712 // is out of range for a pairwise instruction, bail and keep looking.
713 bool MIIsUnscaled = isUnscaledLdSt(MI);
714 if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
715 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
716 MemInsns.push_back(MI);
719 // If the alignment requirements of the paired (scaled) instruction
720 // can't express the offset of the unscaled input, bail and keep
722 if (IsUnscaled && EnableAArch64UnscaledMemOp &&
723 (alignTo(MinOffset, OffsetStride) != MinOffset)) {
724 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
725 MemInsns.push_back(MI);
728 // If the destination register of the loads is the same register, bail
729 // and keep looking. A load-pair instruction with both destination
730 // registers the same is UNPREDICTABLE and will result in an exception.
731 if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
732 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
733 MemInsns.push_back(MI);
737 // If the Rt of the second instruction was not modified or used between
738 // the two instructions and none of the instructions between the second
739 // and first alias with the second, we can combine the second into the
741 if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
742 !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
743 !mayAlias(MI, MemInsns, TII)) {
744 Flags.setMergeForward(false);
748 // Likewise, if the Rt of the first instruction is not modified or used
749 // between the two instructions and none of the instructions between the
750 // first and the second alias with the first, we can combine the first
752 if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
753 !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
754 !mayAlias(FirstMI, MemInsns, TII)) {
755 Flags.setMergeForward(true);
758 // Unable to combine these instructions due to interference in between.
763 // If the instruction wasn't a matching load or store. Stop searching if we
764 // encounter a call instruction that might modify memory.
768 // Update modified / uses register lists.
769 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
771 // Otherwise, if the base register is modified, we have no match, so
773 if (ModifiedRegs[BaseReg])
776 // Update list of instructions that read/write memory.
777 if (MI->mayLoadOrStore())
778 MemInsns.push_back(MI);
// Fold a base-register ADDXri/SUBXri (found by one of the
// findMatchingUpdateInsn* scans) into I, rewriting it to the pre- or
// post-indexed form selected by IsPreIdx, then erase both originals.
// Returns the iterator after the merged instruction.
// NOTE(review): several lines (the NextI skip, the immediate append for the
// non-paired form, statistic updates, final return) are elided from this
// listing — code below kept verbatim; confirm against the full source.
783 MachineBasicBlock::iterator
784 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
785 MachineBasicBlock::iterator Update,
787 assert((Update->getOpcode() == AArch64::ADDXri ||
788 Update->getOpcode() == AArch64::SUBXri) &&
789 "Unexpected base register update instruction to merge!");
790 MachineBasicBlock::iterator NextI = I;
791 // Return the instruction following the merged instruction, which is
792 // the instruction following our unmerged load. Unless that's the add/sub
793 // instruction we're merging, in which case it's the one after that.
794 if (++NextI == Update)
797 int Value = Update->getOperand(2).getImm();
798 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
799 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
// A SUB writes the base back by a negative amount.
800 if (Update->getOpcode() == AArch64::SUBXri)
803 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
804 : getPostIndexedOpcode(I->getOpcode());
805 MachineInstrBuilder MIB;
806 if (!isPairedLdSt(I)) {
807 // Non-paired instruction.
808 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
809 .addOperand(getLdStRegOp(Update))
810 .addOperand(getLdStRegOp(I))
811 .addOperand(getLdStBaseOp(I))
814 // Paired instruction.
815 int Scale = getMemScale(I);
816 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
817 .addOperand(getLdStRegOp(Update))
818 .addOperand(getLdStRegOp(I, 0))
819 .addOperand(getLdStRegOp(I, 1))
820 .addOperand(getLdStBaseOp(I))
821 .addImm(Value / Scale);
826 DEBUG(dbgs() << "Creating pre-indexed load/store.");
828 DEBUG(dbgs() << "Creating post-indexed load/store.");
829 DEBUG(dbgs() << " Replacing instructions:\n ");
830 DEBUG(I->print(dbgs()));
831 DEBUG(dbgs() << " ");
832 DEBUG(Update->print(dbgs()));
833 DEBUG(dbgs() << " with instruction:\n ");
834 DEBUG(((MachineInstr *)MIB)->print(dbgs()));
835 DEBUG(dbgs() << "\n");
837 // Erase the old instructions for the block.
838 I->eraseFromParent();
839 Update->eraseFromParent();
// Decide whether MI is an ADDXri/SUBXri of BaseReg (same source and dest)
// whose immediate can legally be folded into MemMI as a pre-/post-index
// writeback. Offset == 0 means "any amount"; otherwise the update amount
// must equal Offset.
// NOTE(review): the default case, several return statements, and the
// SUB-negation line are elided from this listing — code below kept
// verbatim; confirm against the full source.
844 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
846 unsigned BaseReg, int Offset) {
847 switch (MI->getOpcode()) {
850 case AArch64::SUBXri:
851 // Negate the offset for a SUB instruction.
854 case AArch64::ADDXri:
855 // Make sure it's a vanilla immediate operand, not a relocation or
856 // anything else we can't handle.
857 if (!MI->getOperand(2).isImm())
859 // Watch out for 1 << 12 shifted value.
860 if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
863 // The update instruction source and destination register must be the
864 // same as the load/store base register.
865 if (MI->getOperand(0).getReg() != BaseReg ||
866 MI->getOperand(1).getReg() != BaseReg)
869 bool IsPairedInsn = isPairedLdSt(MemMI);
870 int UpdateOffset = MI->getOperand(2).getImm();
871 // For non-paired load/store instructions, the immediate must fit in a
872 // signed 9-bit integer.
873 if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
876 // For paired load/store instructions, the immediate must be a multiple of
877 // the scaling factor. The scaled offset must also fit into a signed 7-bit
880 int Scale = getMemScale(MemMI);
881 if (UpdateOffset % Scale != 0)
884 int ScaledOffset = UpdateOffset / Scale;
885 if (ScaledOffset > 64 || ScaledOffset < -64)
889 // If we have a non-zero Offset, we check that it matches the amount
890 // we're adding to the register.
891 if (!Offset || Offset == MI->getOperand(2).getImm())
// Scan forward from I for an ADDXri/SUBXri of the base register that can be
// folded as a post-index update; UnscaledOffset is the byte offset the
// memory instruction must already have for the fold to be valid. Returns
// the update instruction or E.
// NOTE(review): `return E;` bodies, `continue` after the DBG_VALUE skip,
// the Count increment, and the trailing return are elided from this
// listing — code below kept verbatim; confirm against the full source.
898 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
899 MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
900 MachineBasicBlock::iterator E = I->getParent()->end();
901 MachineInstr *MemMI = I;
902 MachineBasicBlock::iterator MBBI = I;
904 unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
905 int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
907 // If the base register overlaps a destination register, we can't
909 bool IsPairedInsn = isPairedLdSt(MemMI);
910 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
911 unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
912 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
916 // Scan forward looking for post-index opportunities. Updating instructions
917 // can't be formed if the memory instruction doesn't have the offset we're
919 if (MIUnscaledOffset != UnscaledOffset)
922 // Track which registers have been modified and used between the first insn
923 // (inclusive) and the second insn.
924 BitVector ModifiedRegs, UsedRegs;
925 ModifiedRegs.resize(TRI->getNumRegs());
926 UsedRegs.resize(TRI->getNumRegs());
928 for (unsigned Count = 0; MBBI != E; ++MBBI) {
929 MachineInstr *MI = MBBI;
930 // Skip DBG_VALUE instructions. Otherwise debug info can affect the
931 // optimization by changing how far we scan.
932 if (MI->isDebugValue())
935 // Now that we know this is a real instruction, count it.
938 // If we found a match, return it.
939 if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
942 // Update the status of what the instruction clobbered and used.
943 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
945 // Otherwise, if the base register is used or modified, we have no match, so
947 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
// Scan backward from I for an ADDXri/SUBXri of the base register that can
// be folded as a pre-index update. Only legal when the memory offset is
// already zero (the fold supplies the whole offset). Returns the update
// instruction or E.
// NOTE(review): `return E;` bodies, `continue` statements, and the trailing
// return are elided from this listing — code below kept verbatim; confirm
// against the full source.
953 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
954 MachineBasicBlock::iterator I, unsigned Limit) {
955 MachineBasicBlock::iterator B = I->getParent()->begin();
956 MachineBasicBlock::iterator E = I->getParent()->end();
957 MachineInstr *MemMI = I;
958 MachineBasicBlock::iterator MBBI = I;
960 unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
961 int Offset = getLdStOffsetOp(MemMI).getImm();
963 // If the load/store is the first instruction in the block, there's obviously
964 // not any matching update. Ditto if the memory offset isn't zero.
965 if (MBBI == B || Offset != 0)
967 // If the base register overlaps a destination register, we can't
969 bool IsPairedInsn = isPairedLdSt(MemMI);
970 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
971 unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
972 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
976 // Track which registers have been modified and used between the first insn
977 // (inclusive) and the second insn.
978 BitVector ModifiedRegs, UsedRegs;
979 ModifiedRegs.resize(TRI->getNumRegs());
980 UsedRegs.resize(TRI->getNumRegs());
982 for (unsigned Count = 0; MBBI != B; --MBBI) {
983 MachineInstr *MI = MBBI;
984 // Skip DBG_VALUE instructions. Otherwise debug info can affect the
985 // optimization by changing how far we scan.
986 if (MI->isDebugValue())
989 // Now that we know this is a real instruction, count it.
992 // If we found a match, return it.
993 if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
996 // Update the status of what the instruction clobbered and used.
997 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
999 // Otherwise, if the base register is used or modified, we have no match, so
1001 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
/// Run both peephole transformations over a single basic block and report
/// whether anything was changed. Pass 1 merges pairable scaled/unscaled
/// loads and stores into ldp/stp pair instructions; pass 2 folds a nearby
/// base-register add/sub into the load/store as a pre- or post-indexed
/// (writeback) addressing form.
/// NOTE(review): this chunk is an elided excerpt of the file — several
/// control-flow lines (breaks, continues, closing braces, the final
/// `return Modified;`) are not visible here.
1007 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
1008 bool Modified = false;
1009 // Two transformations to do here:
1010 // 1) Find loads and stores that can be merged into a single load or store
1011 // pair instruction.
1017 // 2) Find base register updates that can be merged into the load or store
1018 // as a base-reg writeback.
// Pass 1: pair formation. Walk every instruction and, for each pairable
// load/store opcode, scan ahead for a mergeable partner.
1025 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1027 MachineInstr *MI = MBBI;
1028 switch (MI->getOpcode()) {
1030 // Just move on to the next instruction.
1033 // Scaled instructions.
1034 case AArch64::STRSui:
1035 case AArch64::STRDui:
1036 case AArch64::STRQui:
1037 case AArch64::STRXui:
1038 case AArch64::STRWui:
1039 case AArch64::LDRSui:
1040 case AArch64::LDRDui:
1041 case AArch64::LDRQui:
1042 case AArch64::LDRXui:
1043 case AArch64::LDRWui:
1044 case AArch64::LDRSWui:
1045 // Unscaled instructions.
1046 case AArch64::STURSi:
1047 case AArch64::STURDi:
1048 case AArch64::STURQi:
1049 case AArch64::STURWi:
1050 case AArch64::STURXi:
1051 case AArch64::LDURSi:
1052 case AArch64::LDURDi:
1053 case AArch64::LDURQi:
1054 case AArch64::LDURWi:
1055 case AArch64::LDURXi:
1056 case AArch64::LDURSWi: {
1057 // If this is a volatile load/store, don't mess with it.
1058 if (MI->hasOrderedMemoryRef()) {
1062 // Make sure this is a reg+imm (as opposed to an address reloc).
1063 if (!getLdStOffsetOp(MI).isImm()) {
1067 // Check if this load/store has a hint to avoid pair formation.
1068 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
1069 if (TII->isLdStPairSuppressed(MI)) {
1073 // Look ahead up to ScanLimit instructions for a pairable instruction.
1074 LdStPairFlags Flags;
1075 MachineBasicBlock::iterator Paired =
1076 findMatchingInsn(MBBI, Flags, ScanLimit);
1079 if (isUnscaledLdSt(MI))
1080 ++NumUnscaledPairCreated;
1082 // Merge the loads into a pair. Keeping the iterator straight is a
1083 // pain, so we let the merge routine tell us what the next instruction
1084 // is after it's done mucking about.
1085 MBBI = mergePairedInsns(MBBI, Paired, Flags);
1092 // FIXME: Do the other instructions.
// Pass 2: writeback folding. Re-walk the block (including pairs created by
// pass 1) looking for add/sub base-register updates to fold in.
1096 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
1098 MachineInstr *MI = MBBI;
1099 // Do update merging. It's simpler to keep this separate from the above
1100 // switch, though not strictly necessary.
1101 unsigned Opc = MI->getOpcode();
1104 // Just move on to the next instruction.
1107 // Scaled instructions.
1108 case AArch64::STRSui:
1109 case AArch64::STRDui:
1110 case AArch64::STRQui:
1111 case AArch64::STRXui:
1112 case AArch64::STRWui:
1113 case AArch64::STRHHui:
1114 case AArch64::STRBBui:
1115 case AArch64::LDRSui:
1116 case AArch64::LDRDui:
1117 case AArch64::LDRQui:
1118 case AArch64::LDRXui:
1119 case AArch64::LDRWui:
1120 case AArch64::LDRHHui:
1121 case AArch64::LDRBBui:
1122 // Unscaled instructions.
1123 case AArch64::STURSi:
1124 case AArch64::STURDi:
1125 case AArch64::STURQi:
1126 case AArch64::STURWi:
1127 case AArch64::STURXi:
1128 case AArch64::LDURSi:
1129 case AArch64::LDURDi:
1130 case AArch64::LDURQi:
1131 case AArch64::LDURWi:
1132 case AArch64::LDURXi:
1133 // Paired instructions.
1134 case AArch64::LDPSi:
1135 case AArch64::LDPSWi:
1136 case AArch64::LDPDi:
1137 case AArch64::LDPQi:
1138 case AArch64::LDPWi:
1139 case AArch64::LDPXi:
1140 case AArch64::STPSi:
1141 case AArch64::STPDi:
1142 case AArch64::STPQi:
1143 case AArch64::STPWi:
1144 case AArch64::STPXi: {
1145 // Make sure this is a reg+imm (as opposed to an address reloc).
1146 if (!getLdStOffsetOp(MI).isImm()) {
1150 // Look forward to try to form a post-index instruction. For example,
1152 // add x20, x20, #32
1154 // ldr x0, [x20], #32
1155 MachineBasicBlock::iterator Update =
1156 findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
1158 // Merge the update into the ld/st.
1159 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
1164 // Don't know how to handle pre/post-index versions, so move to the next
1166 if (isUnscaledLdSt(Opc)) {
1171 // Look back to try to find a pre-index instruction. For example,
1175 // ldr x1, [x0, #8]!
1176 Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
1178 // Merge the update into the ld/st.
1179 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1184 // The immediate in the load/store is scaled by the size of the memory
1185 // operation. The immediate in the add we're looking for,
1186 // however, is not, so adjust here.
1187 int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
1189 // Look forward to try to find a post-index instruction. For example,
1190 // ldr x1, [x0, #64]
1193 // ldr x1, [x0, #64]!
// Note: the forward scan here uses the byte offset already encoded in the
// load/store, so a matching add of that amount folds as a pre-index form.
1194 Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
1196 // Merge the update into the ld/st.
1197 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
1203 // Nothing found. Just move to the next instruction.
1207 // FIXME: Do the other instructions.
/// Pass entry point: cache the subtarget's instruction and register info,
/// then run optimizeBlock over every basic block, accumulating whether any
/// change was made.
/// NOTE(review): this excerpt is elided — the trailing `return Modified;`
/// and closing brace are not visible in this chunk.
1214 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
1215 TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
1216 TRI = Fn.getSubtarget().getRegisterInfo();
1218 bool Modified = false;
1219 for (auto &MBB : Fn)
1220 Modified |= optimizeBlock(MBB);
1225 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
1226 // loads and stores near one another?
1228 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
1229 /// load / store optimization pass.
1230 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
1231 return new AArch64LoadStoreOpt();