X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FAArch64%2FAArch64LoadStoreOptimizer.cpp;h=6ef4c269d8fe9d4f18dad04b714da518d90160c3;hp=22a263e3ac53818dc8ed0aba7818e281f8947b80;hb=de4969d37c17d57e87ce0365071922232dc323e0;hpb=a43aca40330cd266decea50f173bcfa4f79c0650

diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 22a263e3ac5..6ef4c269d8f 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -41,15 +41,11 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded");
 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
 STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
+STATISTIC(NumSmallTypeMerged, "Number of small type loads merged");
 
 static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                    cl::init(20), cl::Hidden);
 
-// Place holder while testing unscaled load/store combining
-static cl::opt<bool> EnableAArch64UnscaledMemOp(
-    "aarch64-unscaled-mem-op", cl::Hidden,
-    cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));
-
 namespace llvm {
 void initializeAArch64LoadStoreOptPass(PassRegistry &);
 }
@@ -109,7 +105,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // pre or post indexed addressing with writeback. Scan forwards.
   MachineBasicBlock::iterator
   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
-                                int Value);
+                                int UnscaledOffset);
 
   // Scan the instruction list to find a base register update that can
   // be combined with the current instruction (a load or store) using
@@ -127,7 +123,14 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
-  bool optimizeBlock(MachineBasicBlock &MBB);
+  // Find and merge foldable ldr/str instructions.
+  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+
+  // Check if converting two narrow loads into a single wider load with
+  // bitfield extracts could be enabled.
+  bool enableNarrowLdMerge(MachineFunction &Fn);
+
+  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
 
@@ -156,6 +159,7 @@ static bool isUnscaledLdSt(unsigned Opc) {
   case AArch64::LDURWi:
   case AArch64::LDURXi:
   case AArch64::LDURSWi:
+  case AArch64::LDURHHi:
     return true;
   }
 }
@@ -164,44 +168,68 @@ static bool isUnscaledLdSt(MachineInstr *MI) {
   return isUnscaledLdSt(MI->getOpcode());
 }
 
-// Size in bytes of the data moved by an unscaled load or store
-static int getMemSize(MachineInstr *MI) {
+static bool isSmallTypeLdMerge(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+    return true;
+    // FIXME: Add other instructions (e.g, LDRBBui, LDURSHWi, LDRSHWui, etc.).
+  }
+}
+static bool isSmallTypeLdMerge(MachineInstr *MI) {
+  return isSmallTypeLdMerge(MI->getOpcode());
+}
+
+// Scaling factor for unscaled load or store.
+static int getMemScale(MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default:
-    llvm_unreachable("Opcode has unknown size!");
+    llvm_unreachable("Opcode has unknown scale!");
+  case AArch64::LDRBBui:
+  case AArch64::STRBBui:
+    return 1;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+  case AArch64::STRHHui:
+    return 2;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
   case AArch64::STRSui:
   case AArch64::STURSi:
-    return 4;
-  case AArch64::STRDui:
-  case AArch64::STURDi:
-    return 8;
-  case AArch64::STRQui:
-  case AArch64::STURQi:
-    return 16;
   case AArch64::STRWui:
   case AArch64::STURWi:
-    return 4;
-  case AArch64::STRXui:
-  case AArch64::STURXi:
-    return 8;
-  case AArch64::LDRSui:
-  case AArch64::LDURSi:
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPWi:
+  case AArch64::STPSi:
+  case AArch64::STPWi:
     return 4;
   case AArch64::LDRDui:
   case AArch64::LDURDi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDPDi:
+  case AArch64::LDPXi:
+  case AArch64::STPDi:
+  case AArch64::STPXi:
     return 8;
   case AArch64::LDRQui:
   case AArch64::LDURQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
    return 16;
-  case AArch64::LDRWui:
-  case AArch64::LDURWi:
-    return 4;
-  case AArch64::LDRXui:
-  case AArch64::LDURXi:
-    return 8;
-  case AArch64::LDRSWui:
-  case AArch64::LDURSWi:
-    return 4;
   }
 }
 
@@ -234,6 +262,8 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
   case AArch64::STURSi:
   case AArch64::LDRSui:
   case AArch64::LDURSi:
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
     return Opc;
   case AArch64::LDRSWui:
     return AArch64::LDRWui;
@@ -279,6 +309,10 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
   case AArch64::LDRSWui:
   case AArch64::LDURSWi:
     return AArch64::LDPSWi;
+  case AArch64::LDRHHui:
+    return AArch64::LDRWui;
+  case AArch64::LDURHHi:
+    return AArch64::LDURWi;
   }
 }
 
@@ -292,6 +326,10 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
     return AArch64::STRDpre;
   case AArch64::STRQui:
     return AArch64::STRQpre;
+  case AArch64::STRBBui:
+    return AArch64::STRBBpre;
+  case AArch64::STRHHui:
+    return AArch64::STRHHpre;
   case AArch64::STRWui:
     return AArch64::STRWpre;
   case AArch64::STRXui:
@@ -302,6 +340,10 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
     return AArch64::LDRDpre;
   case AArch64::LDRQui:
     return AArch64::LDRQpre;
+  case AArch64::LDRBBui:
+    return AArch64::LDRBBpre;
+  case AArch64::LDRHHui:
+    return AArch64::LDRHHpre;
   case AArch64::LDRWui:
     return AArch64::LDRWpre;
   case AArch64::LDRXui:
@@ -310,6 +352,8 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
     return AArch64::LDRSWpre;
   case AArch64::LDPSi:
     return AArch64::LDPSpre;
+  case AArch64::LDPSWi:
+    return AArch64::LDPSWpre;
   case AArch64::LDPDi:
     return AArch64::LDPDpre;
   case AArch64::LDPQi:
@@ -341,6 +385,10 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
     return AArch64::STRDpost;
   case AArch64::STRQui:
     return AArch64::STRQpost;
+  case AArch64::STRBBui:
+    return AArch64::STRBBpost;
+  case AArch64::STRHHui:
+    return AArch64::STRHHpost;
   case AArch64::STRWui:
     return AArch64::STRWpost;
   case AArch64::STRXui:
@@ -351,6 +399,10 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
     return AArch64::LDRDpost;
   case AArch64::LDRQui:
     return AArch64::LDRQpost;
+  case AArch64::LDRBBui:
+    return AArch64::LDRBBpost;
+  case AArch64::LDRHHui:
+    return AArch64::LDRHHpost;
   case AArch64::LDRWui:
     return AArch64::LDRWpost;
   case AArch64::LDRXui:
@@ -359,6 +411,8 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
     return AArch64::LDRSWpost;
   case AArch64::LDPSi:
     return AArch64::LDPSpost;
+  case AArch64::LDPSWi:
+    return AArch64::LDPSWpost;
   case AArch64::LDPDi:
     return AArch64::LDPDpost;
   case AArch64::LDPQi:
@@ -385,6 +439,7 @@ static bool isPairedLdSt(const MachineInstr *MI) {
   default:
     return false;
   case AArch64::LDPSi:
+  case AArch64::LDPSWi:
   case AArch64::LDPDi:
   case AArch64::LDPQi:
   case AArch64::LDPWi:
@@ -415,6 +470,21 @@ static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
   return MI->getOperand(Idx);
 }
 
+// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
+                                   MachineInstr *Op1) {
+  assert(MI->memoperands_empty() && "expected a new machineinstr");
+  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
+                      (Op1->memoperands_end() - Op1->memoperands_begin());
+
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
+  MachineSDNode::mmo_iterator MemEnd =
+      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
+  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
+  MI->setMemRefs(MemBegin, MemEnd);
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                       MachineBasicBlock::iterator Paired,
@@ -432,8 +502,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
   unsigned Opc =
       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
   bool IsUnscaled = isUnscaledLdSt(Opc);
-  int OffsetStride =
-      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;
+  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;
 
   bool MergeForward = Flags.getMergeForward();
   unsigned NewOpc = getMatchingPairOpcode(Opc);
@@ -460,9 +529,80 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     RtMI = I;
     Rt2MI = Paired;
   }
-  // Handle Unscaled
+
   int OffsetImm = getLdStOffsetOp(RtMI).getImm();
-  if (IsUnscaled && EnableAArch64UnscaledMemOp)
+
+  if (isSmallTypeLdMerge(Opc)) {
+    // Change the scaled offset from small to large type.
+    if (!IsUnscaled)
+      OffsetImm /= 2;
+    MachineInstr *RtNewDest = MergeForward ? I : Paired;
+    // Construct the new load instruction.
+    // FIXME: currently we support only halfword unsigned load. We need to
+    // handle byte type, signed, and store instructions as well.
+    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
+    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                       TII->get(NewOpc))
+                   .addOperand(getLdStRegOp(RtNewDest))
+                   .addOperand(BaseRegOp)
+                   .addImm(OffsetImm);
+
+    // Copy MachineMemOperands from the original loads.
+    concatenateMemOperands(NewMemMI, I, Paired);
+
+    DEBUG(
+        dbgs()
+        << "Creating the new load and extract. Replacing instructions:\n ");
+    DEBUG(I->print(dbgs()));
+    DEBUG(dbgs() << " ");
+    DEBUG(Paired->print(dbgs()));
+    DEBUG(dbgs() << " with instructions:\n ");
+    DEBUG((NewMemMI)->print(dbgs()));
+
+    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
+    if (ExtDestMI == Rt2MI) {
+      // Create the bitfield extract for high half.
+      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::UBFMWri))
+                      .addOperand(getLdStRegOp(Rt2MI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(16)
+                      .addImm(31);
+      // Create the bitfield extract for low half.
+      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::ANDWri))
+                      .addOperand(getLdStRegOp(RtMI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(15);
+    } else {
+      // Create the bitfield extract for low half.
+      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::ANDWri))
+                      .addOperand(getLdStRegOp(RtMI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(15);
+      // Create the bitfield extract for high half.
+      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
+                          TII->get(AArch64::UBFMWri))
+                      .addOperand(getLdStRegOp(Rt2MI))
+                      .addReg(getLdStRegOp(RtNewDest).getReg())
+                      .addImm(16)
+                      .addImm(31);
+    }
+    DEBUG(dbgs() << " ");
+    DEBUG((BitExtMI1)->print(dbgs()));
+    DEBUG(dbgs() << " ");
+    DEBUG((BitExtMI2)->print(dbgs()));
+    DEBUG(dbgs() << "\n");
+
+    // Erase the old instructions.
+    I->eraseFromParent();
+    Paired->eraseFromParent();
+    return NextI;
+  }
+
+  // Handle Unscaled
+  if (IsUnscaled)
     OffsetImm /= OffsetStride;
 
   // Construct the new instruction.
@@ -598,8 +738,7 @@ static bool mayAlias(MachineInstr *MIa,
 /// be combined with the current instruction into a load/store pair.
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
-                                      LdStPairFlags &Flags,
-                                      unsigned Limit) {
+                                      LdStPairFlags &Flags, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineBasicBlock::iterator MBBI = I;
   MachineInstr *FirstMI = I;
@@ -620,9 +759,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
   // Early exit if the offset if not possible to match. (6 bits of positive
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
-  int OffsetStride =
-      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
-  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
+  if (!isSmallTypeLdMerge(Opc) &&
+      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
     return E;
 
   // Track which registers have been modified and used between the first insn
@@ -681,19 +820,32 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
         // If the resultant immediate offset of merging these instructions
         // is out of range for a pairwise instruction, bail and keep looking.
         bool MIIsUnscaled = isUnscaledLdSt(MI);
-        if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
+        bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode());
+        if (!IsSmallTypeLd &&
+            !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }
-        // If the alignment requirements of the paired (scaled) instruction
-        // can't express the offset of the unscaled input, bail and keep
-        // looking.
-        if (IsUnscaled && EnableAArch64UnscaledMemOp &&
-            (alignTo(MinOffset, OffsetStride) != MinOffset)) {
-          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
-          MemInsns.push_back(MI);
-          continue;
+
+        if (IsSmallTypeLd) {
+          // If the alignment requirements of the larger type scaled load
+          // instruction can't express the scaled offset of the smaller type
+          // input, bail and keep looking.
+          if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
+            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+            MemInsns.push_back(MI);
+            continue;
+          }
+        } else {
+          // If the alignment requirements of the paired (scaled) instruction
+          // can't express the offset of the unscaled input, bail and keep
+          // looking.
+          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
+            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+            MemInsns.push_back(MI);
+            continue;
+          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
@@ -782,8 +934,7 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
               .addImm(Value);
   } else {
     // Paired instruction.
-    const MachineFunction &MF = *I->getParent()->getParent();
-    int Scale = TII->getRegClass(I->getDesc(), 0, TRI, MF)->getSize();
+    int Scale = getMemScale(I);
     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
               .addOperand(getLdStRegOp(Update))
               .addOperand(getLdStRegOp(I, 0))
@@ -848,8 +999,7 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
     // the scaling factor. The scaled offset must also fit into a signed 7-bit
     // integer.
     if (IsPairedInsn) {
-      const MachineFunction &MF = *MemMI->getParent()->getParent();
-      int Scale = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+      int Scale = getMemScale(MemMI);
       if (UpdateOffset % Scale != 0)
         break;
 
@@ -868,15 +1018,19 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
 }
 
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
-    MachineBasicBlock::iterator I, unsigned Limit, int Value) {
+    MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineInstr *MemMI = I;
   MachineBasicBlock::iterator MBBI = I;
-  const MachineFunction &MF = *MemMI->getParent()->getParent();
 
   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
-  int Offset = getLdStOffsetOp(MemMI).getImm() *
-               TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
+  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
+
+  // Scan forward looking for post-index opportunities. Updating instructions
+  // can't be formed if the memory instruction doesn't have the offset we're
+  // looking for.
+  if (MIUnscaledOffset != UnscaledOffset)
+    return E;
 
   // If the base register overlaps a destination register, we can't
   // merge the update.
@@ -887,12 +1041,6 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
       return E;
   }
 
-  // Scan forward looking for post-index opportunities.
-  // Updating instructions can't be formed if the memory insn already
-  // has an offset other than the value we're looking for.
-  if (Offset != Value)
-    return E;
-
   // Track which registers have been modified and used between the first insn
   // (inclusive) and the second insn.
   BitVector ModifiedRegs, UsedRegs;
@@ -910,7 +1058,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
     ++Count;
 
     // If we found a match, return it.
-    if (isMatchingUpdateInsn(I, MI, BaseReg, Value))
+    if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
       return MBBI;
 
     // Update the status of what the instruction clobbered and used.
@@ -930,11 +1078,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineInstr *MemMI = I;
   MachineBasicBlock::iterator MBBI = I;
-  const MachineFunction &MF = *MemMI->getParent()->getParent();
 
   unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
   int Offset = getLdStOffsetOp(MemMI).getImm();
-  unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize();
 
   // If the load/store is the first instruction in the block, there's obviously
   // not any matching update. Ditto if the memory offset isn't zero.
@@ -966,7 +1112,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
     ++Count;
 
     // If we found a match, return it.
-    if (isMatchingUpdateInsn(I, MI, BaseReg, RegSize))
+    if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
       return MBBI;
 
     // Update the status of what the instruction clobbered and used.
@@ -980,17 +1126,65 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
-bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+bool AArch64LoadStoreOpt::tryToMergeLdStInst(
+    MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = MBBI;
+  MachineBasicBlock::iterator E = MI->getParent()->end();
+  // If this is a volatile load/store, don't mess with it.
+  if (MI->hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm (as opposed to an address reloc).
+  if (!getLdStOffsetOp(MI).isImm())
+    return false;
+
+  // Check if this load/store has a hint to avoid pair formation.
+  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+  if (TII->isLdStPairSuppressed(MI))
+    return false;
+
+  // Look ahead up to ScanLimit instructions for a pairable instruction.
+  LdStPairFlags Flags;
+  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
+  if (Paired != E) {
+    if (isSmallTypeLdMerge(MI)) {
+      ++NumSmallTypeMerged;
+    } else {
+      ++NumPairCreated;
+      if (isUnscaledLdSt(MI))
+        ++NumUnscaledPairCreated;
+    }
+
+    // Merge the loads into a pair. Keeping the iterator straight is a
+    // pain, so we let the merge routine tell us what the next instruction
+    // is after it's done mucking about.
+    MBBI = mergePairedInsns(MBBI, Paired, Flags);
+    return true;
+  }
+  return false;
+}
+
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
+                                        bool enableNarrowLdOpt) {
   bool Modified = false;
-  // Two tranformations to do here:
-  // 1) Find loads and stores that can be merged into a single load or store
+  // Three tranformations to do here:
+  // 1) Find halfword loads that can be merged into a single 32-bit word load
+  //    with bitfield extract instructions.
+  //      e.g.,
+  //        ldrh w0, [x2]
+  //        ldrh w1, [x2, #2]
+  //        ; becomes
+  //        ldr w0, [x2]
+  //        ubfx w1, w0, #16, #16
+  //        and w0, w0, #ffff
+  // 2) Find loads and stores that can be merged into a single load or store
   //    pair instruction.
   //      e.g.,
   //        ldr x0, [x2]
   //        ldr x1, [x2, #8]
   //        ; becomes
   //        ldp x0, x1, [x2]
-  // 2) Find base register updates that can be merged into the load or store
+  // 3) Find base register updates that can be merged into the load or store
   //    as a base-reg writeback.
   //      e.g.,
   //        ldr x0, [x2]
@@ -998,6 +1192,29 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
   //        ; becomes
   //        ldr x0, [x2], #4
 
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       enableNarrowLdOpt && MBBI != E;) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    // Scaled instructions.
+    case AArch64::LDRHHui:
+    // Unscaled instructions.
+    case AArch64::LDURHHi: {
+      if (tryToMergeLdStInst(MBBI)) {
+        Modified = true;
+        break;
+      }
+      ++MBBI;
+      break;
+    }
+      // FIXME: Do the other instructions.
+    }
+  }
+
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
        MBBI != E;) {
     MachineInstr *MI = MBBI;
@@ -1030,35 +1247,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
     case AArch64::LDURWi:
     case AArch64::LDURXi:
     case AArch64::LDURSWi: {
-      // If this is a volatile load/store, don't mess with it.
-      if (MI->hasOrderedMemoryRef()) {
-        ++MBBI;
-        break;
-      }
-      // Make sure this is a reg+imm (as opposed to an address reloc).
-      if (!getLdStOffsetOp(MI).isImm()) {
-        ++MBBI;
-        break;
-      }
-      // Check if this load/store has a hint to avoid pair formation.
-      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
-      if (TII->isLdStPairSuppressed(MI)) {
-        ++MBBI;
-        break;
-      }
-      // Look ahead up to ScanLimit instructions for a pairable instruction.
-      LdStPairFlags Flags;
-      MachineBasicBlock::iterator Paired =
-          findMatchingInsn(MBBI, Flags, ScanLimit);
-      if (Paired != E) {
-        ++NumPairCreated;
-        if (isUnscaledLdSt(MI))
-          ++NumUnscaledPairCreated;
-
-        // Merge the loads into a pair. Keeping the iterator straight is a
-        // pain, so we let the merge routine tell us what the next instruction
-        // is after it's done mucking about.
-        MBBI = mergePairedInsns(MBBI, Paired, Flags);
+      if (tryToMergeLdStInst(MBBI)) {
         Modified = true;
         break;
       }
@@ -1086,11 +1275,15 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
     case AArch64::STRQui:
     case AArch64::STRXui:
     case AArch64::STRWui:
+    case AArch64::STRHHui:
+    case AArch64::STRBBui:
     case AArch64::LDRSui:
     case AArch64::LDRDui:
     case AArch64::LDRQui:
     case AArch64::LDRXui:
     case AArch64::LDRWui:
+    case AArch64::LDRHHui:
+    case AArch64::LDRBBui:
     // Unscaled instructions.
     case AArch64::STURSi:
     case AArch64::STURDi:
@@ -1104,6 +1297,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
     case AArch64::LDURXi:
     // Paired instructions.
     case AArch64::LDPSi:
+    case AArch64::LDPSWi:
    case AArch64::LDPDi:
     case AArch64::LDPQi:
     case AArch64::LDPWi:
@@ -1152,24 +1346,17 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
         ++NumPreFolded;
         break;
       }
-      // The immediate in the load/store is scaled by the size of the register
-      // being loaded. The immediate in the add we're looking for,
+      // The immediate in the load/store is scaled by the size of the memory
+      // operation. The immediate in the add we're looking for,
       // however, is not, so adjust here.
-      int Value = MI->getOperand(isPairedLdSt(MI) ? 3 : 2).getImm() *
-                  TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
-                      ->getSize();
-
-      // FIXME: The immediate in the load/store should be scaled by the size of
-      // the memory operation, not the size of the register being loaded/stored.
-      // This works in general, but does not work for the LDPSW instruction,
-      // which defines two 64-bit registers, but loads 32-bit values.
+      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
 
       // Look forward to try to find a post-index instruction. For example,
       // ldr x1, [x0, #64]
       // add x0, x0, #64
       // merged into:
       // ldr x1, [x0, #64]!
-      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
+      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
       if (Update != E) {
         // Merge the update into the ld/st.
         MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
@@ -1189,13 +1376,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
   return Modified;
 }
 
+bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
+  const AArch64Subtarget *SubTarget =
+      &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+  bool ProfitableArch = SubTarget->isCortexA57();
+  // FIXME: The benefit from converting narrow loads into a wider load could be
+  // microarchitectural as it assumes that a single load with two bitfield
+  // extracts is cheaper than two narrow loads. Currently, this conversion is
+  // enabled only in cortex-a57 on which performance benefits were verified.
+  return ProfitableArch & (!SubTarget->requiresStrictAlign());
+}
+
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
   TRI = Fn.getSubtarget().getRegisterInfo();
 
   bool Modified = false;
+  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
   for (auto &MBB : Fn)
-    Modified |= optimizeBlock(MBB);
+    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
 
   return Modified;
 }
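
A minimal standalone C++ sketch of the bit-level equivalence the halfword merge relies on, assuming a little-endian data layout; it is not part of the patch, and the helper names (narrowLoads, mergedLoad) and their parameters are illustrative only. UBFMWri with immediates #16 and #31 acts as a ubfx of bits [31:16], and the #15 immediate on ANDWri is the 32-bit logical-immediate encoding of 0xffff, which keeps bits [15:0]:

#include <cstdint>
#include <cstring>

// Before the pass: two adjacent halfword loads.
//   ldrh w0, [x2]
//   ldrh w1, [x2, #2]
static void narrowLoads(const uint8_t *P, uint32_t &W0, uint32_t &W1) {
  uint16_t Lo, Hi;
  std::memcpy(&Lo, P, 2);     // halfword at offset 0
  std::memcpy(&Hi, P + 2, 2); // halfword at offset 2
  W0 = Lo;
  W1 = Hi;
}

// After the pass: one 32-bit load plus the two extracts it emits.
//   ldr  w0, [x2]
//   ubfx w1, w0, #16, #16   (UBFMWri with immr=16, imms=31)
//   and  w0, w0, #0xffff    (ANDWri; the #15 operand encodes 0xffff)
static void mergedLoad(const uint8_t *P, uint32_t &W0, uint32_t &W1) {
  uint32_t W;
  std::memcpy(&W, P, 4);    // single word load
  W1 = (W >> 16) & 0xffffu; // high halfword goes to the second destination
  W0 = W & 0xffffu;         // low halfword goes to the first destination
}

Under the little-endian assumption both functions produce the same W0/W1 for any readable 4-byte P, which is the property the pass depends on when it rewrites the two LDRHHui/LDURHHi instructions into the wider load plus UBFMWri and ANDWri.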