X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FAArch64%2FAArch64LoadStoreOptimizer.cpp;h=6ef4c269d8fe9d4f18dad04b714da518d90160c3;hp=d5a3c69310520ea04b933e729444ecd07c9c3619;hb=de4969d37c17d57e87ce0365071922232dc323e0;hpb=20f59c8f980777a08c8c8768c43c1c69b9f5646b diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index d5a3c693105..6ef4c269d8f 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -41,15 +41,11 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); +STATISTIC(NumSmallTypeMerged, "Number of small type loads merged"); static cl::opt ScanLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); -// Place holder while testing unscaled load/store combining -static cl::opt EnableAArch64UnscaledMemOp( - "aarch64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true)); - namespace llvm { void initializeAArch64LoadStoreOptPass(PassRegistry &); } @@ -109,7 +105,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // pre or post indexed addressing with writeback. Scan forwards. MachineBasicBlock::iterator findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int Value); + int UnscaledOffset); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -117,17 +113,24 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { MachineBasicBlock::iterator findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); - // Merge a pre-index base register update into a ld/st instruction. - MachineBasicBlock::iterator - mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + // Find an instruction that updates the base register of the ld/st + // instruction. + bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + unsigned BaseReg, int Offset); - // Merge a post-index base register update into a ld/st instruction. + // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator - mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, bool IsPreIdx); + + // Find and merge foldable ldr/str instructions. + bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); - bool optimizeBlock(MachineBasicBlock &MBB); + // Check if converting two narrow loads into a single wider load with + // bitfield extracts could be enabled. 
+ bool enableNarrowLdMerge(MachineFunction &Fn); + + bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; @@ -156,6 +159,7 @@ static bool isUnscaledLdSt(unsigned Opc) { case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: + case AArch64::LDURHHi: return true; } } @@ -164,44 +168,68 @@ static bool isUnscaledLdSt(MachineInstr *MI) { return isUnscaledLdSt(MI->getOpcode()); } -// Size in bytes of the data moved by an unscaled load or store -static int getMemSize(MachineInstr *MI) { +static bool isSmallTypeLdMerge(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + return true; + // FIXME: Add other instructions (e.g, LDRBBui, LDURSHWi, LDRSHWui, etc.). + } +} +static bool isSmallTypeLdMerge(MachineInstr *MI) { + return isSmallTypeLdMerge(MI->getOpcode()); +} + +// Scaling factor for unscaled load or store. +static int getMemScale(MachineInstr *MI) { switch (MI->getOpcode()) { default: - llvm_unreachable("Opcode has unknown size!"); + llvm_unreachable("Opcode has unknown scale!"); + case AArch64::LDRBBui: + case AArch64::STRBBui: + return 1; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::STRHHui: + return 2; + case AArch64::LDRSui: + case AArch64::LDURSi: + case AArch64::LDRSWui: + case AArch64::LDURSWi: + case AArch64::LDRWui: + case AArch64::LDURWi: case AArch64::STRSui: case AArch64::STURSi: - return 4; - case AArch64::STRDui: - case AArch64::STURDi: - return 8; - case AArch64::STRQui: - case AArch64::STURQi: - return 16; case AArch64::STRWui: case AArch64::STURWi: - return 4; - case AArch64::STRXui: - case AArch64::STURXi: - return 8; - case AArch64::LDRSui: - case AArch64::LDURSi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::STPSi: + case AArch64::STPWi: return 4; case AArch64::LDRDui: case AArch64::LDURDi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDPDi: + case AArch64::LDPXi: + case AArch64::STPDi: + case AArch64::STPXi: return 8; case AArch64::LDRQui: case AArch64::LDURQi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::LDPQi: + case AArch64::STPQi: return 16; - case AArch64::LDRWui: - case AArch64::LDURWi: - return 4; - case AArch64::LDRXui: - case AArch64::LDURXi: - return 8; - case AArch64::LDRSWui: - case AArch64::LDURSWi: - return 4; } } @@ -234,6 +262,8 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, case AArch64::STURSi: case AArch64::LDRSui: case AArch64::LDURSi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: return Opc; case AArch64::LDRSWui: return AArch64::LDRWui; @@ -279,6 +309,10 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; + case AArch64::LDRHHui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + return AArch64::LDURWi; } } @@ -292,6 +326,10 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::STRDpre; case AArch64::STRQui: return AArch64::STRQpre; + case AArch64::STRBBui: + return AArch64::STRBBpre; + case AArch64::STRHHui: + return AArch64::STRHHpre; case AArch64::STRWui: return AArch64::STRWpre; case AArch64::STRXui: @@ -302,12 +340,38 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::LDRDpre; case AArch64::LDRQui: return AArch64::LDRQpre; + case AArch64::LDRBBui: + return AArch64::LDRBBpre; + case 
AArch64::LDRHHui: + return AArch64::LDRHHpre; case AArch64::LDRWui: return AArch64::LDRWpre; case AArch64::LDRXui: return AArch64::LDRXpre; case AArch64::LDRSWui: return AArch64::LDRSWpre; + case AArch64::LDPSi: + return AArch64::LDPSpre; + case AArch64::LDPSWi: + return AArch64::LDPSWpre; + case AArch64::LDPDi: + return AArch64::LDPDpre; + case AArch64::LDPQi: + return AArch64::LDPQpre; + case AArch64::LDPWi: + return AArch64::LDPWpre; + case AArch64::LDPXi: + return AArch64::LDPXpre; + case AArch64::STPSi: + return AArch64::STPSpre; + case AArch64::STPDi: + return AArch64::STPDpre; + case AArch64::STPQi: + return AArch64::STPQpre; + case AArch64::STPWi: + return AArch64::STPWpre; + case AArch64::STPXi: + return AArch64::STPXpre; } } @@ -321,6 +385,10 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::STRDpost; case AArch64::STRQui: return AArch64::STRQpost; + case AArch64::STRBBui: + return AArch64::STRBBpost; + case AArch64::STRHHui: + return AArch64::STRHHpost; case AArch64::STRWui: return AArch64::STRWpost; case AArch64::STRXui: @@ -331,25 +399,90 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::LDRDpost; case AArch64::LDRQui: return AArch64::LDRQpost; + case AArch64::LDRBBui: + return AArch64::LDRBBpost; + case AArch64::LDRHHui: + return AArch64::LDRHHpost; case AArch64::LDRWui: return AArch64::LDRWpost; case AArch64::LDRXui: return AArch64::LDRXpost; case AArch64::LDRSWui: return AArch64::LDRSWpost; + case AArch64::LDPSi: + return AArch64::LDPSpost; + case AArch64::LDPSWi: + return AArch64::LDPSWpost; + case AArch64::LDPDi: + return AArch64::LDPDpost; + case AArch64::LDPQi: + return AArch64::LDPQpost; + case AArch64::LDPWi: + return AArch64::LDPWpost; + case AArch64::LDPXi: + return AArch64::LDPXpost; + case AArch64::STPSi: + return AArch64::STPSpost; + case AArch64::STPDi: + return AArch64::STPDpost; + case AArch64::STPQi: + return AArch64::STPQpost; + case AArch64::STPWi: + return AArch64::STPWpost; + case AArch64::STPXi: + return AArch64::STPXpost; + } +} + +static bool isPairedLdSt(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + return true; } } -static const MachineOperand &getLdStRegOp(const MachineInstr *MI) { - return MI->getOperand(0); +static const MachineOperand &getLdStRegOp(const MachineInstr *MI, + unsigned PairedRegOp = 0) { + assert(PairedRegOp < 2 && "Unexpected register operand idx."); + unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; + return MI->getOperand(Idx); } static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { - return MI->getOperand(1); + unsigned Idx = isPairedLdSt(MI) ? 2 : 1; + return MI->getOperand(Idx); } static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { - return MI->getOperand(2); + unsigned Idx = isPairedLdSt(MI) ? 3 : 2; + return MI->getOperand(Idx); +} + +// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI. 
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0, + MachineInstr *Op1) { + assert(MI->memoperands_empty() && "expected a new machineinstr"); + size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) + + (Op1->memoperands_end() - Op1->memoperands_begin()); + + MachineFunction *MF = MI->getParent()->getParent(); + MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs); + MachineSDNode::mmo_iterator MemEnd = + std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin); + MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd); + MI->setMemRefs(MemBegin, MemEnd); } MachineBasicBlock::iterator @@ -369,8 +502,7 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, unsigned Opc = SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); bool IsUnscaled = isUnscaledLdSt(Opc); - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; + int OffsetStride = IsUnscaled ? getMemScale(I) : 1; bool MergeForward = Flags.getMergeForward(); unsigned NewOpc = getMatchingPairOpcode(Opc); @@ -397,9 +529,80 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, RtMI = I; Rt2MI = Paired; } - // Handle Unscaled + int OffsetImm = getLdStOffsetOp(RtMI).getImm(); - if (IsUnscaled && EnableAArch64UnscaledMemOp) + + if (isSmallTypeLdMerge(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) + OffsetImm /= 2; + MachineInstr *RtNewDest = MergeForward ? I : Paired; + // Construct the new load instruction. + // FIXME: currently we support only halfword unsigned load. We need to + // handle byte type, signed, and store instructions as well. + MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; + NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + + // Copy MachineMemOperands from the original loads. + concatenateMemOperands(NewMemMI, I, Paired); + + DEBUG( + dbgs() + << "Creating the new load and extract. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG((NewMemMI)->print(dbgs())); + + MachineInstr *ExtDestMI = MergeForward ? Paired : I; + if (ExtDestMI == Rt2MI) { + // Create the bitfield extract for high half. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::UBFMWri)) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(16) + .addImm(31); + // Create the bitfield extract for low half. + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(15); + } else { + // Create the bitfield extract for low half. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(15); + // Create the bitfield extract for high half. 
+ BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::UBFMWri)) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(16) + .addImm(31); + } + DEBUG(dbgs() << " "); + DEBUG((BitExtMI1)->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI2)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + return NextI; + } + + // Handle Unscaled + if (IsUnscaled) OffsetImm /= OffsetStride; // Construct the new instruction. @@ -492,16 +695,12 @@ static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, } static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { - if (!IsUnscaled && (Offset > 63 || Offset < -64)) - return false; - if (IsUnscaled) { - // Convert the byte-offset used by unscaled into an "element" offset used - // by the scaled pair load/store instructions. - int ElemOffset = Offset / OffsetStride; - if (ElemOffset > 63 || ElemOffset < -64) - return false; - } - return true; + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + if (IsUnscaled) + Offset /= OffsetStride; + + return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, @@ -539,8 +738,7 @@ static bool mayAlias(MachineInstr *MIa, /// be combined with the current instruction into a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - LdStPairFlags &Flags, - unsigned Limit) { + LdStPairFlags &Flags, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr *FirstMI = I; @@ -561,9 +759,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // Early exit if the offset if not possible to match. (6 bits of positive // range, plus allow an extra one in case we find a later insn that matches // with Offset-1) - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1; - if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; + if (!isSmallTypeLdMerge(Opc) && + !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) return E; // Track which registers have been modified and used between the first insn @@ -598,6 +796,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, } if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { + assert(MI->mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just @@ -621,29 +820,39 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // If the resultant immediate offset of merging these instructions // is out of range for a pairwise instruction, bail and keep looking. 
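
[Editorial sketch — not part of the patch; the hunk continues below.] The consolidated inBoundsForPair above reflects that pair instructions take a signed 7-bit element offset, so a byte offset coming from an unscaled (LDUR/STUR-style) access is first divided by the element size. A minimal standalone illustration of that check with concrete numbers (the helper name is made up):

    #include <cassert>

    // Mirrors the patch's inBoundsForPair: scale an unscaled byte offset down to
    // an element offset, then require it to fit the signed 7-bit range [-64, 63].
    static bool inBoundsForPairSketch(bool IsUnscaled, int Offset, int OffsetStride) {
      if (IsUnscaled)
        Offset /= OffsetStride;
      return Offset <= 63 && Offset >= -64;
    }

    int main() {
      assert(inBoundsForPairSketch(false, 63, 1));   // largest encodable element offset
      assert(!inBoundsForPairSketch(false, 64, 1));  // one past the range
      assert(inBoundsForPairSketch(true, -512, 8));  // LDURXi at -512 bytes -> element offset -64
      return 0;
    }
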
bool MIIsUnscaled = isUnscaledLdSt(MI); - if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { + bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode()); + if (!IsSmallTypeLd && + !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } - // If the alignment requirements of the paired (scaled) instruction - // can't express the offset of the unscaled input, bail and keep - // looking. - if (IsUnscaled && EnableAArch64UnscaledMemOp && - (alignTo(MinOffset, OffsetStride) != MinOffset)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) + + if (IsSmallTypeLd) { + // If the alignment requirements of the larger type scaled load + // instruction can't express the scaled offset of the smaller type + // input, bail and keep looking. + if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(MI); - continue; + continue; + } + } else { + // If the alignment requirements of the paired (scaled) instruction + // can't express the offset of the unscaled input, bail and keep + // looking. + if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(MI); + continue; + } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. if (MayLoad && Reg == getLdStRegOp(MI).getReg()) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } @@ -663,7 +872,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // first and the second alias with the first, we can combine the first // into the second. if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && - !(FirstMI->mayLoad() && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && + !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && !mayAlias(FirstMI, MemInsns, TII)) { Flags.setMergeForward(true); return MBBI; @@ -694,51 +903,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, } MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == AArch64::ADDXri || - Update->getOpcode() == AArch64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. 
- if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into pre-indexed load / store"); - if (Update->getOpcode() == AArch64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating pre-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( - MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { +AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, + bool IsPreIdx) { assert((Update->getOpcode() == AArch64::ADDXri || Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); @@ -751,20 +918,36 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( int Value = Update->getOperand(2).getImm(); assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into post-indexed load / store"); + "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); if (Update->getOpcode() == AArch64::SUBXri) Value = -Value; - unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value); + unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) + : getPostIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB; + if (!isPairedLdSt(I)) { + // Non-paired instruction. + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value); + } else { + // Paired instruction. 
+ int Scale = getMemScale(I); + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I, 0)) + .addOperand(getLdStRegOp(I, 1)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value / Scale); + } (void)MIB; - DEBUG(dbgs() << "Creating post-indexed load/store."); + if (IsPreIdx) + DEBUG(dbgs() << "Creating pre-indexed load/store."); + else + DEBUG(dbgs() << "Creating post-indexed load/store."); DEBUG(dbgs() << " Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); @@ -780,8 +963,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( return NextI; } -static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, - int Offset) { +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, + MachineInstr *MI, + unsigned BaseReg, int Offset) { switch (MI->getOpcode()) { default: break; @@ -797,44 +981,65 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, // Watch out for 1 << 12 shifted value. if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) break; - // If the instruction has the base register as source and dest and the - // immediate will fit in a signed 9-bit integer, then we have a match. - if (getLdStRegOp(MI).getReg() == BaseReg && - getLdStBaseOp(MI).getReg() == BaseReg && - getLdStOffsetOp(MI).getImm() <= 255 && - getLdStOffsetOp(MI).getImm() >= -256) { - // If we have a non-zero Offset, we check that it matches the amount - // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) - return true; + + // The update instruction source and destination register must be the + // same as the load/store base register. + if (MI->getOperand(0).getReg() != BaseReg || + MI->getOperand(1).getReg() != BaseReg) + break; + + bool IsPairedInsn = isPairedLdSt(MemMI); + int UpdateOffset = MI->getOperand(2).getImm(); + // For non-paired load/store instructions, the immediate must fit in a + // signed 9-bit integer. + if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + break; + + // For paired load/store instructions, the immediate must be a multiple of + // the scaling factor. The scaled offset must also fit into a signed 7-bit + // integer. + if (IsPairedInsn) { + int Scale = getMemScale(MemMI); + if (UpdateOffset % Scale != 0) + break; + + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > 64 || ScaledOffset < -64) + break; } + + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; break; } return false; } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( - MachineBasicBlock::iterator I, unsigned Limit, int Value) { + MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = getLdStRegOp(MemMI).getReg(); unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); - int Offset = getLdStOffsetOp(MemMI).getImm() * - TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); - // If the base register overlaps the destination register, we can't - // merge the update. 
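
[Editorial sketch — not part of the patch; the hunk continues below.] The new isMatchingUpdateInsn above accepts an add/sub writeback candidate only if its immediate is representable by the indexed form: a signed 9-bit byte offset for non-paired loads/stores, or a multiple of the access size whose scaled value fits the bounds checked by the patch for paired ones. A small self-contained restatement of that filter (the helper name is made up):

    #include <cassert>

    static bool updateImmFitsSketch(bool IsPaired, int UpdateOffset, int Scale) {
      if (!IsPaired)
        return UpdateOffset <= 255 && UpdateOffset >= -256; // signed 9-bit byte offset
      if (UpdateOffset % Scale != 0)
        return false;                                       // must be a multiple of the access size
      int ScaledOffset = UpdateOffset / Scale;
      return ScaledOffset <= 64 && ScaledOffset >= -64;     // bounds as written in the patch
    }

    int main() {
      assert(updateImmFitsSketch(false, 255, 1));  // ldr x0, [x2], #255
      assert(!updateImmFitsSketch(false, 256, 1)); // outside the 9-bit range
      assert(updateImmFitsSketch(true, 32, 8));    // ldp x0, x1, [x2], #32 -> scaled imm 4
      assert(!updateImmFitsSketch(true, 12, 8));   // not a multiple of the 8-byte element
      return 0;
    }
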
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + // Scan forward looking for post-index opportunities. Updating instructions + // can't be formed if the memory instruction doesn't have the offset we're + // looking for. + if (MIUnscaledOffset != UnscaledOffset) return E; - // Scan forward looking for post-index opportunities. - // Updating instructions can't be formed if the memory insn already - // has an offset other than the value we're looking for. - if (Offset != Value) - return E; + // If the base register overlaps a destination register, we can't + // merge the update. + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -853,7 +1058,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, Value)) + if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -873,21 +1078,22 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = getLdStRegOp(MemMI).getReg(); unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); int Offset = getLdStOffsetOp(MemMI).getImm(); - unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); // If the load/store is the first instruction in the block, there's obviously // not any matching update. Ditto if the memory offset isn't zero. if (MBBI == B || Offset != 0) return E; - // If the base register overlaps the destination register, we can't + // If the base register overlaps a destination register, we can't // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -906,7 +1112,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) + if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -920,17 +1126,65 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; } -bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { +bool AArch64LoadStoreOpt::tryToMergeLdStInst( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + MachineBasicBlock::iterator E = MI->getParent()->end(); + // If this is a volatile load/store, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Check if this load/store has a hint to avoid pair formation. 
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (TII->isLdStPairSuppressed(MI)) + return false; + + // Look ahead up to ScanLimit instructions for a pairable instruction. + LdStPairFlags Flags; + MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); + if (Paired != E) { + if (isSmallTypeLdMerge(MI)) { + ++NumSmallTypeMerged; + } else { + ++NumPairCreated; + if (isUnscaledLdSt(MI)) + ++NumUnscaledPairCreated; + } + + // Merge the loads into a pair. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = mergePairedInsns(MBBI, Paired, Flags); + return true; + } + return false; +} + +bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, + bool enableNarrowLdOpt) { bool Modified = false; - // Two tranformations to do here: - // 1) Find loads and stores that can be merged into a single load or store + // Three tranformations to do here: + // 1) Find halfword loads that can be merged into a single 32-bit word load + // with bitfield extract instructions. + // e.g., + // ldrh w0, [x2] + // ldrh w1, [x2, #2] + // ; becomes + // ldr w0, [x2] + // ubfx w1, w0, #16, #16 + // and w0, w0, #ffff + // 2) Find loads and stores that can be merged into a single load or store // pair instruction. // e.g., // ldr x0, [x2] // ldr x1, [x2, #8] // ; becomes // ldp x0, x1, [x2] - // 2) Find base register updates that can be merged into the load or store + // 3) Find base register updates that can be merged into the load or store // as a base-reg writeback. // e.g., // ldr x0, [x2] @@ -938,6 +1192,29 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // ; becomes // ldr x0, [x2], #4 + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + enableNarrowLdOpt && MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. + case AArch64::LDRHHui: + // Unscaled instructions. + case AArch64::LDURHHi: { + if (tryToMergeLdStInst(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { MachineInstr *MI = MBBI; @@ -946,6 +1223,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -957,7 +1235,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDRXui: case AArch64::LDRWui: case AArch64::LDRSWui: - // do the unscaled versions as well + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -969,36 +1247,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { - // If this is a volatile load/store, don't mess with it. - if (MI->hasOrderedMemoryRef()) { - ++MBBI; - break; - } - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) { - ++MBBI; - break; - } - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 
- if (TII->isLdStPairSuppressed(MI)) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a pairable instruction. - LdStPairFlags Flags; - MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, Flags, ScanLimit); - if (Paired != E) { - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, Flags); - + if (tryToMergeLdStInst(MBBI)) { Modified = true; - ++NumPairCreated; - if (isUnscaledLdSt(MI)) - ++NumUnscaledPairCreated; break; } ++MBBI; @@ -1019,17 +1269,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: case AArch64::STRXui: case AArch64::STRWui: + case AArch64::STRHHui: + case AArch64::STRBBui: case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - // do the unscaled versions as well + case AArch64::LDRHHui: + case AArch64::LDRBBui: + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -1039,18 +1294,34 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: - case AArch64::LDURXi: { + case AArch64::LDURXi: + // Paired instructions. + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: { // Make sure this is a reg+imm (as opposed to an address reloc). if (!getLdStOffsetOp(MI).isImm()) { ++MBBI; break; } - // Look ahead up to ScanLimit instructions for a mergable instruction. + // Look forward to try to form a post-index instruction. For example, + // ldr x0, [x20] + // add x20, x20, #32 + // merged into: + // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePostIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); Modified = true; ++NumPostFolded; break; @@ -1070,28 +1341,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; } + // The immediate in the load/store is scaled by the size of the memory + // operation. The immediate in the add we're looking for, + // however, is not, so adjust here. + int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); // Look forward to try to find a post-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - - // The immediate in the load/store is scaled by the size of the register - // being loaded. The immediate in the add we're looking for, - // however, is not, so adjust here. 
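
[Editorial sketch — not part of the patch; the hunk continues below.] The rescaling introduced above converts the load/store's scaled immediate back into a byte offset before searching for a matching add/sub: for "ldr x1, [x0, #64]" the stored immediate is 8 and getMemScale returns 8, so UnscaledOffset is 64, which is exactly what a candidate "add x0, x0, #64" must add before the pair can be rewritten as "ldr x1, [x0, #64]!". As a one-line illustration:

    // Sketch only: scaled immediate times access size gives the byte offset the
    // writeback add/sub has to match, e.g. 8 * 8 == 64 for "ldr x1, [x0, #64]".
    static int unscaledOffsetSketch(int ScaledImm, int MemScale) {
      return ScaledImm * MemScale;
    }
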
- int Value = MI->getOperand(2).getImm() * - TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) - ->getSize(); - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); + Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; @@ -1108,13 +1376,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { return Modified; } +bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { + const AArch64Subtarget *SubTarget = + &static_cast(Fn.getSubtarget()); + bool ProfitableArch = SubTarget->isCortexA57(); + // FIXME: The benefit from converting narrow loads into a wider load could be + // microarchitectural as it assumes that a single load with two bitfield + // extracts is cheaper than two narrow loads. Currently, this conversion is + // enabled only in cortex-a57 on which performance benefits were verified. + return ProfitableArch & (!SubTarget->requiresStrictAlign()); +} + bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { TII = static_cast(Fn.getSubtarget().getInstrInfo()); TRI = Fn.getSubtarget().getRegisterInfo(); bool Modified = false; + bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); for (auto &MBB : Fn) - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MBB, enableNarrowLdOpt); return Modified; }
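
[Editorial sketch — not part of the patch.] For reference, a minimal host-side illustration of the equivalence the new narrow-load merge relies on, assuming the usual little-endian layout: one 32-bit load followed by an AND with 0xffff and an unsigned bitfield extract of bits [16,31] yields the same two values as two adjacent halfword loads, which is what the ANDWri / UBFMWri pair built in mergePairedInsns encodes.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // Two adjacent halfwords in memory, as the original ldrh pair would read them.
      const uint8_t Mem[4] = {0x34, 0x12, 0x78, 0x56}; // little-endian 0x1234, 0x5678

      uint16_t Lo16, Hi16;
      std::memcpy(&Lo16, Mem + 0, 2); // ldrh w0, [x2]
      std::memcpy(&Hi16, Mem + 2, 2); // ldrh w1, [x2, #2]

      uint32_t Wide;
      std::memcpy(&Wide, Mem, 4);           // ldr  w0, [x2]
      uint32_t Lo = Wide & 0xffffu;         // and  w0, w0, #0xffff  (ANDWri, encoded imm 15)
      uint32_t Hi = (Wide >> 16) & 0xffffu; // ubfx w1, w0, #16, #16 (UBFMWri 16, 31)

      assert(Lo == Lo16 && Hi == Hi16); // holds on a little-endian host, matching the merge
      return 0;
    }
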