From 8466fa1842ad4f2d6fadcf5c23c15319ae96b972 Mon Sep 17 00:00:00 2001 From: Bob Wilson Date: Mon, 13 Sep 2010 23:01:35 +0000 Subject: [PATCH] Switch all the NEON vld-lane and vst-lane instructions over to the new pseudo-instruction approach. Change ARMExpandPseudoInsts to use a table to record all the NEON load/store information. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@113812 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMExpandPseudoInsts.cpp | 595 +++++++++++++++++------- lib/Target/ARM/ARMISelDAGToDAG.cpp | 231 +++------ lib/Target/ARM/ARMInstrNEON.td | 147 +++++- lib/Target/ARM/NEONPreAllocPass.cpp | 138 ------ 4 files changed, 624 insertions(+), 487 deletions(-) diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 3988a84eed6..38535dc6650 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -24,13 +24,6 @@ using namespace llvm; namespace { class ARMExpandPseudo : public MachineFunctionPass { - // Constants for register spacing in NEON load/store instructions. 
- enum NEONRegSpacing { - SingleSpc, - EvenDblSpc, - OddDblSpc - }; - public: static char ID; ARMExpandPseudo() : MachineFunctionPass(ID) {} @@ -48,10 +41,9 @@ namespace { void TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI); bool ExpandMBB(MachineBasicBlock &MBB); - void ExpandVLD(MachineBasicBlock::iterator &MBBI, unsigned Opc, - bool hasWriteBack, NEONRegSpacing RegSpc, unsigned NumRegs); - void ExpandVST(MachineBasicBlock::iterator &MBBI, unsigned Opc, - bool hasWriteBack, NEONRegSpacing RegSpc, unsigned NumRegs); + void ExpandVLD(MachineBasicBlock::iterator &MBBI); + void ExpandVST(MachineBasicBlock::iterator &MBBI); + void ExpandLaneOp(MachineBasicBlock::iterator &MBBI); }; char ARMExpandPseudo::ID = 0; } @@ -73,37 +65,289 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI, } } +namespace { + // Constants for register spacing in NEON load/store instructions. + // For quad-register load-lane and store-lane pseudo instructors, the + // spacing is initially assumed to be EvenDblSpc, and that is changed to + // OddDblSpc depending on the lane number operand. + enum NEONRegSpacing { + SingleSpc, + EvenDblSpc, + OddDblSpc + }; + + // Entries for NEON load/store information table. The table is sorted by + // PseudoOpc for fast binary-search lookups. + struct NEONLdStTableEntry { + unsigned PseudoOpc; + unsigned RealOpc; + bool IsLoad; + bool HasWriteBack; + NEONRegSpacing RegSpacing; + unsigned char NumRegs; // D registers loaded or stored + unsigned char RegElts; // elements per D register; used for lane ops + + // Comparison methods for binary search of the table. 
+ bool operator<(const NEONLdStTableEntry &TE) const { + return PseudoOpc < TE.PseudoOpc; + } + friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) { + return TE.PseudoOpc < PseudoOpc; + } + friend bool ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc, + const NEONLdStTableEntry &TE) { + return PseudoOpc < TE.PseudoOpc; + } + }; +} + +static const NEONLdStTableEntry NEONLdStTable[] = { +{ ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, SingleSpc, 4, 1 }, +{ ARM::VLD1d64QPseudo_UPD, ARM::VLD1d64Q_UPD, true, true, SingleSpc, 4, 1 }, +{ ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, SingleSpc, 3, 1 }, +{ ARM::VLD1d64TPseudo_UPD, ARM::VLD1d64T_UPD, true, true, SingleSpc, 3, 1 }, + +{ ARM::VLD1q16Pseudo, ARM::VLD1q16, true, false, SingleSpc, 2, 4 }, +{ ARM::VLD1q16Pseudo_UPD, ARM::VLD1q16_UPD, true, true, SingleSpc, 2, 4 }, +{ ARM::VLD1q32Pseudo, ARM::VLD1q32, true, false, SingleSpc, 2, 2 }, +{ ARM::VLD1q32Pseudo_UPD, ARM::VLD1q32_UPD, true, true, SingleSpc, 2, 2 }, +{ ARM::VLD1q64Pseudo, ARM::VLD1q64, true, false, SingleSpc, 2, 1 }, +{ ARM::VLD1q64Pseudo_UPD, ARM::VLD1q64_UPD, true, true, SingleSpc, 2, 1 }, +{ ARM::VLD1q8Pseudo, ARM::VLD1q8, true, false, SingleSpc, 2, 8 }, +{ ARM::VLD1q8Pseudo_UPD, ARM::VLD1q8_UPD, true, true, SingleSpc, 2, 8 }, + +{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, SingleSpc, 2, 4 }, +{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, SingleSpc, 2, 4 }, +{ ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, SingleSpc, 2, 2 }, +{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, SingleSpc, 2, 2 }, +{ ARM::VLD2LNd8Pseudo, ARM::VLD2LNd8, true, false, SingleSpc, 2, 8 }, +{ ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd8_UPD, true, true, SingleSpc, 2, 8 }, +{ ARM::VLD2LNq16Pseudo, ARM::VLD2LNq16, true, false, EvenDblSpc, 2, 4 }, +{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, EvenDblSpc, 2, 4 }, +{ ARM::VLD2LNq32Pseudo, ARM::VLD2LNq32, true, false, EvenDblSpc, 2, 2 }, +{ ARM::VLD2LNq32Pseudo_UPD, 
ARM::VLD2LNq32_UPD, true, true, EvenDblSpc, 2, 2 }, + +{ ARM::VLD2d16Pseudo, ARM::VLD2d16, true, false, SingleSpc, 2, 4 }, +{ ARM::VLD2d16Pseudo_UPD, ARM::VLD2d16_UPD, true, true, SingleSpc, 2, 4 }, +{ ARM::VLD2d32Pseudo, ARM::VLD2d32, true, false, SingleSpc, 2, 2 }, +{ ARM::VLD2d32Pseudo_UPD, ARM::VLD2d32_UPD, true, true, SingleSpc, 2, 2 }, +{ ARM::VLD2d8Pseudo, ARM::VLD2d8, true, false, SingleSpc, 2, 8 }, +{ ARM::VLD2d8Pseudo_UPD, ARM::VLD2d8_UPD, true, true, SingleSpc, 2, 8 }, + +{ ARM::VLD2q16Pseudo, ARM::VLD2q16, true, false, SingleSpc, 4, 4 }, +{ ARM::VLD2q16Pseudo_UPD, ARM::VLD2q16_UPD, true, true, SingleSpc, 4, 4 }, +{ ARM::VLD2q32Pseudo, ARM::VLD2q32, true, false, SingleSpc, 4, 2 }, +{ ARM::VLD2q32Pseudo_UPD, ARM::VLD2q32_UPD, true, true, SingleSpc, 4, 2 }, +{ ARM::VLD2q8Pseudo, ARM::VLD2q8, true, false, SingleSpc, 4, 8 }, +{ ARM::VLD2q8Pseudo_UPD, ARM::VLD2q8_UPD, true, true, SingleSpc, 4, 8 }, + +{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, SingleSpc, 3, 4 }, +{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, SingleSpc, 3, 4 }, +{ ARM::VLD3LNd32Pseudo, ARM::VLD3LNd32, true, false, SingleSpc, 3, 2 }, +{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, SingleSpc, 3, 2 }, +{ ARM::VLD3LNd8Pseudo, ARM::VLD3LNd8, true, false, SingleSpc, 3, 8 }, +{ ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd8_UPD, true, true, SingleSpc, 3, 8 }, +{ ARM::VLD3LNq16Pseudo, ARM::VLD3LNq16, true, false, EvenDblSpc, 3, 4 }, +{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, EvenDblSpc, 3, 4 }, +{ ARM::VLD3LNq32Pseudo, ARM::VLD3LNq32, true, false, EvenDblSpc, 3, 2 }, +{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, EvenDblSpc, 3, 2 }, + +{ ARM::VLD3d16Pseudo, ARM::VLD3d16, true, false, SingleSpc, 3, 4 }, +{ ARM::VLD3d16Pseudo_UPD, ARM::VLD3d16_UPD, true, true, SingleSpc, 3, 4 }, +{ ARM::VLD3d32Pseudo, ARM::VLD3d32, true, false, SingleSpc, 3, 2 }, +{ ARM::VLD3d32Pseudo_UPD, ARM::VLD3d32_UPD, true, true, SingleSpc, 3, 2 }, +{ ARM::VLD3d8Pseudo, 
ARM::VLD3d8, true, false, SingleSpc, 3, 8 }, +{ ARM::VLD3d8Pseudo_UPD, ARM::VLD3d8_UPD, true, true, SingleSpc, 3, 8 }, + +{ ARM::VLD3q16Pseudo_UPD, ARM::VLD3q16_UPD, true, true, EvenDblSpc, 3, 4 }, +{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, OddDblSpc, 3, 4 }, +{ ARM::VLD3q32Pseudo_UPD, ARM::VLD3q32_UPD, true, true, EvenDblSpc, 3, 2 }, +{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, OddDblSpc, 3, 2 }, +{ ARM::VLD3q8Pseudo_UPD, ARM::VLD3q8_UPD, true, true, EvenDblSpc, 3, 8 }, +{ ARM::VLD3q8oddPseudo_UPD, ARM::VLD3q8_UPD, true, true, OddDblSpc, 3, 8 }, + +{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, SingleSpc, 4, 4 }, +{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, SingleSpc, 4, 4 }, +{ ARM::VLD4LNd32Pseudo, ARM::VLD4LNd32, true, false, SingleSpc, 4, 2 }, +{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, SingleSpc, 4, 2 }, +{ ARM::VLD4LNd8Pseudo, ARM::VLD4LNd8, true, false, SingleSpc, 4, 8 }, +{ ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd8_UPD, true, true, SingleSpc, 4, 8 }, +{ ARM::VLD4LNq16Pseudo, ARM::VLD4LNq16, true, false, EvenDblSpc, 4, 4 }, +{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, EvenDblSpc, 4, 4 }, +{ ARM::VLD4LNq32Pseudo, ARM::VLD4LNq32, true, false, EvenDblSpc, 4, 2 }, +{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true, EvenDblSpc, 4, 2 }, + +{ ARM::VLD4d16Pseudo, ARM::VLD4d16, true, false, SingleSpc, 4, 4 }, +{ ARM::VLD4d16Pseudo_UPD, ARM::VLD4d16_UPD, true, true, SingleSpc, 4, 4 }, +{ ARM::VLD4d32Pseudo, ARM::VLD4d32, true, false, SingleSpc, 4, 2 }, +{ ARM::VLD4d32Pseudo_UPD, ARM::VLD4d32_UPD, true, true, SingleSpc, 4, 2 }, +{ ARM::VLD4d8Pseudo, ARM::VLD4d8, true, false, SingleSpc, 4, 8 }, +{ ARM::VLD4d8Pseudo_UPD, ARM::VLD4d8_UPD, true, true, SingleSpc, 4, 8 }, + +{ ARM::VLD4q16Pseudo_UPD, ARM::VLD4q16_UPD, true, true, EvenDblSpc, 4, 4 }, +{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, OddDblSpc, 4, 4 }, +{ ARM::VLD4q32Pseudo_UPD, ARM::VLD4q32_UPD, true, true, 
EvenDblSpc, 4, 2 }, +{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, OddDblSpc, 4, 2 }, +{ ARM::VLD4q8Pseudo_UPD, ARM::VLD4q8_UPD, true, true, EvenDblSpc, 4, 8 }, +{ ARM::VLD4q8oddPseudo_UPD, ARM::VLD4q8_UPD, true, true, OddDblSpc, 4, 8 }, + +{ ARM::VST1d64QPseudo, ARM::VST1d64Q, false, false, SingleSpc, 4, 1 }, +{ ARM::VST1d64QPseudo_UPD, ARM::VST1d64Q_UPD, false, true, SingleSpc, 4, 1 }, +{ ARM::VST1d64TPseudo, ARM::VST1d64T, false, false, SingleSpc, 3, 1 }, +{ ARM::VST1d64TPseudo_UPD, ARM::VST1d64T_UPD, false, true, SingleSpc, 3, 1 }, + +{ ARM::VST1q16Pseudo, ARM::VST1q16, false, false, SingleSpc, 2, 4 }, +{ ARM::VST1q16Pseudo_UPD, ARM::VST1q16_UPD, false, true, SingleSpc, 2, 4 }, +{ ARM::VST1q32Pseudo, ARM::VST1q32, false, false, SingleSpc, 2, 2 }, +{ ARM::VST1q32Pseudo_UPD, ARM::VST1q32_UPD, false, true, SingleSpc, 2, 2 }, +{ ARM::VST1q64Pseudo, ARM::VST1q64, false, false, SingleSpc, 2, 1 }, +{ ARM::VST1q64Pseudo_UPD, ARM::VST1q64_UPD, false, true, SingleSpc, 2, 1 }, +{ ARM::VST1q8Pseudo, ARM::VST1q8, false, false, SingleSpc, 2, 8 }, +{ ARM::VST1q8Pseudo_UPD, ARM::VST1q8_UPD, false, true, SingleSpc, 2, 8 }, + +{ ARM::VST2LNd16Pseudo, ARM::VST2LNd16, false, false, SingleSpc, 2, 4 }, +{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, SingleSpc, 2, 4 }, +{ ARM::VST2LNd32Pseudo, ARM::VST2LNd32, false, false, SingleSpc, 2, 2 }, +{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, SingleSpc, 2, 2 }, +{ ARM::VST2LNd8Pseudo, ARM::VST2LNd8, false, false, SingleSpc, 2, 8 }, +{ ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd8_UPD, false, true, SingleSpc, 2, 8 }, +{ ARM::VST2LNq16Pseudo, ARM::VST2LNq16, false, false, EvenDblSpc, 2, 4}, +{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, EvenDblSpc, 2, 4}, +{ ARM::VST2LNq32Pseudo, ARM::VST2LNq32, false, false, EvenDblSpc, 2, 2}, +{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, EvenDblSpc, 2, 2}, + +{ ARM::VST2d16Pseudo, ARM::VST2d16, false, false, SingleSpc, 2, 4 }, +{ 
ARM::VST2d16Pseudo_UPD, ARM::VST2d16_UPD, false, true, SingleSpc, 2, 4 }, +{ ARM::VST2d32Pseudo, ARM::VST2d32, false, false, SingleSpc, 2, 2 }, +{ ARM::VST2d32Pseudo_UPD, ARM::VST2d32_UPD, false, true, SingleSpc, 2, 2 }, +{ ARM::VST2d8Pseudo, ARM::VST2d8, false, false, SingleSpc, 2, 8 }, +{ ARM::VST2d8Pseudo_UPD, ARM::VST2d8_UPD, false, true, SingleSpc, 2, 8 }, + +{ ARM::VST2q16Pseudo, ARM::VST2q16, false, false, SingleSpc, 4, 4 }, +{ ARM::VST2q16Pseudo_UPD, ARM::VST2q16_UPD, false, true, SingleSpc, 4, 4 }, +{ ARM::VST2q32Pseudo, ARM::VST2q32, false, false, SingleSpc, 4, 2 }, +{ ARM::VST2q32Pseudo_UPD, ARM::VST2q32_UPD, false, true, SingleSpc, 4, 2 }, +{ ARM::VST2q8Pseudo, ARM::VST2q8, false, false, SingleSpc, 4, 8 }, +{ ARM::VST2q8Pseudo_UPD, ARM::VST2q8_UPD, false, true, SingleSpc, 4, 8 }, + +{ ARM::VST3LNd16Pseudo, ARM::VST3LNd16, false, false, SingleSpc, 3, 4 }, +{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, SingleSpc, 3, 4 }, +{ ARM::VST3LNd32Pseudo, ARM::VST3LNd32, false, false, SingleSpc, 3, 2 }, +{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, SingleSpc, 3, 2 }, +{ ARM::VST3LNd8Pseudo, ARM::VST3LNd8, false, false, SingleSpc, 3, 8 }, +{ ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd8_UPD, false, true, SingleSpc, 3, 8 }, +{ ARM::VST3LNq16Pseudo, ARM::VST3LNq16, false, false, EvenDblSpc, 3, 4}, +{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, EvenDblSpc, 3, 4}, +{ ARM::VST3LNq32Pseudo, ARM::VST3LNq32, false, false, EvenDblSpc, 3, 2}, +{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, EvenDblSpc, 3, 2}, + +{ ARM::VST3d16Pseudo, ARM::VST3d16, false, false, SingleSpc, 3, 4 }, +{ ARM::VST3d16Pseudo_UPD, ARM::VST3d16_UPD, false, true, SingleSpc, 3, 4 }, +{ ARM::VST3d32Pseudo, ARM::VST3d32, false, false, SingleSpc, 3, 2 }, +{ ARM::VST3d32Pseudo_UPD, ARM::VST3d32_UPD, false, true, SingleSpc, 3, 2 }, +{ ARM::VST3d8Pseudo, ARM::VST3d8, false, false, SingleSpc, 3, 8 }, +{ ARM::VST3d8Pseudo_UPD, ARM::VST3d8_UPD, false, true, 
SingleSpc, 3, 8 }, + +{ ARM::VST3q16Pseudo_UPD, ARM::VST3q16_UPD, false, true, EvenDblSpc, 3, 4 }, +{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, OddDblSpc, 3, 4 }, +{ ARM::VST3q32Pseudo_UPD, ARM::VST3q32_UPD, false, true, EvenDblSpc, 3, 2 }, +{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, OddDblSpc, 3, 2 }, +{ ARM::VST3q8Pseudo_UPD, ARM::VST3q8_UPD, false, true, EvenDblSpc, 3, 8 }, +{ ARM::VST3q8oddPseudo_UPD, ARM::VST3q8_UPD, false, true, OddDblSpc, 3, 8 }, + +{ ARM::VST4LNd16Pseudo, ARM::VST4LNd16, false, false, SingleSpc, 4, 4 }, +{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, SingleSpc, 4, 4 }, +{ ARM::VST4LNd32Pseudo, ARM::VST4LNd32, false, false, SingleSpc, 4, 2 }, +{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, SingleSpc, 4, 2 }, +{ ARM::VST4LNd8Pseudo, ARM::VST4LNd8, false, false, SingleSpc, 4, 8 }, +{ ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd8_UPD, false, true, SingleSpc, 4, 8 }, +{ ARM::VST4LNq16Pseudo, ARM::VST4LNq16, false, false, EvenDblSpc, 4, 4}, +{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, EvenDblSpc, 4, 4}, +{ ARM::VST4LNq32Pseudo, ARM::VST4LNq32, false, false, EvenDblSpc, 4, 2}, +{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, EvenDblSpc, 4, 2}, + +{ ARM::VST4d16Pseudo, ARM::VST4d16, false, false, SingleSpc, 4, 4 }, +{ ARM::VST4d16Pseudo_UPD, ARM::VST4d16_UPD, false, true, SingleSpc, 4, 4 }, +{ ARM::VST4d32Pseudo, ARM::VST4d32, false, false, SingleSpc, 4, 2 }, +{ ARM::VST4d32Pseudo_UPD, ARM::VST4d32_UPD, false, true, SingleSpc, 4, 2 }, +{ ARM::VST4d8Pseudo, ARM::VST4d8, false, false, SingleSpc, 4, 8 }, +{ ARM::VST4d8Pseudo_UPD, ARM::VST4d8_UPD, false, true, SingleSpc, 4, 8 }, + +{ ARM::VST4q16Pseudo_UPD, ARM::VST4q16_UPD, false, true, EvenDblSpc, 4, 4 }, +{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, OddDblSpc, 4, 4 }, +{ ARM::VST4q32Pseudo_UPD, ARM::VST4q32_UPD, false, true, EvenDblSpc, 4, 2 }, +{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, 
true, OddDblSpc, 4, 2 }, +{ ARM::VST4q8Pseudo_UPD, ARM::VST4q8_UPD, false, true, EvenDblSpc, 4, 8 }, +{ ARM::VST4q8oddPseudo_UPD , ARM::VST4q8_UPD, false, true, OddDblSpc, 4, 8 } +}; + +/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON +/// load or store pseudo instruction. +static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { + unsigned NumEntries = array_lengthof(NEONLdStTable); + +#ifndef NDEBUG + // Make sure the table is sorted. + static bool TableChecked = false; + if (!TableChecked) { + for (unsigned i = 0; i != NumEntries-1; ++i) + assert(NEONLdStTable[i] < NEONLdStTable[i+1] && + "NEONLdStTable is not sorted!"); + TableChecked = true; + } +#endif + + const NEONLdStTableEntry *I = + std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode); + if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode) + return I; + return NULL; +} + +/// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register, +/// corresponding to the specified register spacing. Not all of the results +/// are necessarily valid, e.g., a Q register only has 2 D subregisters. 
+static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc, + const TargetRegisterInfo *TRI, unsigned &D0, + unsigned &D1, unsigned &D2, unsigned &D3) { + if (RegSpc == SingleSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_0); + D1 = TRI->getSubReg(Reg, ARM::dsub_1); + D2 = TRI->getSubReg(Reg, ARM::dsub_2); + D3 = TRI->getSubReg(Reg, ARM::dsub_3); + } else if (RegSpc == EvenDblSpc) { + D0 = TRI->getSubReg(Reg, ARM::dsub_0); + D1 = TRI->getSubReg(Reg, ARM::dsub_2); + D2 = TRI->getSubReg(Reg, ARM::dsub_4); + D3 = TRI->getSubReg(Reg, ARM::dsub_6); + } else { + assert(RegSpc == OddDblSpc && "unknown register spacing"); + D0 = TRI->getSubReg(Reg, ARM::dsub_1); + D1 = TRI->getSubReg(Reg, ARM::dsub_3); + D2 = TRI->getSubReg(Reg, ARM::dsub_5); + D3 = TRI->getSubReg(Reg, ARM::dsub_7); + } +} + /// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register /// operands to real VLD instructions with D register operands. -void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI, - unsigned Opc, bool hasWriteBack, - NEONRegSpacing RegSpc, unsigned NumRegs) { +void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; MachineBasicBlock &MBB = *MI.getParent(); - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); unsigned OpIdx = 0; bool DstIsDead = MI.getOperand(OpIdx).isDead(); unsigned DstReg = MI.getOperand(OpIdx++).getReg(); unsigned D0, D1, D2, D3; - if (RegSpc == SingleSpc) { - D0 = TRI->getSubReg(DstReg, ARM::dsub_0); - D1 = TRI->getSubReg(DstReg, ARM::dsub_1); - D2 = TRI->getSubReg(DstReg, ARM::dsub_2); - D3 = TRI->getSubReg(DstReg, ARM::dsub_3); - } 
else if (RegSpc == EvenDblSpc) { - D0 = TRI->getSubReg(DstReg, ARM::dsub_0); - D1 = TRI->getSubReg(DstReg, ARM::dsub_2); - D2 = TRI->getSubReg(DstReg, ARM::dsub_4); - D3 = TRI->getSubReg(DstReg, ARM::dsub_6); - } else { - assert(RegSpc == OddDblSpc && "unknown register spacing for VLD"); - D0 = TRI->getSubReg(DstReg, ARM::dsub_1); - D1 = TRI->getSubReg(DstReg, ARM::dsub_3); - D2 = TRI->getSubReg(DstReg, ARM::dsub_5); - D3 = TRI->getSubReg(DstReg, ARM::dsub_7); - } + GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); if (NumRegs > 2) @@ -111,14 +355,14 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI, if (NumRegs > 3) MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); - if (hasWriteBack) + if (TableEntry->HasWriteBack) MIB.addOperand(MI.getOperand(OpIdx++)); // Copy the addrmode6 operands. MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); // Copy the am6offset operand. - if (hasWriteBack) + if (TableEntry->HasWriteBack) MIB.addOperand(MI.getOperand(OpIdx++)); MIB = AddDefaultPred(MIB); @@ -138,45 +382,32 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI, /// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register /// operands to real VST instructions with D register operands. 
-void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI, - unsigned Opc, bool hasWriteBack, - NEONRegSpacing RegSpc, unsigned NumRegs) { +void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; MachineBasicBlock &MBB = *MI.getParent(); - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); unsigned OpIdx = 0; - if (hasWriteBack) + if (TableEntry->HasWriteBack) MIB.addOperand(MI.getOperand(OpIdx++)); // Copy the addrmode6 operands. MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); // Copy the am6offset operand. - if (hasWriteBack) + if (TableEntry->HasWriteBack) MIB.addOperand(MI.getOperand(OpIdx++)); bool SrcIsKill = MI.getOperand(OpIdx).isKill(); unsigned SrcReg = MI.getOperand(OpIdx).getReg(); unsigned D0, D1, D2, D3; - if (RegSpc == SingleSpc) { - D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); - D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); - D2 = TRI->getSubReg(SrcReg, ARM::dsub_2); - D3 = TRI->getSubReg(SrcReg, ARM::dsub_3); - } else if (RegSpc == EvenDblSpc) { - D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); - D1 = TRI->getSubReg(SrcReg, ARM::dsub_2); - D2 = TRI->getSubReg(SrcReg, ARM::dsub_4); - D3 = TRI->getSubReg(SrcReg, ARM::dsub_6); - } else { - assert(RegSpc == OddDblSpc && "unknown register spacing for VST"); - D0 = TRI->getSubReg(SrcReg, ARM::dsub_1); - D1 = TRI->getSubReg(SrcReg, ARM::dsub_3); - D2 = TRI->getSubReg(SrcReg, ARM::dsub_5); - D3 = TRI->getSubReg(SrcReg, ARM::dsub_7); - } - + GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3); MIB.addReg(D0).addReg(D1); if (NumRegs > 2) MIB.addReg(D2); @@ -190,6 +421,85 @@ void 
ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI, MI.eraseFromParent(); } +/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ +/// register operands to real instructions with D register operands. +void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode()); + assert(TableEntry && "NEONLdStTable lookup failed"); + NEONRegSpacing RegSpc = TableEntry->RegSpacing; + unsigned NumRegs = TableEntry->NumRegs; + unsigned RegElts = TableEntry->RegElts; + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(TableEntry->RealOpc)); + unsigned OpIdx = 0; + // The lane operand is always the 3rd from last operand, before the 2 + // predicate operands. + unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm(); + + // Adjust the lane and spacing as needed for Q registers. + assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane"); + if (RegSpc == EvenDblSpc && Lane >= RegElts) { + RegSpc = OddDblSpc; + Lane -= RegElts; + } + assert(Lane < RegElts && "out of range lane for VLD/VST-lane"); + + unsigned DstReg, D0, D1, D2, D3; + bool DstIsDead; + if (TableEntry->IsLoad) { + DstIsDead = MI.getOperand(OpIdx).isDead(); + DstReg = MI.getOperand(OpIdx++).getReg(); + GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3); + MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(D1, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 2) + MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead)); + if (NumRegs > 3) + MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead)); + } + + if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Copy the addrmode6 operands. + MIB.addOperand(MI.getOperand(OpIdx++)); + MIB.addOperand(MI.getOperand(OpIdx++)); + // Copy the am6offset operand. 
+ if (TableEntry->HasWriteBack) + MIB.addOperand(MI.getOperand(OpIdx++)); + + // Grab the super-register source. + MachineOperand MO = MI.getOperand(OpIdx++); + if (!TableEntry->IsLoad) + GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3); + + // Add the subregs as sources of the new instruction. + unsigned SrcFlags = (getUndefRegState(MO.isUndef()) | + getKillRegState(MO.isKill())); + MIB.addReg(D0, SrcFlags).addReg(D1, SrcFlags); + if (NumRegs > 2) + MIB.addReg(D2, SrcFlags); + if (NumRegs > 3) + MIB.addReg(D3, SrcFlags); + + // Add the lane number operand. + MIB.addImm(Lane); + + MIB = AddDefaultPred(MIB); + // Copy the super-register source to be an implicit source. + MO.setImplicit(true); + MIB.addOperand(MO); + if (TableEntry->IsLoad) + // Add an implicit def for the super-register. + MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead)); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); +} + bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { bool Modified = false; @@ -292,204 +602,169 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { } case ARM::VLD1q8Pseudo: - ExpandVLD(MBBI, ARM::VLD1q8, false, SingleSpc, 2); break; case ARM::VLD1q16Pseudo: - ExpandVLD(MBBI, ARM::VLD1q16, false, SingleSpc, 2); break; case ARM::VLD1q32Pseudo: - ExpandVLD(MBBI, ARM::VLD1q32, false, SingleSpc, 2); break; case ARM::VLD1q64Pseudo: - ExpandVLD(MBBI, ARM::VLD1q64, false, SingleSpc, 2); break; case ARM::VLD1q8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1q8, true, SingleSpc, 2); break; case ARM::VLD1q16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1q16, true, SingleSpc, 2); break; case ARM::VLD1q32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1q32, true, SingleSpc, 2); break; case ARM::VLD1q64Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1q64, true, SingleSpc, 2); break; - case ARM::VLD2d8Pseudo: - ExpandVLD(MBBI, ARM::VLD2d8, false, SingleSpc, 2); break; case ARM::VLD2d16Pseudo: - ExpandVLD(MBBI, ARM::VLD2d16, false, SingleSpc, 2); break; case ARM::VLD2d32Pseudo: 
- ExpandVLD(MBBI, ARM::VLD2d32, false, SingleSpc, 2); break; case ARM::VLD2q8Pseudo: - ExpandVLD(MBBI, ARM::VLD2q8, false, SingleSpc, 4); break; case ARM::VLD2q16Pseudo: - ExpandVLD(MBBI, ARM::VLD2q16, false, SingleSpc, 4); break; case ARM::VLD2q32Pseudo: - ExpandVLD(MBBI, ARM::VLD2q32, false, SingleSpc, 4); break; case ARM::VLD2d8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2d8, true, SingleSpc, 2); break; case ARM::VLD2d16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2d16, true, SingleSpc, 2); break; case ARM::VLD2d32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2d32, true, SingleSpc, 2); break; case ARM::VLD2q8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2q8, true, SingleSpc, 4); break; case ARM::VLD2q16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2q16, true, SingleSpc, 4); break; case ARM::VLD2q32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD2q32, true, SingleSpc, 4); break; - case ARM::VLD3d8Pseudo: - ExpandVLD(MBBI, ARM::VLD3d8, false, SingleSpc, 3); break; case ARM::VLD3d16Pseudo: - ExpandVLD(MBBI, ARM::VLD3d16, false, SingleSpc, 3); break; case ARM::VLD3d32Pseudo: - ExpandVLD(MBBI, ARM::VLD3d32, false, SingleSpc, 3); break; case ARM::VLD1d64TPseudo: - ExpandVLD(MBBI, ARM::VLD1d64T, false, SingleSpc, 3); break; case ARM::VLD3d8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3d8_UPD, true, SingleSpc, 3); break; case ARM::VLD3d16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3d16_UPD, true, SingleSpc, 3); break; case ARM::VLD3d32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3d32_UPD, true, SingleSpc, 3); break; case ARM::VLD1d64TPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1d64T_UPD, true, SingleSpc, 3); break; case ARM::VLD3q8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q8_UPD, true, EvenDblSpc, 3); break; case ARM::VLD3q16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q16_UPD, true, EvenDblSpc, 3); break; case ARM::VLD3q32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q32_UPD, true, EvenDblSpc, 3); break; case ARM::VLD3q8oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q8_UPD, true, OddDblSpc, 3); break; case ARM::VLD3q16oddPseudo_UPD: - ExpandVLD(MBBI, 
ARM::VLD3q16_UPD, true, OddDblSpc, 3); break; case ARM::VLD3q32oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD3q32_UPD, true, OddDblSpc, 3); break; - case ARM::VLD4d8Pseudo: - ExpandVLD(MBBI, ARM::VLD4d8, false, SingleSpc, 4); break; case ARM::VLD4d16Pseudo: - ExpandVLD(MBBI, ARM::VLD4d16, false, SingleSpc, 4); break; case ARM::VLD4d32Pseudo: - ExpandVLD(MBBI, ARM::VLD4d32, false, SingleSpc, 4); break; case ARM::VLD1d64QPseudo: - ExpandVLD(MBBI, ARM::VLD1d64Q, false, SingleSpc, 4); break; case ARM::VLD4d8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4d8_UPD, true, SingleSpc, 4); break; case ARM::VLD4d16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4d16_UPD, true, SingleSpc, 4); break; case ARM::VLD4d32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4d32_UPD, true, SingleSpc, 4); break; case ARM::VLD1d64QPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD1d64Q_UPD, true, SingleSpc, 4); break; case ARM::VLD4q8Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q8_UPD, true, EvenDblSpc, 4); break; case ARM::VLD4q16Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q16_UPD, true, EvenDblSpc, 4); break; case ARM::VLD4q32Pseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q32_UPD, true, EvenDblSpc, 4); break; case ARM::VLD4q8oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q8_UPD, true, OddDblSpc, 4); break; case ARM::VLD4q16oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q16_UPD, true, OddDblSpc, 4); break; case ARM::VLD4q32oddPseudo_UPD: - ExpandVLD(MBBI, ARM::VLD4q32_UPD, true, OddDblSpc, 4); break; + ExpandVLD(MBBI); + break; case ARM::VST1q8Pseudo: - ExpandVST(MBBI, ARM::VST1q8, false, SingleSpc, 2); break; case ARM::VST1q16Pseudo: - ExpandVST(MBBI, ARM::VST1q16, false, SingleSpc, 2); break; case ARM::VST1q32Pseudo: - ExpandVST(MBBI, ARM::VST1q32, false, SingleSpc, 2); break; case ARM::VST1q64Pseudo: - ExpandVST(MBBI, ARM::VST1q64, false, SingleSpc, 2); break; case ARM::VST1q8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST1q8_UPD, true, SingleSpc, 2); break; case ARM::VST1q16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST1q16_UPD, true, SingleSpc, 2); break; case 
ARM::VST1q32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST1q32_UPD, true, SingleSpc, 2); break; case ARM::VST1q64Pseudo_UPD: - ExpandVST(MBBI, ARM::VST1q64_UPD, true, SingleSpc, 2); break; - case ARM::VST2d8Pseudo: - ExpandVST(MBBI, ARM::VST2d8, false, SingleSpc, 2); break; case ARM::VST2d16Pseudo: - ExpandVST(MBBI, ARM::VST2d16, false, SingleSpc, 2); break; case ARM::VST2d32Pseudo: - ExpandVST(MBBI, ARM::VST2d32, false, SingleSpc, 2); break; case ARM::VST2q8Pseudo: - ExpandVST(MBBI, ARM::VST2q8, false, SingleSpc, 4); break; case ARM::VST2q16Pseudo: - ExpandVST(MBBI, ARM::VST2q16, false, SingleSpc, 4); break; case ARM::VST2q32Pseudo: - ExpandVST(MBBI, ARM::VST2q32, false, SingleSpc, 4); break; case ARM::VST2d8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2d8_UPD, true, SingleSpc, 2); break; case ARM::VST2d16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2d16_UPD, true, SingleSpc, 2); break; case ARM::VST2d32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2d32_UPD, true, SingleSpc, 2); break; case ARM::VST2q8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2q8_UPD, true, SingleSpc, 4); break; case ARM::VST2q16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2q16_UPD, true, SingleSpc, 4); break; case ARM::VST2q32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST2q32_UPD, true, SingleSpc, 4); break; - case ARM::VST3d8Pseudo: - ExpandVST(MBBI, ARM::VST3d8, false, SingleSpc, 3); break; case ARM::VST3d16Pseudo: - ExpandVST(MBBI, ARM::VST3d16, false, SingleSpc, 3); break; case ARM::VST3d32Pseudo: - ExpandVST(MBBI, ARM::VST3d32, false, SingleSpc, 3); break; case ARM::VST1d64TPseudo: - ExpandVST(MBBI, ARM::VST1d64T, false, SingleSpc, 3); break; case ARM::VST3d8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3d8_UPD, true, SingleSpc, 3); break; case ARM::VST3d16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3d16_UPD, true, SingleSpc, 3); break; case ARM::VST3d32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3d32_UPD, true, SingleSpc, 3); break; case ARM::VST1d64TPseudo_UPD: - ExpandVST(MBBI, ARM::VST1d64T_UPD, true, SingleSpc, 3); break; case ARM::VST3q8Pseudo_UPD: - 
ExpandVST(MBBI, ARM::VST3q8_UPD, true, EvenDblSpc, 3); break; case ARM::VST3q16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3q16_UPD, true, EvenDblSpc, 3); break; case ARM::VST3q32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST3q32_UPD, true, EvenDblSpc, 3); break; case ARM::VST3q8oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST3q8_UPD, true, OddDblSpc, 3); break; case ARM::VST3q16oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST3q16_UPD, true, OddDblSpc, 3); break; case ARM::VST3q32oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST3q32_UPD, true, OddDblSpc, 3); break; - case ARM::VST4d8Pseudo: - ExpandVST(MBBI, ARM::VST4d8, false, SingleSpc, 4); break; case ARM::VST4d16Pseudo: - ExpandVST(MBBI, ARM::VST4d16, false, SingleSpc, 4); break; case ARM::VST4d32Pseudo: - ExpandVST(MBBI, ARM::VST4d32, false, SingleSpc, 4); break; case ARM::VST1d64QPseudo: - ExpandVST(MBBI, ARM::VST1d64Q, false, SingleSpc, 4); break; case ARM::VST4d8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4d8_UPD, true, SingleSpc, 4); break; case ARM::VST4d16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4d16_UPD, true, SingleSpc, 4); break; case ARM::VST4d32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4d32_UPD, true, SingleSpc, 4); break; case ARM::VST1d64QPseudo_UPD: - ExpandVST(MBBI, ARM::VST1d64Q_UPD, true, SingleSpc, 4); break; case ARM::VST4q8Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4q8_UPD, true, EvenDblSpc, 4); break; case ARM::VST4q16Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4q16_UPD, true, EvenDblSpc, 4); break; case ARM::VST4q32Pseudo_UPD: - ExpandVST(MBBI, ARM::VST4q32_UPD, true, EvenDblSpc, 4); break; case ARM::VST4q8oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST4q8_UPD, true, OddDblSpc, 4); break; case ARM::VST4q16oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST4q16_UPD, true, OddDblSpc, 4); break; case ARM::VST4q32oddPseudo_UPD: - ExpandVST(MBBI, ARM::VST4q32_UPD, true, OddDblSpc, 4); break; + ExpandVST(MBBI); + break; + + case ARM::VLD2LNd8Pseudo: + case ARM::VLD2LNd16Pseudo: + case ARM::VLD2LNd32Pseudo: + case ARM::VLD2LNq16Pseudo: + case ARM::VLD2LNq32Pseudo: + 
case ARM::VLD2LNd8Pseudo_UPD: + case ARM::VLD2LNd16Pseudo_UPD: + case ARM::VLD2LNd32Pseudo_UPD: + case ARM::VLD2LNq16Pseudo_UPD: + case ARM::VLD2LNq32Pseudo_UPD: + case ARM::VLD3LNd8Pseudo: + case ARM::VLD3LNd16Pseudo: + case ARM::VLD3LNd32Pseudo: + case ARM::VLD3LNq16Pseudo: + case ARM::VLD3LNq32Pseudo: + case ARM::VLD3LNd8Pseudo_UPD: + case ARM::VLD3LNd16Pseudo_UPD: + case ARM::VLD3LNd32Pseudo_UPD: + case ARM::VLD3LNq16Pseudo_UPD: + case ARM::VLD3LNq32Pseudo_UPD: + case ARM::VLD4LNd8Pseudo: + case ARM::VLD4LNd16Pseudo: + case ARM::VLD4LNd32Pseudo: + case ARM::VLD4LNq16Pseudo: + case ARM::VLD4LNq32Pseudo: + case ARM::VLD4LNd8Pseudo_UPD: + case ARM::VLD4LNd16Pseudo_UPD: + case ARM::VLD4LNd32Pseudo_UPD: + case ARM::VLD4LNq16Pseudo_UPD: + case ARM::VLD4LNq32Pseudo_UPD: + case ARM::VST2LNd8Pseudo: + case ARM::VST2LNd16Pseudo: + case ARM::VST2LNd32Pseudo: + case ARM::VST2LNq16Pseudo: + case ARM::VST2LNq32Pseudo: + case ARM::VST2LNd8Pseudo_UPD: + case ARM::VST2LNd16Pseudo_UPD: + case ARM::VST2LNd32Pseudo_UPD: + case ARM::VST2LNq16Pseudo_UPD: + case ARM::VST2LNq32Pseudo_UPD: + case ARM::VST3LNd8Pseudo: + case ARM::VST3LNd16Pseudo: + case ARM::VST3LNd32Pseudo: + case ARM::VST3LNq16Pseudo: + case ARM::VST3LNq32Pseudo: + case ARM::VST3LNd8Pseudo_UPD: + case ARM::VST3LNd16Pseudo_UPD: + case ARM::VST3LNd32Pseudo_UPD: + case ARM::VST3LNq16Pseudo_UPD: + case ARM::VST3LNq32Pseudo_UPD: + case ARM::VST4LNd8Pseudo: + case ARM::VST4LNd16Pseudo: + case ARM::VST4LNd32Pseudo: + case ARM::VST4LNq16Pseudo: + case ARM::VST4LNq32Pseudo: + case ARM::VST4LNd8Pseudo_UPD: + case ARM::VST4LNd16Pseudo_UPD: + case ARM::VST4LNd32Pseudo_UPD: + case ARM::VST4LNq16Pseudo_UPD: + case ARM::VST4LNq32Pseudo_UPD: + ExpandLaneOp(MBBI); + break; } if (ModifiedOp) diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 456311ff25e..a477344f375 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -151,10 +151,9 @@ private: /// 
SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should /// be 2, 3 or 4. The opcode arrays specify the instructions used for - /// load/store of D registers and even subregs and odd subregs of Q registers. + /// load/store of D registers and Q registers. SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs, - unsigned *DOpcodes, unsigned *QOpcodes0, - unsigned *QOpcodes1); + unsigned *DOpcodes, unsigned *QOpcodes); /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be @@ -196,10 +195,6 @@ private: SDNode *QuadSRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); SDNode *QuadDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); SDNode *QuadQRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3); - - // Form sequences of 8 consecutive D registers. - SDNode *OctoDRegs(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3, - SDValue V4, SDValue V5, SDValue V6, SDValue V7); }; } @@ -1015,39 +1010,6 @@ SDNode *ARMDAGToDAGISel::QuadQRegs(EVT VT, SDValue V0, SDValue V1, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 8); } -/// OctoDRegs - Form 8 consecutive D registers. 
-/// -SDNode *ARMDAGToDAGISel::OctoDRegs(EVT VT, SDValue V0, SDValue V1, - SDValue V2, SDValue V3, - SDValue V4, SDValue V5, - SDValue V6, SDValue V7) { - DebugLoc dl = V0.getNode()->getDebugLoc(); - SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32); - SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32); - SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32); - SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32); - SDValue SubReg4 = CurDAG->getTargetConstant(ARM::dsub_4, MVT::i32); - SDValue SubReg5 = CurDAG->getTargetConstant(ARM::dsub_5, MVT::i32); - SDValue SubReg6 = CurDAG->getTargetConstant(ARM::dsub_6, MVT::i32); - SDValue SubReg7 = CurDAG->getTargetConstant(ARM::dsub_7, MVT::i32); - const SDValue Ops[] ={ V0, SubReg0, V1, SubReg1, V2, SubReg2, V3, SubReg3, - V4, SubReg4, V5, SubReg5, V6, SubReg6, V7, SubReg7 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 16); -} - -/// GetNEONSubregVT - Given a type for a 128-bit NEON vector, return the type -/// for a 64-bit subregister of the vector. 
-static EVT GetNEONSubregVT(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("unhandled NEON type"); - case MVT::v16i8: return MVT::v8i8; - case MVT::v8i16: return MVT::v4i16; - case MVT::v4f32: return MVT::v2f32; - case MVT::v4i32: return MVT::v2i32; - case MVT::v2i64: return MVT::v1i64; - } -} - SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes, unsigned *QOpcodes0, unsigned *QOpcodes1) { @@ -1281,8 +1243,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs, unsigned *DOpcodes, - unsigned *QOpcodes0, - unsigned *QOpcodes1) { + unsigned *QOpcodes) { assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); DebugLoc dl = N->getDebugLoc(); @@ -1296,16 +1257,6 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType(); bool is64BitVector = VT.is64BitVector(); - // Quad registers are handled by load/store of subregs. Find the subreg info. - unsigned NumElts = 0; - bool Even = false; - EVT RegVT = VT; - if (!is64BitVector) { - RegVT = GetNEONSubregVT(VT); - NumElts = RegVT.getVectorNumElements(); - Even = Lane < NumElts; - } - unsigned OpcodeIndex; switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("unhandled vld/vst lane type"); @@ -1323,121 +1274,59 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, SDValue Pred = getAL(CurDAG); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); - SmallVector Ops; + SmallVector Ops; Ops.push_back(MemAddr); Ops.push_back(Align); - unsigned Opc = 0; - if (is64BitVector) { - Opc = DOpcodes[OpcodeIndex]; - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? 
SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } + unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] : + QOpcodes[OpcodeIndex]); - // Now extract the D registers back out. - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, RegSeq)); - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_1, dl, VT, RegSeq)); - if (NumVecs > 2) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_2, dl, VT,RegSeq)); - if (NumVecs > 3) - Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT,RegSeq)); + SDValue SuperReg; + SDValue V0 = N->getOperand(0+3); + SDValue V1 = N->getOperand(1+3); + if (NumVecs == 2) { + if (is64BitVector) + SuperReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); + else + SuperReg = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); } else { - // Check if this is loading the even or odd subreg of a Q register. - if (Lane < NumElts) { - Opc = QOpcodes0[OpcodeIndex]; - } else { - Lane -= NumElts; - Opc = QOpcodes1[OpcodeIndex]; - } - - SDValue RegSeq; - SDValue V0 = N->getOperand(0+3); - SDValue V1 = N->getOperand(1+3); - if (NumVecs == 2) { - RegSeq = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0); - } else { - SDValue V2 = N->getOperand(2+3); - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : N->getOperand(3+3); - RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); - } - - // Extract the subregs of the input vector. - unsigned SubIdx = Even ? ARM::dsub_0 : ARM::dsub_1; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(CurDAG->getTargetExtractSubreg(SubIdx+Vec*2, dl, RegVT, - RegSeq)); + SDValue V2 = N->getOperand(2+3); + SDValue V3 = (NumVecs == 3) + ? 
SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) + : N->getOperand(3+3); + if (is64BitVector) + SuperReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); + else + SuperReg = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0); } + Ops.push_back(SuperReg); Ops.push_back(getI32Imm(Lane)); Ops.push_back(Pred); Ops.push_back(Reg0); Ops.push_back(Chain); if (!IsLoad) - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+6); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 7); - std::vector ResTys(NumVecs, RegVT); - ResTys.push_back(MVT::Other); - SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(),NumVecs+6); - - // Form a REG_SEQUENCE to force register allocation. - SDValue RegSeq; - if (is64BitVector) { - SDValue V0 = SDValue(VLdLn, 0); - SDValue V1 = SDValue(VLdLn, 1); - if (NumVecs == 2) { - RegSeq = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0); - } else { - SDValue V2 = SDValue(VLdLn, 2); - // If it's a vld3, form a quad D-register but discard the last part. - SDValue V3 = (NumVecs == 3) - ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0) - : SDValue(VLdLn, 3); - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); - } - } else { - // For 128-bit vectors, take the 64-bit results of the load and insert - // them as subregs into the result. - SDValue V[8]; - for (unsigned Vec = 0, i = 0; Vec < NumVecs; ++Vec, i+=2) { - if (Even) { - V[i] = SDValue(VLdLn, Vec); - V[i+1] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - } else { - V[i] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); - V[i+1] = SDValue(VLdLn, Vec); - } - } - if (NumVecs == 3) - V[6] = V[7] = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, RegVT), 0); + EVT ResTy; + unsigned ResTyElts = (NumVecs == 3) ? 
4 : NumVecs; + if (!is64BitVector) + ResTyElts *= 2; + ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts); - if (NumVecs == 2) - RegSeq = SDValue(QuadDRegs(MVT::v4i64, V[0], V[1], V[2], V[3]), 0); - else - RegSeq = SDValue(OctoDRegs(MVT::v8i64, V[0], V[1], V[2], V[3], - V[4], V[5], V[6], V[7]), 0); - } + SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, + Ops.data(), 7); + SuperReg = SDValue(VLdLn, 0); + Chain = SDValue(VLdLn, 1); + // Extract the subregisters. assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering"); unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0; for (unsigned Vec = 0; Vec < NumVecs; ++Vec) ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, RegSeq)); - ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, NumVecs)); + CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg)); + ReplaceUses(SDValue(N, NumVecs), Chain); return NULL; } @@ -2119,24 +2008,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case Intrinsic::arm_neon_vld2lane: { - unsigned DOpcodes[] = { ARM::VLD2LNd8, ARM::VLD2LNd16, ARM::VLD2LNd32 }; - unsigned QOpcodes0[] = { ARM::VLD2LNq16, ARM::VLD2LNq32 }; - unsigned QOpcodes1[] = { ARM::VLD2LNq16odd, ARM::VLD2LNq32odd }; - return SelectVLDSTLane(N, true, 2, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo, + ARM::VLD2LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo }; + return SelectVLDSTLane(N, true, 2, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld3lane: { - unsigned DOpcodes[] = { ARM::VLD3LNd8, ARM::VLD3LNd16, ARM::VLD3LNd32 }; - unsigned QOpcodes0[] = { ARM::VLD3LNq16, ARM::VLD3LNq32 }; - unsigned QOpcodes1[] = { ARM::VLD3LNq16odd, ARM::VLD3LNq32odd }; - return SelectVLDSTLane(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, 
ARM::VLD3LNd16Pseudo, + ARM::VLD3LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo }; + return SelectVLDSTLane(N, true, 3, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vld4lane: { - unsigned DOpcodes[] = { ARM::VLD4LNd8, ARM::VLD4LNd16, ARM::VLD4LNd32 }; - unsigned QOpcodes0[] = { ARM::VLD4LNq16, ARM::VLD4LNq32 }; - unsigned QOpcodes1[] = { ARM::VLD4LNq16odd, ARM::VLD4LNq32odd }; - return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo, + ARM::VLD4LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo }; + return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst1: { @@ -2180,24 +2069,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case Intrinsic::arm_neon_vst2lane: { - unsigned DOpcodes[] = { ARM::VST2LNd8, ARM::VST2LNd16, ARM::VST2LNd32 }; - unsigned QOpcodes0[] = { ARM::VST2LNq16, ARM::VST2LNq32 }; - unsigned QOpcodes1[] = { ARM::VST2LNq16odd, ARM::VST2LNq32odd }; - return SelectVLDSTLane(N, false, 2, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo, + ARM::VST2LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo }; + return SelectVLDSTLane(N, false, 2, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst3lane: { - unsigned DOpcodes[] = { ARM::VST3LNd8, ARM::VST3LNd16, ARM::VST3LNd32 }; - unsigned QOpcodes0[] = { ARM::VST3LNq16, ARM::VST3LNq32 }; - unsigned QOpcodes1[] = { ARM::VST3LNq16odd, ARM::VST3LNq32odd }; - return SelectVLDSTLane(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo, + ARM::VST3LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo }; + return SelectVLDSTLane(N, false, 3, DOpcodes, QOpcodes); } case Intrinsic::arm_neon_vst4lane: { - unsigned DOpcodes[] = { ARM::VST4LNd8, ARM::VST4LNd16, 
ARM::VST4LNd32 }; - unsigned QOpcodes0[] = { ARM::VST4LNq16, ARM::VST4LNq32 }; - unsigned QOpcodes1[] = { ARM::VST4LNq16odd, ARM::VST4LNq32odd }; - return SelectVLDSTLane(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1); + unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo, + ARM::VST4LNd32Pseudo }; + unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo }; + return SelectVLDSTLane(N, false, 4, DOpcodes, QOpcodes); } } break; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index d22839c23aa..02820b35124 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -445,6 +445,33 @@ def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo; def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo; def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo; +// Classes for VLD*LN pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VLDQLNPseudo + : PseudoNLdSt<(outs QPR:$dst), + (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQLNWBPseudo + : PseudoNLdSt<(outs QPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; +class VLDQQLNPseudo + : PseudoNLdSt<(outs QQPR:$dst), + (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQQLNWBPseudo + : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; +class VLDQQQQLNPseudo + : PseudoNLdSt<(outs QQQQPR:$dst), + (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane), + itin, "$src = $dst">; +class VLDQQQQLNWBPseudo + : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">; + // VLD1LN : Vector Load (single element to one lane) // FIXME: Not yet implemented. 
@@ -459,13 +486,16 @@ def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8">; def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16">; def VLD2LNd32 : VLD2LN<0b1001, {?,0,?,?}, "32">; +def VLD2LNd8Pseudo : VLDQLNPseudo; +def VLD2LNd16Pseudo : VLDQLNPseudo; +def VLD2LNd32Pseudo : VLDQLNPseudo; + // ...with double-spaced registers: def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16">; def VLD2LNq32 : VLD2LN<0b1001, {?,1,?,?}, "32">; -// ...alternate versions to be allocated odd register numbers: -def VLD2LNq16odd : VLD2LN<0b0101, {?,?,1,?}, "16">; -def VLD2LNq32odd : VLD2LN<0b1001, {?,1,?,?}, "32">; +def VLD2LNq16Pseudo : VLDQQLNPseudo; +def VLD2LNq32Pseudo : VLDQQLNPseudo; // ...with address register writeback: class VLD2LNWB op11_8, bits<4> op7_4, string Dt> @@ -479,9 +509,16 @@ def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8">; def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16">; def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,?,?}, "32">; +def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo; +def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo; +def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo; + def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16">; def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,?,?}, "32">; +def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo; +def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo; + // VLD3LN : Vector Load (single 3-element structure to one lane) class VLD3LN op11_8, bits<4> op7_4, string Dt> : NLdSt<1, 0b10, op11_8, op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3), @@ -494,13 +531,16 @@ def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8">; def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16">; def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32">; +def VLD3LNd8Pseudo : VLDQQLNPseudo; +def VLD3LNd16Pseudo : VLDQQLNPseudo; +def VLD3LNd32Pseudo : VLDQQLNPseudo; + // ...with double-spaced registers: def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16">; def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32">; -// ...alternate versions to be allocated odd register numbers: -def VLD3LNq16odd : VLD3LN<0b0110, {?,?,1,0}, "16">; -def VLD3LNq32odd : 
VLD3LN<0b1010, {?,1,0,0}, "32">; +def VLD3LNq16Pseudo : VLDQQQQLNPseudo; +def VLD3LNq32Pseudo : VLDQQQQLNPseudo; // ...with address register writeback: class VLD3LNWB op11_8, bits<4> op7_4, string Dt> @@ -517,9 +557,16 @@ def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8">; def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16">; def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32">; +def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo; +def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo; +def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo; + def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16">; def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32">; +def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo; +def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo; + // VLD4LN : Vector Load (single 4-element structure to one lane) class VLD4LN op11_8, bits<4> op7_4, string Dt> : NLdSt<1, 0b10, op11_8, op7_4, @@ -533,13 +580,16 @@ def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8">; def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16">; def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32">; +def VLD4LNd8Pseudo : VLDQQLNPseudo; +def VLD4LNd16Pseudo : VLDQQLNPseudo; +def VLD4LNd32Pseudo : VLDQQLNPseudo; + // ...with double-spaced registers: def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16">; def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32">; -// ...alternate versions to be allocated odd register numbers: -def VLD4LNq16odd : VLD4LN<0b0111, {?,?,1,?}, "16">; -def VLD4LNq32odd : VLD4LN<0b1011, {?,1,?,?}, "32">; +def VLD4LNq16Pseudo : VLDQQQQLNPseudo; +def VLD4LNq32Pseudo : VLDQQQQLNPseudo; // ...with address register writeback: class VLD4LNWB op11_8, bits<4> op7_4, string Dt> @@ -556,9 +606,16 @@ def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8">; def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16">; def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32">; +def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo; +def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo; +def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo; + def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16">; def 
VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">; +def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo; +def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo; + // VLD1DUP : Vector Load (single element to all lanes) // VLD2DUP : Vector Load (single 2-element structure to all lanes) // VLD3DUP : Vector Load (single 3-element structure to all lanes) @@ -846,6 +903,30 @@ def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo; def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo; def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo; +// Classes for VST*LN pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VSTQLNPseudo + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQLNWBPseudo + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; +class VSTQQLNPseudo + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQQLNWBPseudo + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; +class VSTQQQQLNPseudo + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane), + itin, "">; +class VSTQQQQLNWBPseudo + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src, + nohash_imm:$lane), itin, "$addr.addr = $wb">; + // VST1LN : Vector Store (single element from one lane) // FIXME: Not yet implemented. 
@@ -860,13 +941,16 @@ def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8">; def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16">; def VST2LNd32 : VST2LN<0b1001, {?,0,?,?}, "32">; +def VST2LNd8Pseudo : VSTQLNPseudo; +def VST2LNd16Pseudo : VSTQLNPseudo; +def VST2LNd32Pseudo : VSTQLNPseudo; + // ...with double-spaced registers: def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16">; def VST2LNq32 : VST2LN<0b1001, {?,1,?,?}, "32">; -// ...alternate versions to be allocated odd register numbers: -def VST2LNq16odd : VST2LN<0b0101, {?,?,1,?}, "16">; -def VST2LNq32odd : VST2LN<0b1001, {?,1,?,?}, "32">; +def VST2LNq16Pseudo : VSTQQLNPseudo; +def VST2LNq32Pseudo : VSTQQLNPseudo; // ...with address register writeback: class VST2LNWB op11_8, bits<4> op7_4, string Dt> @@ -880,9 +964,16 @@ def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8">; def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16">; def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,?,?}, "32">; +def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo; +def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo; +def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo; + def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16">; def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,?,?}, "32">; +def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo; +def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo; + // VST3LN : Vector Store (single 3-element structure from one lane) class VST3LN op11_8, bits<4> op7_4, string Dt> : NLdSt<1, 0b00, op11_8, op7_4, (outs), @@ -894,13 +985,16 @@ def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8">; def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16">; def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32">; +def VST3LNd8Pseudo : VSTQQLNPseudo; +def VST3LNd16Pseudo : VSTQQLNPseudo; +def VST3LNd32Pseudo : VSTQQLNPseudo; + // ...with double-spaced registers: def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16">; def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32">; -// ...alternate versions to be allocated odd register numbers: -def VST3LNq16odd : VST3LN<0b0110, {?,?,1,0}, "16">; -def VST3LNq32odd : VST3LN<0b1010, {?,1,0,0}, 
"32">; +def VST3LNq16Pseudo : VSTQQQQLNPseudo; +def VST3LNq32Pseudo : VSTQQQQLNPseudo; // ...with address register writeback: class VST3LNWB op11_8, bits<4> op7_4, string Dt> @@ -915,9 +1009,16 @@ def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8">; def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16">; def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32">; +def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo; +def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo; +def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo; + def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16">; def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32">; +def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo; +def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo; + // VST4LN : Vector Store (single 4-element structure from one lane) class VST4LN op11_8, bits<4> op7_4, string Dt> : NLdSt<1, 0b00, op11_8, op7_4, (outs), @@ -930,13 +1031,16 @@ def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8">; def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16">; def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32">; +def VST4LNd8Pseudo : VSTQQLNPseudo; +def VST4LNd16Pseudo : VSTQQLNPseudo; +def VST4LNd32Pseudo : VSTQQLNPseudo; + // ...with double-spaced registers: def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16">; def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32">; -// ...alternate versions to be allocated odd register numbers: -def VST4LNq16odd : VST4LN<0b0111, {?,?,1,?}, "16">; -def VST4LNq32odd : VST4LN<0b1011, {?,1,?,?}, "32">; +def VST4LNq16Pseudo : VSTQQQQLNPseudo; +def VST4LNq32Pseudo : VSTQQQQLNPseudo; // ...with address register writeback: class VST4LNWB op11_8, bits<4> op7_4, string Dt> @@ -951,9 +1055,16 @@ def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8">; def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16">; def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32">; +def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo; +def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo; +def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo; + def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16">; def VST4LNq32_UPD : 
VST4LNWB<0b1011, {?,1,?,?}, "32">; +def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo; +def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo; + } // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index 3407ac6fe08..03ef272b873 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -51,144 +51,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, default: break; - case ARM::VLD2LNd8: - case ARM::VLD2LNd16: - case ARM::VLD2LNd32: - FirstOpnd = 0; - NumRegs = 2; - return true; - - case ARM::VLD2LNq16: - case ARM::VLD2LNq32: - FirstOpnd = 0; - NumRegs = 2; - Offset = 0; - Stride = 2; - return true; - - case ARM::VLD2LNq16odd: - case ARM::VLD2LNq32odd: - FirstOpnd = 0; - NumRegs = 2; - Offset = 1; - Stride = 2; - return true; - - case ARM::VLD3LNd8: - case ARM::VLD3LNd16: - case ARM::VLD3LNd32: - FirstOpnd = 0; - NumRegs = 3; - return true; - - case ARM::VLD3LNq16: - case ARM::VLD3LNq32: - FirstOpnd = 0; - NumRegs = 3; - Offset = 0; - Stride = 2; - return true; - - case ARM::VLD3LNq16odd: - case ARM::VLD3LNq32odd: - FirstOpnd = 0; - NumRegs = 3; - Offset = 1; - Stride = 2; - return true; - - case ARM::VLD4LNd8: - case ARM::VLD4LNd16: - case ARM::VLD4LNd32: - FirstOpnd = 0; - NumRegs = 4; - return true; - - case ARM::VLD4LNq16: - case ARM::VLD4LNq32: - FirstOpnd = 0; - NumRegs = 4; - Offset = 0; - Stride = 2; - return true; - - case ARM::VLD4LNq16odd: - case ARM::VLD4LNq32odd: - FirstOpnd = 0; - NumRegs = 4; - Offset = 1; - Stride = 2; - return true; - - case ARM::VST2LNd8: - case ARM::VST2LNd16: - case ARM::VST2LNd32: - FirstOpnd = 2; - NumRegs = 2; - return true; - - case ARM::VST2LNq16: - case ARM::VST2LNq32: - FirstOpnd = 2; - NumRegs = 2; - Offset = 0; - Stride = 2; - return true; - - case ARM::VST2LNq16odd: - case ARM::VST2LNq32odd: - FirstOpnd = 2; - NumRegs = 2; - Offset = 1; - Stride = 2; - return true; 
- - case ARM::VST3LNd8: - case ARM::VST3LNd16: - case ARM::VST3LNd32: - FirstOpnd = 2; - NumRegs = 3; - return true; - - case ARM::VST3LNq16: - case ARM::VST3LNq32: - FirstOpnd = 2; - NumRegs = 3; - Offset = 0; - Stride = 2; - return true; - - case ARM::VST3LNq16odd: - case ARM::VST3LNq32odd: - FirstOpnd = 2; - NumRegs = 3; - Offset = 1; - Stride = 2; - return true; - - case ARM::VST4LNd8: - case ARM::VST4LNd16: - case ARM::VST4LNd32: - FirstOpnd = 2; - NumRegs = 4; - return true; - - case ARM::VST4LNq16: - case ARM::VST4LNq32: - FirstOpnd = 2; - NumRegs = 4; - Offset = 0; - Stride = 2; - return true; - - case ARM::VST4LNq16odd: - case ARM::VST4LNq32odd: - FirstOpnd = 2; - NumRegs = 4; - Offset = 1; - Stride = 2; - return true; - case ARM::VTBL2: FirstOpnd = 1; NumRegs = 2; -- 2.34.1