void X86InstrInfo::anchor() {}
X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
- : X86GenInstrInfo(
- (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
- (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
- Subtarget(STI), RI(STI) {
+ : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
+ : X86::ADJCALLSTACKDOWN32),
+ (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
+ : X86::ADJCALLSTACKUP32),
+ X86::CATCHRET),
+ Subtarget(STI), RI(STI.getTargetTriple()) {
static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
{ X86::ADC32ri, X86::ADC32mi, 0 },
{ X86::XOR8rr, X86::XOR8mr, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2Addr); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable2Addr[i].RegOp;
- unsigned MemOp = MemoryFoldTable2Addr[i].MemOp;
- unsigned Flags = MemoryFoldTable2Addr[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 0, folded load and store, no alignment requirement.
- Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+ Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
}
static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
{ X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
{ X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
+ { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
+ { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
+ { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
{ X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
{ X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
{ X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
{ X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable0); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable0[i].RegOp;
- unsigned MemOp = MemoryFoldTable0[i].MemOp;
- unsigned Flags = MemoryFoldTable0[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
- RegOp, MemOp, TB_INDEX_0 | Flags);
+ Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
}
static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
+ { X86::BSF16rr, X86::BSF16rm, 0 },
+ { X86::BSF32rr, X86::BSF32rm, 0 },
+ { X86::BSF64rr, X86::BSF64rm, 0 },
+ { X86::BSR16rr, X86::BSR16rm, 0 },
+ { X86::BSR32rr, X86::BSR32rm, 0 },
+ { X86::BSR64rr, X86::BSR64rm, 0 },
{ X86::CMP16rr, X86::CMP16rm, 0 },
{ X86::CMP32rr, X86::CMP32rm, 0 },
{ X86::CMP64rr, X86::CMP64rm, 0 },
{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
{ X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 },
{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
- { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
{ X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
- { X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 },
+ { X86::RCPSSr, X86::RCPSSm, 0 },
+ { X86::RCPSSr_Int, X86::RCPSSm_Int, 0 },
{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
- { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 },
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
{ X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
{ X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
{ X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 },
{ X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
+ // 3DNow! version of foldable instructions
+ { X86::PF2IDrr, X86::PF2IDrm, 0 },
+ { X86::PF2IWrr, X86::PF2IWrm, 0 },
+ { X86::PFRCPrr, X86::PFRCPrm, 0 },
+ { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
+ { X86::PI2FDrr, X86::PI2FDrm, 0 },
+ { X86::PI2FWrr, X86::PI2FWrm, 0 },
+ { X86::PSWAPDrr, X86::PSWAPDrm, 0 },
+
// AVX 128-bit versions of foldable instructions
{ X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
{ X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
- { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 },
{ X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
{ X86::VPABSBrr128, X86::VPABSBrm128, 0 },
{ X86::VPABSDrr128, X86::VPABSDrm128, 0 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
{ X86::VPTESTrr, X86::VPTESTrm, 0 },
{ X86::VRCPPSr, X86::VRCPPSm, 0 },
- { X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 },
{ X86::VROUNDPDr, X86::VROUNDPDm, 0 },
{ X86::VROUNDPSr, X86::VROUNDPSm, 0 },
{ X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
- { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 },
{ X86::VSQRTPDr, X86::VSQRTPDm, 0 },
{ X86::VSQRTPSr, X86::VSQRTPSm, 0 },
{ X86::VTESTPDrr, X86::VTESTPDrm, 0 },
{ X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
{ X86::VPTESTYrr, X86::VPTESTYrm, 0 },
{ X86::VRCPPSYr, X86::VRCPPSYm, 0 },
- { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 },
{ X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
{ X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
{ X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
- { X86::VRSQRTPSYr_Int, X86::VRSQRTPSYm_Int, 0 },
{ X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
{ X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
{ X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
{ X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable1); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable1[i].RegOp;
- unsigned MemOp = MemoryFoldTable1[i].MemOp;
- unsigned Flags = MemoryFoldTable1[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 1, folded load
- Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
{ X86::CMPSDrr, X86::CMPSDrm, 0 },
{ X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::CRC32r32r32, X86::CRC32r32m32, 0 },
+ { X86::CRC32r64r64, X86::CRC32r64m64, 0 },
{ X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
{ X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
{ X86::DIVSDrr, X86::DIVSDrm, 0 },
{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
- // FIXME: We should not be folding Fs* scalar loads into vector
- // instructions because the vector instructions require vector-sized
- // loads. Lowering should create vector-sized instructions (the Fv*
- // variants below) to allow load folding.
- { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
- { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
- { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
- { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 },
- { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 },
- { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
- { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
- { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
+ // Do not fold Fs* scalar logical op loads because there are no scalar
+ // load variants for these instructions. When folded, the load is required
+ // to be 128-bits, so the load size would not match.
{ X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
{ X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
{ X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::SBB32rr, X86::SBB32rm, 0 },
{ X86::SBB64rr, X86::SBB64rm, 0 },
{ X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
{ X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 },
{ X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
+ // 3DNow! version of foldable instructions
+ { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
+ { X86::PFACCrr, X86::PFACCrm, 0 },
+ { X86::PFADDrr, X86::PFADDrm, 0 },
+ { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
+ { X86::PFCMPGErr, X86::PFCMPGErm, 0 },
+ { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
+ { X86::PFMAXrr, X86::PFMAXrm, 0 },
+ { X86::PFMINrr, X86::PFMINrm, 0 },
+ { X86::PFMULrr, X86::PFMULrm, 0 },
+ { X86::PFNACCrr, X86::PFNACCrm, 0 },
+ { X86::PFPNACCrr, X86::PFPNACCrm, 0 },
+ { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
+ { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
+ { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
+ { X86::PFSUBrr, X86::PFSUBrm, 0 },
+ { X86::PFSUBRrr, X86::PFSUBRrm, 0 },
+ { X86::PMULHRWrr, X86::PMULHRWrm, 0 },
+
// AVX 128-bit versions of foldable instructions
{ X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
{ X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 },
{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
{ X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
{ X86::VRCPSSr, X86::VRCPSSm, 0 },
+ { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 },
{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 },
{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 },
{ X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
{ X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
{ X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
{ X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
{ X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
{ X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
{ X86::VPXORYrr, X86::VPXORYrm, 0 },
// FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 },
- { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 },
- { X86::VFMADDPS4rr, X86::VFMADDPS4mr, 0 },
- { X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 },
- { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, 0 },
- { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, 0 },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, 0 },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 },
- { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, 0 },
- { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, 0 },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, 0 },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, 0 },
- { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, 0 },
- { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, 0 },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, 0 },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, 0 },
- { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, 0 },
- { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, 0 },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, 0 },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, 0 },
- { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, 0 },
- { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, 0 },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, 0 },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, 0 },
- { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, 0 },
- { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, 0 },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrr, X86::VPCMOVmr, 0 },
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
- { X86::VALIGNQrri, X86::VALIGNQrmi, 0 },
- { X86::VALIGNDrri, X86::VALIGNDrmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
{ X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
{ X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
{ X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable2[i].RegOp;
- unsigned MemOp = MemoryFoldTable2[i].MemOp;
- unsigned Flags = MemoryFoldTable2[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 2, folded load
- Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
// FMA foldable instructions
{ X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE },
{ X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE },
{ X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE },
{ X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE },
{ X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE },
{ X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE },
{ X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE },
{ X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE },
{ X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE },
{ X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE },
{ X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE },
// FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 },
- { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 },
- { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 },
- { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 },
- { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 },
- { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 },
- { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 },
- { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 },
- { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 },
- { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 },
- { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 },
- { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 },
- { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 },
- { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 },
- { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 },
- { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrr, X86::VPCMOVrm, 0 },
{ X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable3); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable3[i].RegOp;
- unsigned MemOp = MemoryFoldTable3[i].MemOp;
- unsigned Flags = MemoryFoldTable3[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 3, folded load
- Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable4); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable4[i].RegOp;
- unsigned MemOp = MemoryFoldTable4[i].MemOp;
- unsigned Flags = MemoryFoldTable4[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 4, folded load
- Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
}
}
case X86::FsVMOVAPSrm:
case X86::FsVMOVAPDrm:
case X86::FsMOVAPSrm:
- case X86::FsMOVAPDrm: {
+ case X86::FsMOVAPDrm:
+ // AVX-512
+ case X86::VMOVAPDZ128rm:
+ case X86::VMOVAPDZ256rm:
+ case X86::VMOVAPDZrm:
+ case X86::VMOVAPSZ128rm:
+ case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZrm:
+ case X86::VMOVDQA32Z128rm:
+ case X86::VMOVDQA32Z256rm:
+ case X86::VMOVDQA32Zrm:
+ case X86::VMOVDQA64Z128rm:
+ case X86::VMOVDQA64Z256rm:
+ case X86::VMOVDQA64Zrm:
+ case X86::VMOVDQU16Z128rm:
+ case X86::VMOVDQU16Z256rm:
+ case X86::VMOVDQU16Zrm:
+ case X86::VMOVDQU32Z128rm:
+ case X86::VMOVDQU32Z256rm:
+ case X86::VMOVDQU32Zrm:
+ case X86::VMOVDQU64Z128rm:
+ case X86::VMOVDQU64Z256rm:
+ case X86::VMOVDQU64Zrm:
+ case X86::VMOVDQU8Z128rm:
+ case X86::VMOVDQU8Z256rm:
+ case X86::VMOVDQU8Zrm:
+ case X86::VMOVUPSZ128rm:
+ case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZrm: {
// Loads from constant pools are trivially rematerializable.
if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
}
/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
-static bool hasLiveCondCodeDef(MachineInstr *MI) {
+bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const {
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (MO.isReg() && MO.isDef() &&
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
// Left shift instructions can be transformed into load-effective-address
// instructions if we can encode them appropriately.
- // A LEA instruction utilizes a SIB byte to encode it's scale factor.
+ // A LEA instruction utilizes a SIB byte to encode its scale factor.
// The SIB.scale field is two bits wide which means that we can encode any
// shift amount less than 4.
return ShAmt < 4 && ShAmt > 0;
return NewMI;
}
-/// We have a few instructions that must be hacked on to commute them.
-///
-MachineInstr *
-X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+/// Returns true if the given instruction opcode is FMA3.
+/// Otherwise, returns false.
+/// The second parameter is optional and is used as the second return from
+/// the function. It is set to true if the given instruction has FMA3 opcode
+/// that is used for lowering of scalar FMA intrinsics, and it is set to false
+/// otherwise.
+static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) {
+ if (IsIntrinsic)
+ *IsIntrinsic = false;
+
+ switch (Opcode) {
+ case X86::VFMADDSDr132r: case X86::VFMADDSDr132m:
+ case X86::VFMADDSSr132r: case X86::VFMADDSSr132m:
+ case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m:
+ case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m:
+ case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m:
+ case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m:
+ case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m:
+ case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m:
+
+ case X86::VFMADDSDr213r: case X86::VFMADDSDr213m:
+ case X86::VFMADDSSr213r: case X86::VFMADDSSr213m:
+ case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m:
+ case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m:
+ case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m:
+ case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m:
+ case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m:
+ case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m:
+
+ case X86::VFMADDSDr231r: case X86::VFMADDSDr231m:
+ case X86::VFMADDSSr231r: case X86::VFMADDSSr231m:
+ case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m:
+ case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m:
+ case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m:
+ case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m:
+ case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m:
+ case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m:
+
+ case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m:
+ case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m:
+ case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m:
+ case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m:
+ case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY:
+ case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY:
+ case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY:
+ case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY:
+
+ case X86::VFMADDPDr132r: case X86::VFMADDPDr132m:
+ case X86::VFMADDPSr132r: case X86::VFMADDPSr132m:
+ case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m:
+ case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m:
+ case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m:
+ case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m:
+ case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m:
+ case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m:
+ case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY:
+ case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY:
+ case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY:
+ case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY:
+ case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY:
+ case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY:
+ case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY:
+ case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY:
+
+ case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m:
+ case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m:
+ case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m:
+ case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m:
+ case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY:
+ case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY:
+ case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY:
+ case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY:
+
+ case X86::VFMADDPDr213r: case X86::VFMADDPDr213m:
+ case X86::VFMADDPSr213r: case X86::VFMADDPSr213m:
+ case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m:
+ case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m:
+ case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m:
+ case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m:
+ case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m:
+ case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m:
+ case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY:
+ case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY:
+ case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY:
+ case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY:
+ case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY:
+ case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY:
+ case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY:
+ case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY:
+
+ case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m:
+ case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m:
+ case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m:
+ case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m:
+ case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY:
+ case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY:
+ case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY:
+ case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY:
+
+ case X86::VFMADDPDr231r: case X86::VFMADDPDr231m:
+ case X86::VFMADDPSr231r: case X86::VFMADDPSr231m:
+ case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m:
+ case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m:
+ case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m:
+ case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m:
+ case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m:
+ case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m:
+ case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY:
+ case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY:
+ case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY:
+ case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY:
+ case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY:
+ case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY:
+ case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY:
+ case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY:
+ return true;
+
+ case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int:
+ case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int:
+ case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int:
+ case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int:
+ case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int:
+ case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int:
+ case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int:
+ case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int:
+
+ case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int:
+ case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int:
+ case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int:
+ case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int:
+ case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int:
+ case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int:
+ case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int:
+ case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int:
+
+ case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int:
+ case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int:
+ case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int:
+ case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int:
+ case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int:
+ case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int:
+ case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int:
+ case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int:
+ if (IsIntrinsic)
+ *IsIntrinsic = true;
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("Opcode not handled by the switch");
+}
+
+MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
switch (MI->getOpcode()) {
case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
}
MI->setDesc(get(Opc));
MI->getOperand(3).setImm(Size-Amt);
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
case X86::BLENDPDrri:
case X86::BLENDPSrri:
NewMI = false;
}
MI->getOperand(3).setImm(Mask ^ Imm);
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
case X86::PCLMULQDQrr:
case X86::VPCLMULQDQrr:{
NewMI = false;
}
MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
case X86::CMPPDrri:
case X86::CMPPSrri:
MI = MF.CloneMachineInstr(MI);
NewMI = false;
}
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
default:
return nullptr;
}
NewMI = false;
}
MI->getOperand(3).setImm(Imm);
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
// Fallthrough intended.
}
default:
- return TargetInstrInfo::commuteInstruction(MI, NewMI);
+ if (isFMA3(MI->getOpcode())) {
+ unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
+ if (Opc == 0)
+ return nullptr;
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ }
+ return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
}
-bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+
+ unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
+
+ // Only the first RegOpsNum operands are commutable.
+ // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
+ // that the operand is not specified/fixed.
+ if (SrcOpIdx1 != CommuteAnyOperandIndex &&
+ (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum))
+ return false;
+ if (SrcOpIdx2 != CommuteAnyOperandIndex &&
+ (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum))
+ return false;
+
+ // Look for two different register operands assumed to be commutable
+ // regardless of the FMA opcode. The FMA opcode is adjusted later.
+ if (SrcOpIdx1 == CommuteAnyOperandIndex ||
+ SrcOpIdx2 == CommuteAnyOperandIndex) {
+ unsigned CommutableOpIdx1 = SrcOpIdx1;
+ unsigned CommutableOpIdx2 = SrcOpIdx2;
+
+ // At least one of operands to be commuted is not specified and
+ // this method is free to choose appropriate commutable operands.
+ if (SrcOpIdx1 == SrcOpIdx2)
+ // Both of operands are not fixed. By default set one of commutable
+ // operands to the last register operand of the instruction.
+ CommutableOpIdx2 = RegOpsNum;
+ else if (SrcOpIdx2 == CommuteAnyOperandIndex)
+ // Only one of operands is not fixed.
+ CommutableOpIdx2 = SrcOpIdx1;
+
+ // CommutableOpIdx2 is well defined now. Let's choose another commutable
+ // operand and assign its index to CommutableOpIdx1.
+ unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg();
+ for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
+ // The commuted operands must have different registers.
+ // Otherwise, the commute transformation does not change anything and
+ // is useless then.
+ if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg())
+ break;
+ }
+
+ // No appropriate commutable operands were found.
+ if (CommutableOpIdx1 == 0)
+ return false;
+
+ // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
+ // to return those values.
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+ }
+
+ // Check if we can adjust the opcode to preserve the semantics when
+ // commute the register operands.
+ return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
+}
+
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
+ unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2) const {
+ unsigned Opc = MI->getOpcode();
+
+ // Define the array that holds FMA opcodes in groups
+ // of 3 opcodes(132, 213, 231) in each group.
+ static const unsigned RegularOpcodeGroups[][3] = {
+ { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r },
+ { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r },
+ { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r },
+ { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r },
+ { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY },
+ { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY },
+ { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m },
+ { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m },
+ { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m },
+ { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m },
+ { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY },
+ { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY },
+
+ { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r },
+ { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r },
+ { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r },
+ { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r },
+ { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY },
+ { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY },
+ { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m },
+ { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m },
+ { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m },
+ { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m },
+ { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY },
+ { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY },
+
+ { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r },
+ { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r },
+ { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r },
+ { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r },
+ { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY },
+ { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY },
+ { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m },
+ { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m },
+ { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m },
+ { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m },
+ { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY },
+ { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY },
+
+ { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r },
+ { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r },
+ { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r },
+ { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r },
+ { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY },
+ { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY },
+ { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m },
+ { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m },
+ { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m },
+ { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m },
+ { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY },
+ { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY },
+
+ { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r },
+ { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r },
+ { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY },
+ { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY },
+ { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m },
+ { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m },
+ { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY },
+ { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY },
+
+ { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r },
+ { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r },
+ { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY },
+ { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY },
+ { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m },
+ { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m },
+ { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY },
+ { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY }
+ };
+
+ // Define the array that holds FMA*_Int opcodes in groups
+ // of 3 opcodes(132, 213, 231) in each group.
+ static const unsigned IntrinOpcodeGroups[][3] = {
+ { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int },
+ { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int },
+ { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int },
+ { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int },
+
+ { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int },
+ { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int },
+ { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int },
+ { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int },
+
+ { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int },
+ { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int },
+ { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int },
+ { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int },
+
+ { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int },
+ { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int },
+ { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int },
+ { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int },
+ };
+
+ const unsigned Form132Index = 0;
+ const unsigned Form213Index = 1;
+ const unsigned Form231Index = 2;
+ const unsigned FormsNum = 3;
+
+ bool IsIntrinOpcode;
+ isFMA3(Opc, &IsIntrinOpcode);
+
+ size_t GroupsNum;
+ const unsigned (*OpcodeGroups)[3];
+ if (IsIntrinOpcode) {
+ GroupsNum = array_lengthof(IntrinOpcodeGroups);
+ OpcodeGroups = IntrinOpcodeGroups;
+ } else {
+ GroupsNum = array_lengthof(RegularOpcodeGroups);
+ OpcodeGroups = RegularOpcodeGroups;
+ }
+
+ const unsigned *FoundOpcodesGroup = nullptr;
+ size_t FormIndex;
+
+ // Look for the input opcode in the corresponding opcodes table.
+ for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup;
+ ++GroupIndex) {
+ for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) {
+ if (OpcodeGroups[GroupIndex][FormIndex] == Opc) {
+ FoundOpcodesGroup = OpcodeGroups[GroupIndex];
+ break;
+ }
+ }
+ }
+
+ // The input opcode does not match with any of the opcodes from the tables.
+ // The unsupported FMA opcode must be added to one of the two opcode groups
+ // defined above.
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode");
+
+ // Put the lowest index to SrcOpIdx1 to simplify the checks below.
+ if (SrcOpIdx1 > SrcOpIdx2)
+ std::swap(SrcOpIdx1, SrcOpIdx2);
+
+ // TODO: Commuting the 1st operand of FMA*_Int requires some additional
+ // analysis. The commute optimization is legal only if all users of FMA*_Int
+ // use only the lowest element of the FMA*_Int instruction. Such analysis are
+ // not implemented yet. So, just return 0 in that case.
+ // When such analysis are available this place will be the right place for
+ // calling it.
+ if (IsIntrinOpcode && SrcOpIdx1 == 1)
+ return 0;
+
+ unsigned Case;
+ if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2)
+ Case = 0;
+ else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3)
+ Case = 1;
+ else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3)
+ Case = 2;
+ else
+ return 0;
+
+ // Define the FMA forms mapping array that helps to map input FMA form
+ // to output FMA form to preserve the operation semantics after
+ // commuting the operands.
+ static const unsigned FormMapping[][3] = {
+ // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
+ // FMA132 A, C, b; ==> FMA231 C, A, b;
+ // FMA213 B, A, c; ==> FMA213 A, B, c;
+ // FMA231 C, A, b; ==> FMA132 A, C, b;
+ { Form231Index, Form213Index, Form132Index },
+ // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
+ // FMA132 A, c, B; ==> FMA132 B, c, A;
+ // FMA213 B, a, C; ==> FMA231 C, a, B;
+ // FMA231 C, a, B; ==> FMA213 B, a, C;
+ { Form132Index, Form231Index, Form213Index },
+ // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
+ // FMA132 a, C, B; ==> FMA213 a, B, C;
+ // FMA213 b, A, C; ==> FMA132 b, C, A;
+ // FMA231 c, A, B; ==> FMA231 c, B, A;
+ { Form213Index, Form132Index, Form231Index }
+ };
+
+ // Everything is ready, just adjust the FMA opcode and return it.
+ FormIndex = FormMapping[Case][FormIndex];
+ return FoundOpcodesGroup[FormIndex];
+}
+
+bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,
+ unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
switch (MI->getOpcode()) {
case X86::CMPPDrri:
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI->getOperand(3).getImm() & 0x7;
switch (Imm) {
- case 0x00: // EQUAL
- case 0x03: // UNORDERED
- case 0x04: // NOT EQUAL
- case 0x07: // ORDERED
- SrcOpIdx1 = 1;
- SrcOpIdx2 = 2;
- return true;
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ // The indices of the commutable operands are 1 and 2.
+ // Assign them to the returned operand indices here.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
}
return false;
}
- case X86::VFMADDPDr231r:
- case X86::VFMADDPSr231r:
- case X86::VFMADDSDr231r:
- case X86::VFMADDSSr231r:
- case X86::VFMSUBPDr231r:
- case X86::VFMSUBPSr231r:
- case X86::VFMSUBSDr231r:
- case X86::VFMSUBSSr231r:
- case X86::VFNMADDPDr231r:
- case X86::VFNMADDPSr231r:
- case X86::VFNMADDSDr231r:
- case X86::VFNMADDSSr231r:
- case X86::VFNMSUBPDr231r:
- case X86::VFNMSUBPSr231r:
- case X86::VFNMSUBSDr231r:
- case X86::VFNMSUBSSr231r:
- case X86::VFMADDPDr231rY:
- case X86::VFMADDPSr231rY:
- case X86::VFMSUBPDr231rY:
- case X86::VFMSUBPSr231rY:
- case X86::VFNMADDPDr231rY:
- case X86::VFNMADDPSr231rY:
- case X86::VFNMSUBPDr231rY:
- case X86::VFNMSUBPSr231rY:
- SrcOpIdx1 = 2;
- SrcOpIdx2 = 3;
- return true;
default:
+ if (isFMA3(MI->getOpcode()))
+ return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
}
+ return false;
}
static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
return !isPredicated(MI);
}
-bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+bool X86InstrInfo::AnalyzeBranchImpl(
+ MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
+
// Start from the bottom of the block and work up, examining the
// terminator instructions.
MachineBasicBlock::iterator I = MBB.end();
FBB = TBB;
TBB = I->getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ CondBranches.push_back(I);
continue;
}
// Update the MachineOperand.
Cond[0].setImm(BranchCode);
+ CondBranches.push_back(I);
}
return false;
}
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ SmallVector<MachineInstr *, 4> CondBranches;
+ return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
+}
+
+bool X86InstrInfo::AnalyzeBranchPredicate(MachineBasicBlock &MBB,
+ MachineBranchPredicate &MBP,
+ bool AllowModify) const {
+ using namespace std::placeholders;
+
+ SmallVector<MachineOperand, 4> Cond;
+ SmallVector<MachineInstr *, 4> CondBranches;
+ if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
+ AllowModify))
+ return true;
+
+ if (Cond.size() != 1)
+ return true;
+
+ assert(MBP.TrueDest && "expected!");
+
+ if (!MBP.FalseDest)
+ MBP.FalseDest = MBB.getNextNode();
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ MachineInstr *ConditionDef = nullptr;
+ bool SingleUseCondition = true;
+
+ for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
+ if (I->modifiesRegister(X86::EFLAGS, TRI)) {
+ ConditionDef = &*I;
+ break;
+ }
+
+ if (I->readsRegister(X86::EFLAGS, TRI))
+ SingleUseCondition = false;
+ }
+
+ if (!ConditionDef)
+ return true;
+
+ if (SingleUseCondition) {
+ for (auto *Succ : MBB.successors())
+ if (Succ->isLiveIn(X86::EFLAGS))
+ SingleUseCondition = false;
+ }
+
+ MBP.ConditionDef = ConditionDef;
+ MBP.SingleUseCondition = SingleUseCondition;
+
+ // Currently we only recognize the simple pattern:
+ //
+ // test %reg, %reg
+ // je %label
+ //
+ const unsigned TestOpcode =
+ Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
+
+ if (ConditionDef->getOpcode() == TestOpcode &&
+ ConditionDef->getNumOperands() == 3 &&
+ ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
+ (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
+ MBP.LHS = ConditionDef->getOperand(0);
+ MBP.RHS = MachineOperand::CreateImm(0);
+ MBP.Predicate = Cond[0].getImm() == X86::COND_NE
+ ? MachineBranchPredicate::PRED_NE
+ : MachineBranchPredicate::PRED_EQ;
+ return false;
+ }
+
+ return true;
+}
+
unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator I = MBB.end();
unsigned Count = 0;
unsigned
X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
- MachineBasicBlock *FBB,
- const SmallVectorImpl<MachineOperand> &Cond,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
DebugLoc DL) const {
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
bool X86InstrInfo::
canInsertSelect(const MachineBasicBlock &MBB,
- const SmallVectorImpl<MachineOperand> &Cond,
+ ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles, int &TrueCycles, int &FalseCycles) const {
// Not all subtargets have cmov instructions.
void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
- unsigned DstReg,
- const SmallVectorImpl<MachineOperand> &Cond,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
assert(Cond.size() == 1 && "Invalid Cond array");
X86::MOVPQIto64rr);
if (X86::VR64RegClass.contains(SrcReg))
// Copy from a VR64 register to a GR64 register.
- return X86::MOVSDto64rr;
+ return X86::MMX_MOVD64from64rr;
} else if (X86::GR64RegClass.contains(SrcReg)) {
// Copy from a GR64 register to a VR128 register.
if (X86::VR128XRegClass.contains(DestReg))
X86::MOV64toPQIrr);
// Copy from a GR64 register to a VR64 register.
if (X86::VR64RegClass.contains(DestReg))
- return X86::MOV64toSDrr;
+ return X86::MMX_MOVD64to64rr;
}
// SrcReg(FR32) -> DestReg(GR32)
return 0;
}
-inline static bool MaskRegClassContains(unsigned Reg) {
+static bool MaskRegClassContains(unsigned Reg) {
return X86::VK8RegClass.contains(Reg) ||
X86::VK16RegClass.contains(Reg) ||
X86::VK32RegClass.contains(Reg) ||
X86::VK64RegClass.contains(Reg) ||
X86::VK1RegClass.contains(Reg);
}
+
+static bool GRRegClassContains(unsigned Reg) {
+ return X86::GR64RegClass.contains(Reg) ||
+ X86::GR32RegClass.contains(Reg) ||
+ X86::GR16RegClass.contains(Reg) ||
+ X86::GR8RegClass.contains(Reg);
+}
+static
+unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) {
+ if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) {
+ DestReg = getX86SubSuperRegister(DestReg, MVT::i32);
+ return X86::KMOVBrk;
+ }
+ if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) {
+ SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32);
+ return X86::KMOVBkr;
+ }
+ return 0;
+}
+
+static
+unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) {
+ if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg))
+ return X86::KMOVQkk;
+ if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg))
+ return X86::KMOVDrk;
+ if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg))
+ return X86::KMOVQrk;
+ if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+ return X86::KMOVDkr;
+ if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg))
+ return X86::KMOVQkr;
+ return 0;
+}
+
static
-unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
+unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg,
+ const X86Subtarget &Subtarget)
+{
+ if (Subtarget.hasDQI())
+ if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg))
+ return Opc;
+ if (Subtarget.hasBWI())
+ if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg))
+ return Opc;
if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
X86::VR256XRegClass.contains(DestReg, SrcReg) ||
X86::VR512RegClass.contains(DestReg, SrcReg)) {
SrcReg = get512BitSuperRegister(SrcReg);
return X86::VMOVAPSZrr;
}
- if (MaskRegClassContains(DestReg) &&
- MaskRegClassContains(SrcReg))
+ if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg))
return X86::KMOVWkk;
- if (MaskRegClassContains(DestReg) &&
- (X86::GR32RegClass.contains(SrcReg) ||
- X86::GR16RegClass.contains(SrcReg) ||
- X86::GR8RegClass.contains(SrcReg))) {
+ if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) {
SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32);
return X86::KMOVWkr;
}
- if ((X86::GR32RegClass.contains(DestReg) ||
- X86::GR16RegClass.contains(DestReg) ||
- X86::GR8RegClass.contains(DestReg)) &&
- MaskRegClassContains(SrcReg)) {
+ if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) {
DestReg = getX86SubSuperRegister(DestReg, MVT::i32);
return X86::KMOVWrk;
}
else if (X86::VR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MMX_MOVQ64rr;
else if (HasAVX512)
- Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
+ Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget);
else if (X86::VR128RegClass.contains(DestReg, SrcReg))
Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
return;
}
- // Moving EFLAGS to / from another register requires a push and a pop.
- // Notice that we have to adjust the stack if we don't want to clobber the
- // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
- if (SrcReg == X86::EFLAGS) {
- if (X86::GR64RegClass.contains(DestReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSHF64));
- BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
- return;
- }
- if (X86::GR32RegClass.contains(DestReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSHF32));
- BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
- return;
- }
- }
- if (DestReg == X86::EFLAGS) {
- if (X86::GR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSH64r))
- .addReg(SrcReg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(X86::POPF64));
- return;
+ bool FromEFLAGS = SrcReg == X86::EFLAGS;
+ bool ToEFLAGS = DestReg == X86::EFLAGS;
+ int Reg = FromEFLAGS ? DestReg : SrcReg;
+ bool is32 = X86::GR32RegClass.contains(Reg);
+ bool is64 = X86::GR64RegClass.contains(Reg);
+ if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+ // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+ // inefficient. Instead:
+ // - Save the overflow flag OF into AL using SETO, and restore it using a
+ // signed 8-bit addition of AL and INT8_MAX.
+ // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+ // using LAHF/SAHF.
+ // - When RAX/EAX is live and isn't the destination register, make sure it
+ // isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+ // the flags.
+ // This approach is ~2.25x faster than using PUSHF/POPF.
+ //
+ // This is still somewhat inefficient because we don't know which flags are
+ // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+ // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
+ //
+ // PUSHF/POPF is also potentially incorrect because it affects other flags
+ // such as TF/IF/DF, which LLVM doesn't model.
+ //
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
+
+ int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+ int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+ int Pop = is64 ? X86::POP64r : X86::POP32r;
+ int AX = is64 ? X86::RAX : X86::EAX;
+
+ bool AXDead = (Reg == AX);
+ // FIXME: The above could figure out that AX is dead in more cases with:
+ // || (MachineBasicBlock::LQR_Dead ==
+ // MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI));
+ //
+ // Unfortunately this is slightly broken, see PR24535 and the likely
+ // related PR25033 PR24991 PR24992 PR25201. These issues seem to
+ // showcase sub-register / super-register confusion: a previous kill
+ // of AH but no kill of AL leads computeRegisterLiveness to
+ // erroneously conclude that AX is dead.
+ //
+ // Once fixed, also update cmpxchg-clobber-flags.ll and
+ // peephole-na-phys-copy-folding.ll.
+
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+ if (FromEFLAGS) {
+ BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+ BuildMI(MBB, MI, DL, get(X86::LAHF));
+ BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
}
- if (X86::GR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSH32r))
- .addReg(SrcReg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(X86::POPF32));
- return;
+ if (ToEFLAGS) {
+ BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+ .addReg(X86::AL)
+ .addImm(INT8_MAX);
+ BuildMI(MBB, MI, DL, get(X86::SAHF));
}
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Pop), AX);
+ return;
}
DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
}
}
+bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg,
+ unsigned &Offset,
+ const TargetRegisterInfo *TRI) const {
+ const MCInstrDesc &Desc = MemOp->getDesc();
+ int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags, MemOp->getOpcode());
+ if (MemRefBegin < 0)
+ return false;
+
+ MemRefBegin += X86II::getOperandBias(Desc);
+
+ BaseReg = MemOp->getOperand(MemRefBegin + X86::AddrBaseReg).getReg();
+ if (MemOp->getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
+ return false;
+
+ if (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
+ X86::NoRegister)
+ return false;
+
+ const MachineOperand &DispMO = MemOp->getOperand(MemRefBegin + X86::AddrDisp);
+
+ // Displacement can be symbolic
+ if (!DispMO.isImm())
+ return false;
+
+ Offset = DispMO.getImm();
+
+ return (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() ==
+ X86::NoRegister);
+}
+
static unsigned getStoreRegOpcode(unsigned SrcReg,
const TargetRegisterClass *RC,
bool isStackAligned,
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
bool SawStore = false;
- if (!DefMI->isSafeToMove(this, nullptr, SawStore))
+ if (!DefMI->isSafeToMove(nullptr, SawStore))
return nullptr;
// Collect information about virtual register operands of MI.
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
- MachineMemOperand *MMO = MBB.getParent()->
- getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8);
+ MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8);
MachineBasicBlock::iterator I = MIB.getInstr();
BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
return true;
case X86::KSET0B:
case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
+ case X86::KSET0D: return Expand2AddrUndef(MIB, get(X86::KXORDrr));
+ case X86::KSET0Q: return Expand2AddrUndef(MIB, get(X86::KXORQrr));
case X86::KSET1B:
case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
+ case X86::KSET1D: return Expand2AddrUndef(MIB, get(X86::KXNORDrr));
+ case X86::KSET1Q: return Expand2AddrUndef(MIB, get(X86::KXNORQrr));
case TargetOpcode::LOAD_STACK_GUARD:
expandLoadStackGuard(MIB, *this);
return true;
return false;
}
+static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
+ int PtrOffset = 0) {
+ unsigned NumAddrOps = MOs.size();
+
+ if (NumAddrOps < 4) {
+ // FrameIndex only - add an immediate offset (whether its zero or not).
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ addOffset(MIB, PtrOffset);
+ } else {
+ // General Memory Addressing - we need to add any offset to an existing
+ // offset.
+ assert(MOs.size() == 5 && "Unexpected memory operand list length");
+ for (unsigned i = 0; i != NumAddrOps; ++i) {
+ const MachineOperand &MO = MOs[i];
+ if (i == 3 && PtrOffset != 0) {
+ MIB.addDisp(MO, PtrOffset);
+ } else {
+ MIB.addOperand(MO);
+ }
+ }
+ }
+}
+
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
MachineInstr *MI,
const TargetInstrInfo &TII) {
// Create the base instruction with the memory operand as the first part.
MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
MI->getDebugLoc(), true);
MachineInstrBuilder MIB(MF, NewMI);
- unsigned NumAddrOps = MOs.size();
- for (unsigned i = 0; i != NumAddrOps; ++i)
- MIB.addOperand(MOs[i]);
- if (NumAddrOps < 4) // FrameIndex only
- addOffset(MIB, 0);
+ addOperands(MIB, MOs);
// Loop over the rest of the ri operands, converting them over.
unsigned NumOps = MI->getDesc().getNumOperands()-2;
MachineOperand &MO = MI->getOperand(i);
MIB.addOperand(MO);
}
+
+ MachineBasicBlock *MBB = InsertPt->getParent();
+ MBB->insert(InsertPt, NewMI);
+
return MIB;
}
static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
unsigned OpNo, ArrayRef<MachineOperand> MOs,
- MachineInstr *MI, const TargetInstrInfo &TII) {
+ MachineBasicBlock::iterator InsertPt,
+ MachineInstr *MI, const TargetInstrInfo &TII,
+ int PtrOffset = 0) {
// Omit the implicit operands, something BuildMI can't do.
MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
MI->getDebugLoc(), true);
MachineOperand &MO = MI->getOperand(i);
if (i == OpNo) {
assert(MO.isReg() && "Expected to fold into reg operand!");
- unsigned NumAddrOps = MOs.size();
- for (unsigned i = 0; i != NumAddrOps; ++i)
- MIB.addOperand(MOs[i]);
- if (NumAddrOps < 4) // FrameIndex only
- addOffset(MIB, 0);
+ addOperands(MIB, MOs, PtrOffset);
} else {
MIB.addOperand(MO);
}
}
+
+ MachineBasicBlock *MBB = InsertPt->getParent();
+ MBB->insert(InsertPt, NewMI);
+
return MIB;
}
static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
ArrayRef<MachineOperand> MOs,
+ MachineBasicBlock::iterator InsertPt,
MachineInstr *MI) {
- MachineFunction &MF = *MI->getParent()->getParent();
- MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
-
- unsigned NumAddrOps = MOs.size();
- for (unsigned i = 0; i != NumAddrOps; ++i)
- MIB.addOperand(MOs[i]);
- if (NumAddrOps < 4) // FrameIndex only
- addOffset(MIB, 0);
+ MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+ MI->getDebugLoc(), TII.get(Opcode));
+ addOperands(MIB, MOs);
return MIB.addImm(0);
}
-MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr *MI,
- unsigned OpNum,
- ArrayRef<MachineOperand> MOs,
- unsigned Size, unsigned Align,
- bool AllowCommute) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
+ MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align) const {
+ switch (MI->getOpcode()) {
+ case X86::INSERTPSrr:
+ case X86::VINSERTPSrr:
+ // Attempt to convert the load of inserted vector into a fold load
+ // of a single float.
+ if (OpNum == 2) {
+ unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm();
+ unsigned ZMask = Imm & 15;
+ unsigned DstIdx = (Imm >> 4) & 3;
+ unsigned SrcIdx = (Imm >> 6) & 3;
+
+ unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize();
+ if (Size <= RCSize && 4 <= Align) {
+ int PtrOffset = SrcIdx * 4;
+ unsigned NewImm = (DstIdx << 4) | ZMask;
+ unsigned NewOpCode =
+ (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm
+ : X86::INSERTPSrm);
+ MachineInstr *NewMI =
+ FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
+ NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
+ return NewMI;
+ }
+ }
+ break;
+ };
+
+ return nullptr;
+}
+
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr *MI, unsigned OpNum,
+ ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
+ unsigned Size, unsigned Align, bool AllowCommute) const {
const DenseMap<unsigned,
std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
bool isCallRegIndirect = Subtarget.callRegIndirect();
bool isTwoAddrFold = false;
- // For CPUs that favor the register form of a call,
- // do not fold loads into calls.
- if (isCallRegIndirect &&
- (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r))
+ // For CPUs that favor the register form of a call or push,
+ // do not fold loads into calls or pushes, unless optimizing for size
+ // aggressively.
+ if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
+ (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r ||
+ MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r ||
+ MI->getOpcode() == X86::PUSH64r))
return nullptr;
unsigned NumOps = MI->getDesc().getNumOperands();
return nullptr;
MachineInstr *NewMI = nullptr;
+
+ // Attempt to fold any custom cases we have.
+ if (MachineInstr *CustomMI =
+ foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
+ return CustomMI;
+
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
isTwoAddrFold = true;
} else if (OpNum == 0) {
if (MI->getOpcode() == X86::MOV32r0) {
- NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
if (NewMI)
return NewMI;
}
}
if (isTwoAddrFold)
- NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
else
- NewMI = FuseInst(MF, Opcode, OpNum, MOs, MI, *this);
+ NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
if (NarrowToMOV32rm) {
// If this is the special case where we use a MOV32rm to load a 32-bit
// If the instruction and target operand are commutable, commute the
// instruction and try again.
if (AllowCommute) {
- unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2;
+ unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
bool HasDef = MI->getDesc().getNumDefs();
unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
- bool Tied0 =
- 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
bool Tied1 =
+ 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+ bool Tied2 =
0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
// If either of the commutable operands are tied to the destination
// then we can not commute + fold.
- if ((HasDef && Reg0 == Reg1 && Tied0) ||
- (HasDef && Reg0 == Reg2 && Tied1))
+ if ((HasDef && Reg0 == Reg1 && Tied1) ||
+ (HasDef && Reg0 == Reg2 && Tied2))
return nullptr;
- if ((CommuteOpIdx1 == OriginalOpIdx) ||
- (CommuteOpIdx2 == OriginalOpIdx)) {
- MachineInstr *CommutedMI = commuteInstruction(MI, false);
- if (!CommutedMI) {
- // Unable to commute.
- return nullptr;
- }
- if (CommutedMI != MI) {
- // New instruction. We can't fold from this.
- CommutedMI->eraseFromParent();
- return nullptr;
- }
+ MachineInstr *CommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!CommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (CommutedMI != MI) {
+ // New instruction. We can't fold from this.
+ CommutedMI->eraseFromParent();
+ return nullptr;
+ }
- // Attempt to fold with the commuted version of the instruction.
- unsigned CommuteOp =
- (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
- NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
- /*AllowCommute=*/false);
- if (NewMI)
- return NewMI;
-
- // Folding failed again - undo the commute before returning.
- MachineInstr *UncommutedMI = commuteInstruction(MI, false);
- if (!UncommutedMI) {
- // Unable to commute.
- return nullptr;
- }
- if (UncommutedMI != MI) {
- // New instruction. It doesn't need to be kept.
- UncommutedMI->eraseFromParent();
- return nullptr;
- }
+ // Attempt to fold with the commuted version of the instruction.
+ NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
+ Size, Align, /*AllowCommute=*/false);
+ if (NewMI)
+ return NewMI;
- // Return here to prevent duplicate fuse failure report.
+ // Folding failed again - undo the commute before returning.
+ MachineInstr *UncommutedMI =
+ commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
+ if (!UncommutedMI) {
+ // Unable to commute.
+ return nullptr;
+ }
+ if (UncommutedMI != MI) {
+ // New instruction. It doesn't need to be kept.
+ UncommutedMI->eraseFromParent();
return nullptr;
}
+
+ // Return here to prevent duplicate fuse failure report.
+ return nullptr;
}
}
MI->addRegisterKilled(Reg, TRI, true);
}
-MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- int FrameIndex) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex) const {
// Check switch flag
if (NoFusing) return nullptr;
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
- if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
- hasPartialRegUpdate(MI->getOpcode()))
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
return nullptr;
const MachineFrameInfo *MFI = MF.getFrameInfo();
return nullptr;
return foldMemoryOperandImpl(MF, MI, Ops[0],
- MachineOperand::CreateFI(FrameIndex), Size,
- Alignment, /*AllowCommute=*/true);
+ MachineOperand::CreateFI(FrameIndex), InsertPt,
+ Size, Alignment, /*AllowCommute=*/true);
}
-static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
- const MachineFunction &MF) {
+/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version. For instance, this transformation isn't legal:
+/// movss (%rdi), %xmm0
+/// addps %xmm0, %xmm0
+/// ->
+/// addps (%rdi), %xmm0
+///
+/// But this one is:
+/// movss (%rdi), %xmm0
+/// addss %xmm0, %xmm0
+/// ->
+/// addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+ const MachineInstr &UserMI,
+ const MachineFunction &MF) {
unsigned Opc = LoadMI.getOpcode();
+ unsigned UserOpc = UserMI.getOpcode();
unsigned RegSize =
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
- if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+ if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
// These instructions only load 32 bits, we can't fold them if the
- // destination register is wider than 32 bits (4 bytes).
- return true;
+ // destination register is wider than 32 bits (4 bytes), and its user
+ // instruction isn't scalar (SS).
+ switch (UserOpc) {
+ case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
+ case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
+ case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
+ case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+ case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int:
+ case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int:
+ case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int:
+ case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int:
+ case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int:
+ case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int:
+ return false;
+ default:
+ return true;
+ }
+ }
- if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+ if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
// These instructions only load 64 bits, we can't fold them if the
- // destination register is wider than 64 bits (8 bytes).
- return true;
+ // destination register is wider than 64 bits (8 bytes), and its user
+ // instruction isn't scalar (SD).
+ switch (UserOpc) {
+ case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
+ case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
+ case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
+ case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+ case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int:
+ case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int:
+ case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int:
+ case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int:
+ case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int:
+ case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int:
+ return false;
+ default:
+ return true;
+ }
+ }
return false;
}
-MachineInstr *X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr *MI,
- ArrayRef<unsigned> Ops,
- MachineInstr *LoadMI) const {
+MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const {
// If loading from a FrameIndex, fold directly from the FrameIndex.
unsigned NumOps = LoadMI->getDesc().getNumOperands();
int FrameIndex;
if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
- if (isPartialRegisterLoad(*LoadMI, MF))
+ if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
return nullptr;
- return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+ return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
}
// Check switch flag
if (NoFusing) return nullptr;
- // Unless optimizing for size, don't fold to avoid partial
- // register update stalls
- if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
- hasPartialRegUpdate(MI->getOpcode()))
+ // Avoid partial register update stalls unless optimizing for size.
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
return nullptr;
// Determine the alignment of the load.
break;
}
default: {
- if (isPartialRegisterLoad(*LoadMI, MF))
+ if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
return nullptr;
// Folding a normal load. Just copy the load's address operands.
- for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
- MOs.push_back(LoadMI->getOperand(i));
+ MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands,
+ LoadMI->operands_begin() + NumOps);
break;
}
}
- return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
/*Size=*/0, Alignment, /*AllowCommute=*/true);
}
-bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
- ArrayRef<unsigned> Ops) const {
- // Check switch flag
- if (NoFusing) return 0;
-
- if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
- switch (MI->getOpcode()) {
- default: return false;
- case X86::TEST8rr:
- case X86::TEST16rr:
- case X86::TEST32rr:
- case X86::TEST64rr:
- return true;
- case X86::ADD32ri:
- // FIXME: AsmPrinter doesn't know how to handle
- // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
- if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
- return false;
- break;
- }
- }
-
- if (Ops.size() != 1)
- return false;
-
- unsigned OpNum = Ops[0];
- unsigned Opc = MI->getOpcode();
- unsigned NumOps = MI->getDesc().getNumOperands();
- bool isTwoAddr = NumOps > 1 &&
- MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
-
- // Folding a memory location into the two-address part of a two-address
- // instruction is different than folding it other places. It requires
- // replacing the *two* registers with the memory location.
- const DenseMap<unsigned,
- std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
- if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
- OpcodeTablePtr = &RegOp2MemOpTable2Addr;
- } else if (OpNum == 0) {
- if (Opc == X86::MOV32r0)
- return true;
-
- OpcodeTablePtr = &RegOp2MemOpTable0;
- } else if (OpNum == 1) {
- OpcodeTablePtr = &RegOp2MemOpTable1;
- } else if (OpNum == 2) {
- OpcodeTablePtr = &RegOp2MemOpTable2;
- } else if (OpNum == 3) {
- OpcodeTablePtr = &RegOp2MemOpTable3;
- }
-
- if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
- return true;
- return TargetInstrInfo::canFoldMemoryOperand(MI, Ops);
-}
-
bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
const MCInstrDesc &MCID = get(Opc);
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ // TODO: Check if 32-byte or greater accesses are slow too?
if (!MI->hasOneMemOperand() &&
RC == &X86::VR128RegClass &&
- !Subtarget.isUnalignedMemAccessFast())
+ Subtarget.isUnalignedMem16Slow())
// Without memoperands, loadRegFromAddr and storeRegToStackSlot will
// conservatively assume the address is unaligned. That's bad for
// performance.
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
- !Subtarget.isUnalignedMemAccessFast())
+ Subtarget.isUnalignedMem16Slow())
// Do not introduce a slow unaligned load.
return false;
+ // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+ // memory access is slow above.
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
- !Subtarget.isUnalignedMemAccessFast())
+ Subtarget.isUnalignedMem16Slow())
// Do not introduce a slow unaligned store.
return false;
+ // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
+ // memory access is slow above.
unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
{ X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
{ X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
{ X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
+ { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
{ X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
{ X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
{ X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
{ X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
{ X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
{ X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
+ { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
{ X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
{ X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
{ X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
// domains, but they require a bit more work than just switching opcodes.
static const uint16_t *lookup(unsigned opcode, unsigned domain) {
- for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
- if (ReplaceableInstrs[i][domain-1] == opcode)
- return ReplaceableInstrs[i];
+ for (const uint16_t (&Row)[3] : ReplaceableInstrs)
+ if (Row[domain-1] == opcode)
+ return Row;
return nullptr;
}
static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
- for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
- if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
- return ReplaceableInstrsAVX2[i];
+ for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2)
+ if (Row[domain-1] == opcode)
+ return Row;
return nullptr;
}
void X86InstrInfo::getUnconditionalBranch(
MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
Branch.setOpcode(X86::JMP_1);
- Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
+ Branch.addOperand(MCOperand::createExpr(BranchTarget));
}
// This code must remain in sync with getJumpInstrTableEntryBound in this class!
}
bool X86InstrInfo::
-hasHighOperandLatency(const InstrItineraryData *ItinData,
+hasHighOperandLatency(const TargetSchedModel &SchedModel,
const MachineRegisterInfo *MRI,
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI, unsigned UseIdx) const {
return isHighLatencyDef(DefMI->getOpcode());
}
+bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) const {
+ assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
+ "Reassociation needs binary operators");
+
+ // Integer binary math/logic instructions have a third source operand:
+ // the EFLAGS register. That operand must be both defined here and never
+ // used; ie, it must be dead. If the EFLAGS operand is live, then we can
+ // not change anything because rearranging the operands could affect other
+ // instructions that depend on the exact status flags (zero, sign, etc.)
+ // that are set by using these particular operands with this operation.
+ if (Inst.getNumOperands() == 4) {
+ assert(Inst.getOperand(3).isReg() &&
+ Inst.getOperand(3).getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+ if (!Inst.getOperand(3).isDead())
+ return false;
+ }
+
+ return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
+}
+
+// TODO: There are many more machine instruction opcodes to match:
+// 1. Other data types (integer, vectors)
+// 2. Other math / logic operations (xor, or)
+// 3. Other forms of the same operation (intrinsics and other variants)
+bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
+ switch (Inst.getOpcode()) {
+ case X86::AND8rr:
+ case X86::AND16rr:
+ case X86::AND32rr:
+ case X86::AND64rr:
+ case X86::OR8rr:
+ case X86::OR16rr:
+ case X86::OR32rr:
+ case X86::OR64rr:
+ case X86::XOR8rr:
+ case X86::XOR16rr:
+ case X86::XOR32rr:
+ case X86::XOR64rr:
+ case X86::IMUL16rr:
+ case X86::IMUL32rr:
+ case X86::IMUL64rr:
+ case X86::PANDrr:
+ case X86::PORrr:
+ case X86::PXORrr:
+ case X86::VPANDrr:
+ case X86::VPANDYrr:
+ case X86::VPORrr:
+ case X86::VPORYrr:
+ case X86::VPXORrr:
+ case X86::VPXORYrr:
+ // Normal min/max instructions are not commutative because of NaN and signed
+ // zero semantics, but these are. Thus, there's no need to check for global
+ // relaxed math; the instructions themselves have the properties we need.
+ case X86::MAXCPDrr:
+ case X86::MAXCPSrr:
+ case X86::MAXCSDrr:
+ case X86::MAXCSSrr:
+ case X86::MINCPDrr:
+ case X86::MINCPSrr:
+ case X86::MINCSDrr:
+ case X86::MINCSSrr:
+ case X86::VMAXCPDrr:
+ case X86::VMAXCPSrr:
+ case X86::VMAXCPDYrr:
+ case X86::VMAXCPSYrr:
+ case X86::VMAXCSDrr:
+ case X86::VMAXCSSrr:
+ case X86::VMINCPDrr:
+ case X86::VMINCPSrr:
+ case X86::VMINCPDYrr:
+ case X86::VMINCPSYrr:
+ case X86::VMINCSDrr:
+ case X86::VMINCSSrr:
+ return true;
+ case X86::ADDPDrr:
+ case X86::ADDPSrr:
+ case X86::ADDSDrr:
+ case X86::ADDSSrr:
+ case X86::MULPDrr:
+ case X86::MULPSrr:
+ case X86::MULSDrr:
+ case X86::MULSSrr:
+ case X86::VADDPDrr:
+ case X86::VADDPSrr:
+ case X86::VADDPDYrr:
+ case X86::VADDPSYrr:
+ case X86::VADDSDrr:
+ case X86::VADDSSrr:
+ case X86::VMULPDrr:
+ case X86::VMULPSrr:
+ case X86::VMULPDYrr:
+ case X86::VMULPSYrr:
+ case X86::VMULSDrr:
+ case X86::VMULSSrr:
+ return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+ default:
+ return false;
+ }
+}
+
+/// This is an architecture-specific helper function of reassociateOps.
+/// Set special operand attributes for new instructions after reassociation.
+void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
+ MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const {
+ // Integer instructions define an implicit EFLAGS source register operand as
+ // the third source (fourth total) operand.
+ if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4)
+ return;
+
+ assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
+ "Unexpected instruction type for reassociation");
+
+ MachineOperand &OldOp1 = OldMI1.getOperand(3);
+ MachineOperand &OldOp2 = OldMI2.getOperand(3);
+ MachineOperand &NewOp1 = NewMI1.getOperand(3);
+ MachineOperand &NewOp2 = NewMI2.getOperand(3);
+
+ assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() &&
+ "Must have dead EFLAGS operand in reassociable instruction");
+ assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() &&
+ "Must have dead EFLAGS operand in reassociable instruction");
+
+ (void)OldOp1;
+ (void)OldOp2;
+
+ assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+ assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+
+ // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
+ // of this pass or other passes. The EFLAGS operands must be dead in these new
+ // instructions because the EFLAGS operands in the original instructions must
+ // be dead in order for reassociation to occur.
+ NewOp1.setIsDead();
+ NewOp2.setIsDead();
+}
+
+std::pair<unsigned, unsigned>
+X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace X86II;
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
+ {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
+ {MO_GOT, "x86-got"},
+ {MO_GOTOFF, "x86-gotoff"},
+ {MO_GOTPCREL, "x86-gotpcrel"},
+ {MO_PLT, "x86-plt"},
+ {MO_TLSGD, "x86-tlsgd"},
+ {MO_TLSLD, "x86-tlsld"},
+ {MO_TLSLDM, "x86-tlsldm"},
+ {MO_GOTTPOFF, "x86-gottpoff"},
+ {MO_INDNTPOFF, "x86-indntpoff"},
+ {MO_TPOFF, "x86-tpoff"},
+ {MO_DTPOFF, "x86-dtpoff"},
+ {MO_NTPOFF, "x86-ntpoff"},
+ {MO_GOTNTPOFF, "x86-gotntpoff"},
+ {MO_DLLIMPORT, "x86-dllimport"},
+ {MO_DARWIN_STUB, "x86-darwin-stub"},
+ {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
+ {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
+ {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"},
+ {MO_TLVP, "x86-tlvp"},
+ {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
+ {MO_SECREL, "x86-secrel"}};
+ return makeArrayRef(TargetFlags);
+}
+
namespace {
/// Create Global Base Reg pass. This initializes the PIC
/// global base register for x86-32.