{ X86::XOR8rr, X86::XOR8mr, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2Addr); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable2Addr[i].RegOp;
- unsigned MemOp = MemoryFoldTable2Addr[i].MemOp;
- unsigned Flags = MemoryFoldTable2Addr[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 0, folded load and store, no alignment requirement.
- Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+ Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
}
static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
{ X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
{ X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
+ { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
+ { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
+ { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
{ X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
{ X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
{ X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
{ X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable0); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable0[i].RegOp;
- unsigned MemOp = MemoryFoldTable0[i].MemOp;
- unsigned Flags = MemoryFoldTable0[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
- RegOp, MemOp, TB_INDEX_0 | Flags);
+ Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
}
static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable1); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable1[i].RegOp;
- unsigned MemOp = MemoryFoldTable1[i].MemOp;
- unsigned Flags = MemoryFoldTable1[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 1, folded load
- Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
- // FIXME: We should not be folding Fs* scalar loads into vector
- // instructions because the vector instructions require vector-sized
- // loads. Lowering should create vector-sized instructions (the Fv*
- // variants below) to allow load folding.
- { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
- { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
- { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
- { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 },
- { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 },
- { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
- { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
- { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
+ // Do not fold Fs* scalar logical op loads because there are no scalar
+ // load variants for these instructions. When folded, the load is required
+ // to be 128-bits, so the load size would not match.
{ X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 },
{ X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 },
{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
{ X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::SBB32rr, X86::SBB32rm, 0 },
{ X86::SBB64rr, X86::SBB64rm, 0 },
{ X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
{ X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
{ X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VROUNDSDr, X86::VROUNDSDm, 0 },
+ { X86::VROUNDSSr, X86::VROUNDSSm, 0 },
{ X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
{ X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
{ X86::VPXORYrr, X86::VPXORYrm, 0 },
// FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 },
- { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 },
- { X86::VFMADDPS4rr, X86::VFMADDPS4mr, 0 },
- { X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 },
- { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, 0 },
- { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, 0 },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, 0 },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 },
- { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, 0 },
- { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, 0 },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, 0 },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, 0 },
- { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, 0 },
- { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, 0 },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, 0 },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, 0 },
- { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, 0 },
- { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, 0 },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, 0 },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, 0 },
- { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, 0 },
- { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, 0 },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, 0 },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, 0 },
- { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, 0 },
- { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, 0 },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrr, X86::VPCMOVmr, 0 },
{ X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable2); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable2[i].RegOp;
- unsigned MemOp = MemoryFoldTable2[i].MemOp;
- unsigned Flags = MemoryFoldTable2[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 2, folded load
- Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE },
// FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 },
- { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 },
- { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 },
- { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 },
- { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 },
- { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 },
- { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 },
- { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 },
- { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 },
- { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 },
- { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 },
- { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 },
- { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 },
- { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 },
- { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 },
- { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 },
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_NONE },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrr, X86::VPCMOVrm, 0 },
{ X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable3); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable3[i].RegOp;
- unsigned MemOp = MemoryFoldTable3[i].MemOp;
- unsigned Flags = MemoryFoldTable3[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 3, folded load
- Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }
};
- for (unsigned i = 0, e = array_lengthof(MemoryFoldTable4); i != e; ++i) {
- unsigned RegOp = MemoryFoldTable4[i].RegOp;
- unsigned MemOp = MemoryFoldTable4[i].MemOp;
- unsigned Flags = MemoryFoldTable4[i].Flags;
+ for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
- RegOp, MemOp,
+ Entry.RegOp, Entry.MemOp,
// Index 4, folded load
- Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
+ Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
}
}
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
// Left shift instructions can be transformed into load-effective-address
// instructions if we can encode them appropriately.
- // A LEA instruction utilizes a SIB byte to encode it's scale factor.
+ // A LEA instruction utilizes a SIB byte to encode its scale factor.
// The SIB.scale field is two bits wide which means that we can encode any
// shift amount less than 4.
return ShAmt < 4 && ShAmt > 0;
return !isPredicated(MI);
}
-bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
- MachineBasicBlock *&TBB,
- MachineBasicBlock *&FBB,
- SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const {
+bool X86InstrInfo::AnalyzeBranchImpl(
+ MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
+
// Start from the bottom of the block and work up, examining the
// terminator instructions.
MachineBasicBlock::iterator I = MBB.end();
FBB = TBB;
TBB = I->getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ CondBranches.push_back(I);
continue;
}
// Update the MachineOperand.
Cond[0].setImm(BranchCode);
+ CondBranches.push_back(I);
}
return false;
}
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ SmallVector<MachineInstr *, 4> CondBranches;
+ return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
+}
+
+bool X86InstrInfo::AnalyzeBranchPredicate(MachineBasicBlock &MBB,
+ MachineBranchPredicate &MBP,
+ bool AllowModify) const {
+ using namespace std::placeholders;
+
+ SmallVector<MachineOperand, 4> Cond;
+ SmallVector<MachineInstr *, 4> CondBranches;
+ if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
+ AllowModify))
+ return true;
+
+ if (Cond.size() != 1)
+ return true;
+
+ assert(MBP.TrueDest && "expected!");
+
+ if (!MBP.FalseDest)
+ MBP.FalseDest = MBB.getNextNode();
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ MachineInstr *ConditionDef = nullptr;
+ bool SingleUseCondition = true;
+
+ for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
+ if (I->modifiesRegister(X86::EFLAGS, TRI)) {
+ ConditionDef = &*I;
+ break;
+ }
+
+ if (I->readsRegister(X86::EFLAGS, TRI))
+ SingleUseCondition = false;
+ }
+
+ if (!ConditionDef)
+ return true;
+
+ if (SingleUseCondition) {
+ for (auto *Succ : MBB.successors())
+ if (Succ->isLiveIn(X86::EFLAGS))
+ SingleUseCondition = false;
+ }
+
+ MBP.ConditionDef = ConditionDef;
+ MBP.SingleUseCondition = SingleUseCondition;
+
+ // Currently we only recognize the simple pattern:
+ //
+ // test %reg, %reg
+ // je %label
+ //
+ const unsigned TestOpcode =
+ Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
+
+ if (ConditionDef->getOpcode() == TestOpcode &&
+ ConditionDef->getNumOperands() == 3 &&
+ ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
+ (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
+ MBP.LHS = ConditionDef->getOperand(0);
+ MBP.RHS = MachineOperand::CreateImm(0);
+ MBP.Predicate = Cond[0].getImm() == X86::COND_NE
+ ? MachineBranchPredicate::PRED_NE
+ : MachineBranchPredicate::PRED_EQ;
+ return false;
+ }
+
+ return true;
+}
+
unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator I = MBB.end();
unsigned Count = 0;
X86::MOVPQIto64rr);
if (X86::VR64RegClass.contains(SrcReg))
// Copy from a VR64 register to a GR64 register.
- return X86::MOVSDto64rr;
+ return X86::MMX_MOVD64from64rr;
} else if (X86::GR64RegClass.contains(SrcReg)) {
// Copy from a GR64 register to a VR128 register.
if (X86::VR128XRegClass.contains(DestReg))
X86::MOV64toPQIrr);
// Copy from a GR64 register to a VR64 register.
if (X86::VR64RegClass.contains(DestReg))
- return X86::MOV64toSDrr;
+ return X86::MMX_MOVD64to64rr;
}
// SrcReg(FR32) -> DestReg(GR32)
return;
}
- // Moving EFLAGS to / from another register requires a push and a pop.
- // Notice that we have to adjust the stack if we don't want to clobber the
- // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
- if (SrcReg == X86::EFLAGS) {
- if (X86::GR64RegClass.contains(DestReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSHF64));
- BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
- return;
- }
- if (X86::GR32RegClass.contains(DestReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSHF32));
- BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
- return;
- }
- }
- if (DestReg == X86::EFLAGS) {
- if (X86::GR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSH64r))
- .addReg(SrcReg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(X86::POPF64));
- return;
+ bool FromEFLAGS = SrcReg == X86::EFLAGS;
+ bool ToEFLAGS = DestReg == X86::EFLAGS;
+ int Reg = FromEFLAGS ? DestReg : SrcReg;
+ bool is32 = X86::GR32RegClass.contains(Reg);
+ bool is64 = X86::GR64RegClass.contains(Reg);
+ if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+ // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+ // inefficient. Instead:
+ // - Save the overflow flag OF into AL using SETO, and restore it using a
+ // signed 8-bit addition of AL and INT8_MAX.
+ // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+ // using LAHF/SAHF.
+ // - When RAX/EAX is live and isn't the destination register, make sure it
+ // isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+ // the flags.
+ // This approach is ~2.25x faster than using PUSHF/POPF.
+ //
+ // This is still somewhat inefficient because we don't know which flags are
+ // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+ // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
+ //
+ // PUSHF/POPF is also potentially incorrect because it affects other flags
+ // such as TF/IF/DF, which LLVM doesn't model.
+ //
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
+
+ int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+ int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+ int Pop = is64 ? X86::POP64r : X86::POP32r;
+ int AX = is64 ? X86::RAX : X86::EAX;
+
+ bool AXDead = (Reg == AX) ||
+ (MachineBasicBlock::LQR_Dead ==
+ MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI));
+
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+ if (FromEFLAGS) {
+ BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+ BuildMI(MBB, MI, DL, get(X86::LAHF));
+ BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
}
- if (X86::GR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSH32r))
- .addReg(SrcReg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(X86::POPF32));
- return;
+ if (ToEFLAGS) {
+ BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+ .addReg(X86::AL)
+ .addImm(INT8_MAX);
+ BuildMI(MBB, MI, DL, get(X86::SAHF));
}
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Pop), AX);
+ return;
}
DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
- MachineMemOperand *MMO = MBB.getParent()->
- getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8);
+ MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8);
MachineBasicBlock::iterator I = MIB.getInstr();
BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
bool isCallRegIndirect = Subtarget.callRegIndirect();
bool isTwoAddrFold = false;
- // For CPUs that favor the register form of a call,
- // do not fold loads into calls.
- if (isCallRegIndirect &&
- (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r))
+ // For CPUs that favor the register form of a call or push,
+ // do not fold loads into calls or pushes, unless optimizing for size
+ // aggressively.
+ if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
+ (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r ||
+ MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r ||
+ MI->getOpcode() == X86::PUSH64r))
return nullptr;
unsigned NumOps = MI->getDesc().getNumOperands();
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
- if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
- hasPartialRegUpdate(MI->getOpcode()))
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
return nullptr;
const MachineFrameInfo *MFI = MF.getFrameInfo();
Size, Alignment, /*AllowCommute=*/true);
}
-static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
- const MachineFunction &MF) {
+/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version. For instance, this transformation isn't legal:
+/// movss (%rdi), %xmm0
+/// addps %xmm0, %xmm0
+/// ->
+/// addps (%rdi), %xmm0
+///
+/// But this one is:
+/// movss (%rdi), %xmm0
+/// addss %xmm0, %xmm0
+/// ->
+/// addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+ const MachineInstr &UserMI,
+ const MachineFunction &MF) {
unsigned Opc = LoadMI.getOpcode();
+ unsigned UserOpc = UserMI.getOpcode();
unsigned RegSize =
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
- if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+ if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
// These instructions only load 32 bits, we can't fold them if the
- // destination register is wider than 32 bits (4 bytes).
- return true;
+ // destination register is wider than 32 bits (4 bytes), and its user
+ // instruction isn't scalar (SS).
+ switch (UserOpc) {
+ case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
+ case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
+ case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
+ case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+ return false;
+ default:
+ return true;
+ }
+ }
- if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+ if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
// These instructions only load 64 bits, we can't fold them if the
- // destination register is wider than 64 bits (8 bytes).
- return true;
+ // destination register is wider than 64 bits (8 bytes), and its user
+ // instruction isn't scalar (SD).
+ switch (UserOpc) {
+ case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
+ case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
+ case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
+ case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+ return false;
+ default:
+ return true;
+ }
+ }
return false;
}
unsigned NumOps = LoadMI->getDesc().getNumOperands();
int FrameIndex;
if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
- if (isPartialRegisterLoad(*LoadMI, MF))
+ if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
return nullptr;
return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
}
// Check switch flag
if (NoFusing) return nullptr;
- // Unless optimizing for size, don't fold to avoid partial
- // register update stalls
- if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
- hasPartialRegUpdate(MI->getOpcode()))
+ // Avoid partial register update stalls unless optimizing for size.
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
return nullptr;
// Determine the alignment of the load.
break;
}
default: {
- if (isPartialRegisterLoad(*LoadMI, MF))
+ if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
return nullptr;
// Folding a normal load. Just copy the load's address operands.
/*Size=*/0, Alignment, /*AllowCommute=*/true);
}
-bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
- ArrayRef<unsigned> Ops) const {
- // Check switch flag
- if (NoFusing) return 0;
-
- if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
- switch (MI->getOpcode()) {
- default: return false;
- case X86::TEST8rr:
- case X86::TEST16rr:
- case X86::TEST32rr:
- case X86::TEST64rr:
- return true;
- case X86::ADD32ri:
- // FIXME: AsmPrinter doesn't know how to handle
- // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
- if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
- return false;
- break;
- }
- }
-
- if (Ops.size() != 1)
- return false;
-
- unsigned OpNum = Ops[0];
- unsigned Opc = MI->getOpcode();
- unsigned NumOps = MI->getDesc().getNumOperands();
- bool isTwoAddr = NumOps > 1 &&
- MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
-
- // Folding a memory location into the two-address part of a two-address
- // instruction is different than folding it other places. It requires
- // replacing the *two* registers with the memory location.
- const DenseMap<unsigned,
- std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
- if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
- OpcodeTablePtr = &RegOp2MemOpTable2Addr;
- } else if (OpNum == 0) {
- if (Opc == X86::MOV32r0)
- return true;
-
- OpcodeTablePtr = &RegOp2MemOpTable0;
- } else if (OpNum == 1) {
- OpcodeTablePtr = &RegOp2MemOpTable1;
- } else if (OpNum == 2) {
- OpcodeTablePtr = &RegOp2MemOpTable2;
- } else if (OpNum == 3) {
- OpcodeTablePtr = &RegOp2MemOpTable3;
- }
-
- if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
- return true;
- return TargetInstrInfo::canFoldMemoryOperand(MI, Ops);
-}
-
bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
return isHighLatencyDef(DefMI->getOpcode());
}
-/// If the input instruction is part of a chain of dependent ops that are
-/// suitable for reassociation, return the earlier instruction in the sequence
-/// that defines its first operand, otherwise return a nullptr.
-/// If the instruction's operands must be commuted to be considered a
-/// reassociation candidate, Commuted will be set to true.
-static MachineInstr *isReassocCandidate(const MachineInstr &Inst,
- unsigned AssocOpcode,
- bool checkPrevOneUse,
- bool &Commuted) {
- if (Inst.getOpcode() != AssocOpcode)
- return nullptr;
-
- MachineOperand Op1 = Inst.getOperand(1);
- MachineOperand Op2 = Inst.getOperand(2);
-
- const MachineBasicBlock *MBB = Inst.getParent();
+static bool hasReassociableOperands(const MachineInstr &Inst,
+ const MachineBasicBlock *MBB) {
+ assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
+ "Reassociation needs binary operators");
+ const MachineOperand &Op1 = Inst.getOperand(1);
+ const MachineOperand &Op2 = Inst.getOperand(2);
const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- // We need virtual register definitions.
+ // Integer binary math/logic instructions have a third source operand:
+ // the EFLAGS register. That operand must be both defined here and never
+ // used; ie, it must be dead. If the EFLAGS operand is live, then we can
+ // not change anything because rearranging the operands could affect other
+ // instructions that depend on the exact status flags (zero, sign, etc.)
+ // that are set by using these particular operands with this operation.
+ if (Inst.getNumOperands() == 4) {
+ assert(Inst.getOperand(3).isReg() &&
+ Inst.getOperand(3).getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+ if (!Inst.getOperand(3).isDead())
+ return false;
+ }
+
+ // We need virtual register definitions for the operands that we will
+ // reassociate.
MachineInstr *MI1 = nullptr;
MachineInstr *MI2 = nullptr;
if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg()))
MI1 = MRI.getUniqueVRegDef(Op1.getReg());
if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
MI2 = MRI.getUniqueVRegDef(Op2.getReg());
-
+
// And they need to be in the trace (otherwise, they won't have a depth).
- if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB)
- return nullptr;
-
- Commuted = false;
- if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) {
+ if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB)
+ return true;
+
+ return false;
+}
+
+static bool hasReassociableSibling(const MachineInstr &Inst, bool &Commuted) {
+ const MachineBasicBlock *MBB = Inst.getParent();
+ const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg());
+ MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
+ unsigned AssocOpcode = Inst.getOpcode();
+
+ // If only one operand has the same opcode and it's the second source operand,
+ // the operands must be commuted.
+ Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode;
+ if (Commuted)
std::swap(MI1, MI2);
- Commuted = true;
- }
- // Avoid reassociating operands when it won't provide any benefit. If both
- // operands are produced by instructions of this type, we may already
- // have the optimal sequence.
- if (MI2->getOpcode() == AssocOpcode)
- return nullptr;
-
- // The instruction must only be used by the other instruction that we
- // reassociate with.
- if (checkPrevOneUse && !MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
- return nullptr;
-
- // We must match a simple chain of dependent ops.
- // TODO: This check is not necessary for the earliest instruction in the
- // sequence. Instead of a sequence of 3 dependent instructions with the same
- // opcode, we only need to find a sequence of 2 dependent instructions with
- // the same opcode plus 1 other instruction that adds to the height of the
- // trace.
- if (MI1->getOpcode() != AssocOpcode)
- return nullptr;
-
- return MI1;
+ // 1. The previous instruction must be the same type as Inst.
+ // 2. The previous instruction must have virtual register definitions for its
+ // operands in the same basic block as Inst.
+ // 3. The previous instruction's result must only be used by Inst.
+ if (MI1->getOpcode() == AssocOpcode &&
+ hasReassociableOperands(*MI1, MBB) &&
+ MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
+ return true;
+
+ return false;
}
-/// Select a pattern based on how the operands of each associative operation
-/// need to be commuted.
-static MachineCombinerPattern::MC_PATTERN getPattern(bool CommutePrev,
- bool CommuteRoot) {
- if (CommutePrev) {
- if (CommuteRoot)
- return MachineCombinerPattern::MC_REASSOC_XA_YB;
- return MachineCombinerPattern::MC_REASSOC_XA_BY;
- } else {
- if (CommuteRoot)
- return MachineCombinerPattern::MC_REASSOC_AX_YB;
- return MachineCombinerPattern::MC_REASSOC_AX_BY;
+// TODO: There are many more machine instruction opcodes to match:
+// 1. Other data types (integer, vectors)
+// 2. Other math / logic operations (and, or)
+// 3. Other forms of the same operation (intrinsics and other variants)
+static bool isAssociativeAndCommutative(const MachineInstr &Inst) {
+ switch (Inst.getOpcode()) {
+ case X86::IMUL16rr:
+ case X86::IMUL32rr:
+ case X86::IMUL64rr:
+ // Normal min/max instructions are not commutative because of NaN and signed
+ // zero semantics, but these are. Thus, there's no need to check for global
+ // relaxed math; the instructions themselves have the properties we need.
+ case X86::MINCSSrr:
+ case X86::VMINCSSrr:
+ return true;
+ case X86::ADDPDrr:
+ case X86::ADDPSrr:
+ case X86::ADDSDrr:
+ case X86::ADDSSrr:
+ case X86::MULPDrr:
+ case X86::MULPSrr:
+ case X86::MULSDrr:
+ case X86::MULSSrr:
+ case X86::VADDPDrr:
+ case X86::VADDPSrr:
+ case X86::VADDPDYrr:
+ case X86::VADDPSYrr:
+ case X86::VADDSDrr:
+ case X86::VADDSSrr:
+ case X86::VMULPDrr:
+ case X86::VMULPSrr:
+ case X86::VMULPDYrr:
+ case X86::VMULPSYrr:
+ case X86::VMULSDrr:
+ case X86::VMULSSrr:
+ return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+ default:
+ return false;
}
}
-bool X86InstrInfo::hasPattern(MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
- if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
- return false;
+/// Return true if the input instruction is part of a chain of dependent ops
+/// that are suitable for reassociation, otherwise return false.
+/// If the instruction's operands must be commuted to have a previous
+/// instruction of the same type define the first source operand, Commuted will
+/// be set to true.
+static bool isReassociationCandidate(const MachineInstr &Inst, bool &Commuted) {
+ // 1. The operation must be associative and commutative.
+ // 2. The instruction must have virtual register definitions for its
+ // operands in the same basic block.
+ // 3. The instruction must have a reassociable sibling.
+ if (isAssociativeAndCommutative(Inst) &&
+ hasReassociableOperands(Inst, Inst.getParent()) &&
+ hasReassociableSibling(Inst, Commuted))
+ return true;
- // TODO: There are many more associative instruction types to match:
- // 1. Other forms of scalar FP add (non-AVX)
- // 2. Other data types (double, integer, vectors)
- // 3. Other math / logic operations (mul, and, or)
- unsigned AssocOpcode = X86::VADDSSrr;
+ return false;
+}
+// FIXME: This has the potential to be expensive (compile time) while not
+// improving the code at all. Some ways to limit the overhead:
+// 1. Track successful transforms; bail out if hit rate gets too low.
+// 2. Only enable at -O3 or some other non-default optimization level.
+// 3. Pre-screen pattern candidates here: if an operand of the previous
+// instruction is known to not increase the critical path, then don't match
+// that pattern.
+bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const {
// TODO: There is nothing x86-specific here except the instruction type.
// This logic could be hoisted into the machine combiner pass itself.
- bool CommuteRoot;
- if (MachineInstr *Prev = isReassocCandidate(Root, AssocOpcode, true,
- CommuteRoot)) {
- bool CommutePrev;
- if (isReassocCandidate(*Prev, AssocOpcode, false, CommutePrev)) {
- // We found a sequence of instructions that may be suitable for a
- // reassociation of operands to increase ILP.
- Pattern.push_back(getPattern(CommutePrev, CommuteRoot));
- return true;
+
+ // Look for this reassociation pattern:
+ // B = A op X (Prev)
+ // C = B op Y (Root)
+
+ bool Commute;
+ if (isReassociationCandidate(Root, Commute)) {
+ // We found a sequence of instructions that may be suitable for a
+ // reassociation of operands to increase ILP. Specify each commutation
+ // possibility for the Prev instruction in the sequence and let the
+ // machine combiner decide if changing the operands is worthwhile.
+ if (Commute) {
+ Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB);
+ Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB);
+ } else {
+ Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY);
+ Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY);
}
+ return true;
}
-
+
return false;
}
+/// This is an architecture-specific helper function of reassociateOps.
+/// Set special operand attributes for new instructions after reassociation.
+static void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+ MachineInstr &NewMI1, MachineInstr &NewMI2) {
+ // Integer instructions define an implicit EFLAGS source register operand as
+ // the third source (fourth total) operand.
+ if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4)
+ return;
+
+ assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
+ "Unexpected instruction type for reassociation");
+
+ MachineOperand &OldOp1 = OldMI1.getOperand(3);
+ MachineOperand &OldOp2 = OldMI2.getOperand(3);
+ MachineOperand &NewOp1 = NewMI1.getOperand(3);
+ MachineOperand &NewOp2 = NewMI2.getOperand(3);
+
+ assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() &&
+ "Must have dead EFLAGS operand in reassociable instruction");
+ assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() &&
+ "Must have dead EFLAGS operand in reassociable instruction");
+
+ (void)OldOp1;
+ (void)OldOp2;
+
+ assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+ assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS &&
+ "Unexpected operand in reassociable instruction");
+
+ // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
+ // of this pass or other passes. The EFLAGS operands must be dead in these new
+ // instructions because the EFLAGS operands in the original instructions must
+ // be dead in order for reassociation to occur.
+ NewOp1.setIsDead();
+ NewOp2.setIsDead();
+}
+
/// Attempt the following reassociation to reduce critical path length:
/// B = A op X (Prev)
/// C = B op Y (Root)
MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]);
MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]);
MachineOperand &OpC = Root.getOperand(0);
-
+
unsigned RegA = OpA.getReg();
unsigned RegB = OpB.getReg();
unsigned RegX = OpX.getReg();
BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR)
.addReg(RegX, getKillRegState(KillX))
.addReg(RegY, getKillRegState(KillY));
- InsInstrs.push_back(MIB1);
-
MachineInstrBuilder MIB2 =
BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC)
.addReg(RegA, getKillRegState(KillA))
.addReg(NewVR, getKillRegState(true));
- InsInstrs.push_back(MIB2);
- // Record old instructions for deletion.
+ setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2);
+
+ // Record new instructions for insertion and old instructions for deletion.
+ InsInstrs.push_back(MIB1);
+ InsInstrs.push_back(MIB2);
DelInstrs.push_back(&Prev);
DelInstrs.push_back(&Root);
}
// Select the previous instruction in the sequence based on the input pattern.
MachineInstr *Prev = nullptr;
- if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_BY ||
- Pattern == MachineCombinerPattern::MC_REASSOC_XA_BY)
- Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
- else if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_YB ||
- Pattern == MachineCombinerPattern::MC_REASSOC_XA_YB)
- Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
- else
- llvm_unreachable("Unknown pattern for machine combiner");
-
+ switch (Pattern) {
+ case MachineCombinerPattern::MC_REASSOC_AX_BY:
+ case MachineCombinerPattern::MC_REASSOC_XA_BY:
+ Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ break;
+ case MachineCombinerPattern::MC_REASSOC_AX_YB:
+ case MachineCombinerPattern::MC_REASSOC_XA_YB:
+ Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
+ }
+ assert(Prev && "Unknown pattern for machine combiner");
+
reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
return;
}
+std::pair<unsigned, unsigned>
+X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF, 0u);
+}
+
+ArrayRef<std::pair<unsigned, const char *>>
+X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace X86II;
+ static std::pair<unsigned, const char *> TargetFlags[] = {
+ {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
+ {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
+ {MO_GOT, "x86-got"},
+ {MO_GOTOFF, "x86-gotoff"},
+ {MO_GOTPCREL, "x86-gotpcrel"},
+ {MO_PLT, "x86-plt"},
+ {MO_TLSGD, "x86-tlsgd"},
+ {MO_TLSLD, "x86-tlsld"},
+ {MO_TLSLDM, "x86-tlsldm"},
+ {MO_GOTTPOFF, "x86-gottpoff"},
+ {MO_INDNTPOFF, "x86-indntpoff"},
+ {MO_TPOFF, "x86-tpoff"},
+ {MO_DTPOFF, "x86-dtpoff"},
+ {MO_NTPOFF, "x86-ntpoff"},
+ {MO_GOTNTPOFF, "x86-gotntpoff"},
+ {MO_DLLIMPORT, "x86-dllimport"},
+ {MO_DARWIN_STUB, "x86-darwin-stub"},
+ {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
+ {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
+ {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"},
+ {MO_TLVP, "x86-tlvp"},
+ {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
+ {MO_SECREL, "x86-secrel"}};
+ return makeArrayRef(TargetFlags);
+}
+
namespace {
/// Create Global Base Reg pass. This initializes the PIC
/// global base register for x86-32.