inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
// Left shift instructions can be transformed into load-effective-address
// instructions if we can encode them appropriately.
- // A LEA instruction utilizes a SIB byte to encode it's scale factor.
+ // A LEA instruction utilizes a SIB byte to encode its scale factor.
// The SIB.scale field is two bits wide which means that we can encode any
// shift amount less than 4.
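+ // For example, a left shift by 3 becomes 'leal (,%reg,8), %dst', since the
+ // scale of 8 (1 << 3) still fits in the two-bit SIB.scale field.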
return ShAmt < 4 && ShAmt > 0;
}
- // Moving EFLAGS to / from another register requires a push and a pop.
- // Notice that we have to adjust the stack if we don't want to clobber the
- // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
- if (SrcReg == X86::EFLAGS) {
- if (X86::GR64RegClass.contains(DestReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSHF64));
- BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
- return;
+ bool FromEFLAGS = SrcReg == X86::EFLAGS;
+ bool ToEFLAGS = DestReg == X86::EFLAGS;
+ int Reg = FromEFLAGS ? DestReg : SrcReg;
+ bool is32 = X86::GR32RegClass.contains(Reg);
+ bool is64 = X86::GR64RegClass.contains(Reg);
+ if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+ // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+ // inefficient. Instead:
+ // - Save the overflow flag OF into AL using SETO, and restore it using a
+ // signed 8-bit addition of AL and INT8_MAX.
+ // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+ // using LAHF/SAHF.
+ // - When RAX/EAX is live and isn't the destination register, make sure it
+ // isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+ // the flags.
+ // This approach is ~2.25x faster than using PUSHF/POPF.
+ //
+ // This is still somewhat inefficient because we don't know which flags are
+ // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+ // SETO+LAHF / ADDB+SAHF, the code could be 1.02x faster.
+ //
+ // PUSHF/POPF is also potentially incorrect because it affects other flags
+ // such as TF/IF/DF, which LLVM doesn't model.
+ //
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
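+ //
+ // For a 64-bit register the emitted sequence is roughly:
+ //   EFLAGS -> GR64: [push %rax]  seto %al; lahf; movq %rax, %dst  [pop %rax]
+ //   GR64 -> EFLAGS: [push %rax]  movq %src, %rax; addb $0x7f, %al; sahf  [pop %rax]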
+
+ int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+ int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+ int Pop = is64 ? X86::POP64r : X86::POP32r;
+ int AX = is64 ? X86::RAX : X86::EAX;
+
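+ // AX can be clobbered freely when it is the register being copied, or when
+ // the liveness query proves it dead at this point (LQR_Dead); otherwise it
+ // must be preserved around the flag sequence below.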
+ bool AXDead = (Reg == AX) ||
+ (MachineBasicBlock::LQR_Dead ==
+ MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI));
+
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+ if (FromEFLAGS) {
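+ // SETO materializes OF into AL (0 or 1), and LAHF copies the low EFLAGS
+ // byte (SF, ZF, AF, PF, CF) into AH, so AX holds the flag state to copy out.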
+ BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+ BuildMI(MBB, MI, DL, get(X86::LAHF));
+ BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
}
- if (X86::GR32RegClass.contains(DestReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSHF32));
- BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
- return;
- }
- }
- if (DestReg == X86::EFLAGS) {
- if (X86::GR64RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSH64r))
- .addReg(SrcReg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(X86::POPF64));
- return;
- }
- if (X86::GR32RegClass.contains(SrcReg)) {
- BuildMI(MBB, MI, DL, get(X86::PUSH32r))
- .addReg(SrcReg, getKillRegState(KillSrc));
- BuildMI(MBB, MI, DL, get(X86::POPF32));
- return;
+ if (ToEFLAGS) {
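+ // Adding 0x7f to AL overflows exactly when AL == 1, re-setting OF to the
+ // value saved by SETO; SAHF then restores SF, ZF, AF, PF and CF from AH.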
+ BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+ BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+ .addReg(X86::AL)
+ .addImm(INT8_MAX);
+ BuildMI(MBB, MI, DL, get(X86::SAHF));
}
+ if (!AXDead)
+ BuildMI(MBB, MI, DL, get(Pop), AX);
+ return;
}
DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
- MachineMemOperand *MMO = MBB.getParent()->
- getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8);
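+ // The value being loaded is an 8-byte, 8-byte-aligned GOT entry.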
+ MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8);
MachineBasicBlock::iterator I = MIB.getInstr();
BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
// Check switch flag
if (NoFusing) return nullptr;
- // Unless optimizing for size, don't fold to avoid partial
- // register update stalls
- // FIXME: Use Function::optForSize().
- if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
- hasPartialRegUpdate(MI->getOpcode()))
+ // Avoid partial register update stalls unless optimizing for size.
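+ // (Instructions such as CVTSI2SS or SQRTSS write only part of their XMM
+ // destination, leaving a false dependency on the register's old value.)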
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode()))
return nullptr;
// Determine the alignment of the load.
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (and, or)
+// 3. Other forms of the same operation (intrinsics and other variants)
static bool isAssociativeAndCommutative(const MachineInstr &Inst) {
switch (Inst.getOpcode()) {
case X86::IMUL16rr:
case X86::IMUL32rr:
case X86::IMUL64rr:
+ // Normal min/max instructions are not commutative because of NaN and signed
+ // zero semantics, but these are. Thus, there's no need to check for global
+ // relaxed math; the instructions themselves have the properties we need.
+ case X86::MINCSSrr:
+ case X86::VMINCSSrr:
return true;
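+ // Reassociating FP adds and multiplies changes the rounding of intermediate
+ // results, e.g. (a + b) + c need not equal a + (b + c), so these are only
+ // treated as associative under unsafe FP math.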
case X86::ADDPDrr:
case X86::ADDPSrr:
case X86::ADDSDrr:
case X86::ADDSSrr:
+ case X86::MULPDrr:
+ case X86::MULPSrr:
+ case X86::MULSDrr:
+ case X86::MULSSrr:
case X86::VADDPDrr:
case X86::VADDPSrr:
+ case X86::VADDPDYrr:
+ case X86::VADDPSYrr:
case X86::VADDSDrr:
case X86::VADDSSrr:
- case X86::MULSDrr:
- case X86::MULSSrr:
+ case X86::VMULPDrr:
+ case X86::VMULPSrr:
+ case X86::VMULPDYrr:
+ case X86::VMULPSYrr:
case X86::VMULSDrr:
case X86::VMULSSrr:
return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;