DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false),
cl::desc("Disable advanced copy optimization"));
+static cl::opt<bool> DisableNAPhysCopyOpt(
+ "disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false),
+ cl::desc("Disable non-allocatable physical register copy optimization"));
+
// Limit the number of PHI instructions to process
// in PeepholeOptimizer::getNextSource.
static cl::opt<unsigned> RewritePHILimit(
STATISTIC(NumSelects, "Number of selects optimized");
STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized");
STATISTIC(NumRewrittenCopies, "Number of copies rewritten");
+STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed");
namespace {
class ValueTrackerResult;
DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
/// \brief If copy instruction \p MI is a virtual register copy, track it in
- /// the set \p CopiedFromRegs and \p CopyMIs. If this virtual register was
+ /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
/// previously seen as a copy, replace the uses of this copy with the
/// previously seen copy's destination register.
bool foldRedundantCopy(MachineInstr *MI,
- SmallSet<unsigned, 4> &CopiedFromRegs,
- DenseMap<unsigned, MachineInstr*> &CopyMIs);
+ SmallSet<unsigned, 4> &CopySrcRegs,
+ DenseMap<unsigned, MachineInstr *> &CopyMIs);
+
+ /// \brief Is the register \p Reg a non-allocatable physical register?
+ bool isNAPhysCopy(unsigned Reg);
+
+ /// \brief If copy instruction \p MI is a non-allocatable virtual<->physical
+ /// register copy, track it in the \p NAPhysToVirtMIs map. If this
+ /// non-allocatable physical register was previously copied to a virtual
+ /// register and hasn't been clobbered, the virt->phys copy can be
+ /// deleted.
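+ ///
+ /// An illustrative sketch (EFLAGS stands in for any non-allocatable
+ /// physical register; %vreg0 is hypothetical):
+ ///   %vreg0 = COPY %EFLAGS    (tracked in NAPhysToVirtMIs)
+ ///   ...                      (no redefinition of %EFLAGS)
+ ///   %EFLAGS = COPY %vreg0    (redundant; erased by this function)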
+ bool foldRedundantNAPhysCopy(
+ MachineInstr *MI,
+ DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs);
bool isLoadFoldable(MachineInstr *MI,
SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
if (ImmDefRegs.count(Reg) == 0)
continue;
DenseMap<unsigned, MachineInstr*>::iterator II = ImmDefMIs.find(Reg);
- assert(II != ImmDefMIs.end());
+ assert(II != ImmDefMIs.end() && "couldn't find immediate definition");
if (TII->FoldImmediate(MI, II->second, Reg, MRI)) {
++NumImmFold;
return true;
//
// Should replace %vreg2 uses with %vreg1:sub1
bool PeepholeOptimizer::foldRedundantCopy(
- MachineInstr *MI,
- SmallSet<unsigned, 4> &CopySrcRegs,
- DenseMap<unsigned, MachineInstr *> &CopyMIs) {
- assert(MI->isCopy());
+ MachineInstr *MI,
+ SmallSet<unsigned, 4> &CopySrcRegs,
+ DenseMap<unsigned, MachineInstr *> &CopyMIs) {
+ assert(MI->isCopy() && "expected a COPY machine instruction");
unsigned SrcReg = MI->getOperand(1).getReg();
if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
return true;
}
+bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) {
+ return TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ !MRI->isAllocatable(Reg);
+}
+
+bool PeepholeOptimizer::foldRedundantNAPhysCopy(
+ MachineInstr *MI, DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs) {
+ assert(MI->isCopy() && "expected a COPY machine instruction");
+
+ if (DisableNAPhysCopyOpt)
+ return false;
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ // %vreg = COPY %PHYSREG
+ // Avoid a data structure that tracks multiple live phys->virt copies of
+ // the same register, since LLVM doesn't seem to generate such code;
+ // insert() leaves any existing entry for SrcReg in place.
+ NAPhysToVirtMIs.insert({SrcReg, MI});
+ return false;
+ }
+
+ if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg)))
+ return false;
+
+ // %PHYSREG = COPY %vreg
+ auto PrevCopy = NAPhysToVirtMIs.find(DstReg);
+ if (PrevCopy == NAPhysToVirtMIs.end()) {
+ // We can't remove the copy: either we never saw a phys->virt copy of
+ // this register, or an intervening clobber of the non-allocatable
+ // physical register invalidated it.
+ DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << *MI
+ << '\n');
+ return false;
+ }
+
+ unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg();
+ if (PrevDstReg == SrcReg) {
+ // Remove the virt->phys copy: we saw the virtual register definition, and
+ // the non-allocatable physical register's state hasn't changed since then.
+ DEBUG(dbgs() << "NAPhysCopy: erasing " << *MI << '\n');
+ ++NumNAPhysCopies;
+ return true;
+ }
+
+ // Potential missed optimization opportunity: we saw a different virtual
+ // register get a copy of the non-allocatable physical register, and we
+ // only track one such copy. Avoid getting confused by this new
+ // definition of the physical register: stop tracking the previous copy.
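+ // For instance (illustrative vregs):
+ //   %vreg0 = COPY %PHYSREG    (tracked)
+ //   %vreg1 = COPY %vreg0      (same value, but not in the map)
+ //   %PHYSREG = COPY %vreg1    (cannot be proven redundant here)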
+ DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << *MI << '\n');
+ NAPhysToVirtMIs.erase(PrevCopy);
+ return false;
+}
+
bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (skipOptnoneFunction(*MF.getFunction()))
return false;
DenseMap<unsigned, MachineInstr*> ImmDefMIs;
SmallSet<unsigned, 16> FoldAsLoadDefCandidates;
+ // Track when a non-allocatable physical register is copied to a virtual
+ // register so that useless moves can be removed.
+ //
+ // %PHYSREG is the map index; MI is the last valid `%vreg = COPY %PHYSREG`
+ // without any intervening re-definition of %PHYSREG.
+ DenseMap<unsigned, MachineInstr *> NAPhysToVirtMIs;
+
// Set of virtual registers that are copied from.
SmallSet<unsigned, 4> CopySrcRegs;
DenseMap<unsigned, MachineInstr *> CopySrcMIs;
if (MI->isLoadFoldBarrier())
FoldAsLoadDefCandidates.clear();
- if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() ||
- MI->isKill() || MI->isInlineAsm() ||
- MI->hasUnmodeledSideEffects())
+ if (MI->isPosition() || MI->isPHI())
+ continue;
+
+ if (!MI->isCopy()) {
+ for (const auto &Op : MI->operands()) {
+ // Visit all operands: definitions can be implicit or explicit.
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ if (Op.isDef() && isNAPhysCopy(Reg)) {
+ const auto &Def = NAPhysToVirtMIs.find(Reg);
+ if (Def != NAPhysToVirtMIs.end()) {
+ // A new definition of the non-allocatable physical register
+ // invalidates previous copies.
+ DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
+ << '\n');
+ NAPhysToVirtMIs.erase(Def);
+ }
+ }
+ } else if (Op.isRegMask()) {
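+ // A regmask operand (e.g. on a call) lists the registers the
+ // instruction preserves; clobbersPhysReg() returns true for every
+ // other register, so an unpreserved tracked register loses its entry.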
+ const uint32_t *RegMask = Op.getRegMask();
+ for (auto &RegMI : NAPhysToVirtMIs) {
+ unsigned Def = RegMI.first;
+ if (MachineOperand::clobbersPhysReg(RegMask, Def)) {
+ DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
+ << '\n');
+ NAPhysToVirtMIs.erase(Def);
+ }
+ }
+ }
+ }
+ }
+
+ if (MI->isImplicitDef() || MI->isKill())
+ continue;
+
+ if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) {
+ // Blow away all knowledge of non-allocatable physical registers, since
+ // we don't know what's correct anymore.
+ //
+ // FIXME: handle explicit asm clobbers.
+ DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI
+ << '\n');
+ NAPhysToVirtMIs.clear();
continue;
+ }
if ((isUncoalescableCopy(*MI) &&
optimizeUncoalescableCopy(MI, LocalMIs)) ||
continue;
}
- if (MI->isCopy() && foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs)) {
+ if (MI->isCopy() &&
+ (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) ||
+ foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) {
LocalMIs.erase(MI);
MI->eraseFromParent();
Changed = true;
--- /dev/null
+; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+
+; The peephole optimizer can elide some physical register copies such as
+; EFLAGS. Make sure the flags are used directly instead of being needlessly
+; saved and restored with lahf/sahf or pushf/popf, when possible.
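+;
+; The sequence the CHECK-NOTs guard against: seto copies OF into AL and
+; lahf copies SF/ZF/AF/PF/CF into AH; on restore, addb $127, %al
+; recomputes OF (1 + 127 overflows a signed byte exactly when AL is 1)
+; and sahf reloads the remaining flags from AH. pushf/popf spill and
+; reload the whole flags register via the stack instead.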
+
+@L = external global i32
+@M = external global i8
+declare i32 @bar(i64)
+
+; CHECK-LABEL: plus_one
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: incl L
+define i1 @plus_one() {
+entry:
+ %loaded_L = load i32, i32* @L
+ %val = add nsw i32 %loaded_L, 1 ; N.B. will emit inc.
+ store i32 %val, i32* @L
+ %loaded_M = load i8, i8* @M
+ %masked = and i8 %loaded_M, 8
+ %M_is_true = icmp ne i8 %masked, 0
+ %L_is_false = icmp eq i32 %val, 0
+ %cond = and i1 %L_is_false, %M_is_true
+ br i1 %cond, label %exit2, label %exit
+
+exit:
+ ret i1 true
+
+exit2:
+ ret i1 false
+}
+
+; CHECK-LABEL: plus_forty_two
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: addl $42,
+define i1 @plus_forty_two() {
+entry:
+ %loaded_L = load i32, i32* @L
+ %val = add nsw i32 %loaded_L, 42 ; N.B. won't emit inc.
+ store i32 %val, i32* @L
+ %loaded_M = load i8, i8* @M
+ %masked = and i8 %loaded_M, 8
+ %M_is_true = icmp ne i8 %masked, 0
+ %L_is_false = icmp eq i32 %val, 0
+ %cond = and i1 %L_is_false, %M_is_true
+ br i1 %cond, label %exit2, label %exit
+
+exit:
+ ret i1 true
+
+exit2:
+ ret i1 false
+}
+
+; CHECK-LABEL: minus_one
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: decl L
+define i1 @minus_one() {
+entry:
+ %loaded_L = load i32, i32* @L
+ %val = add nsw i32 %loaded_L, -1 ; N.B. will emit dec.
+ store i32 %val, i32* @L
+ %loaded_M = load i8, i8* @M
+ %masked = and i8 %loaded_M, 8
+ %M_is_true = icmp ne i8 %masked, 0
+ %L_is_false = icmp eq i32 %val, 0
+ %cond = and i1 %L_is_false, %M_is_true
+ br i1 %cond, label %exit2, label %exit
+
+exit:
+ ret i1 true
+
+exit2:
+ ret i1 false
+}
+
+; CHECK-LABEL: minus_forty_two
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: addl $-42,
+define i1 @minus_forty_two() {
+entry:
+ %loaded_L = load i32, i32* @L
+ %val = add nsw i32 %loaded_L, -42 ; N.B. won't emit dec.
+ store i32 %val, i32* @L
+ %loaded_M = load i8, i8* @M
+ %masked = and i8 %loaded_M, 8
+ %M_is_true = icmp ne i8 %masked, 0
+ %L_is_false = icmp eq i32 %val, 0
+ %cond = and i1 %L_is_false, %M_is_true
+ br i1 %cond, label %exit2, label %exit
+
+exit:
+ ret i1 true
+
+exit2:
+ ret i1 false
+}
+
+; CHECK-LABEL: test_intervening_call:
+; CHECK: cmpxchg
+; CHECK: seto %al
+; CHECK-NEXT: lahf
+; CHECK: call{{[lq]}} bar
+; CHECK: addb $127, %al
+; CHECK-NEXT: sahf
+define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
+ ; cmpxchg sets EFLAGS, call clobbers it, then br uses EFLAGS.
+ %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
+ %v = extractvalue { i64, i1 } %cx, 0
+ %p = extractvalue { i64, i1 } %cx, 1
+ call i32 @bar(i64 %v)
+ br i1 %p, label %t, label %f
+
+t:
+ ret i64 42
+
+f:
+ ret i64 0
+}
+
+; CHECK-LABEL: test_two_live_flags:
+; CHECK: cmpxchg
+; CHECK-NEXT: seto %al
+; CHECK-NEXT: lahf
+; Save result of the first cmpxchg into D.
+; CHECK-NEXT: mov{{[lq]}} %[[AX:[er]ax]], %[[D:[re]d[xi]]]
+; CHECK: cmpxchg
+; CHECK-NEXT: sete %al
+; Save result of the second cmpxchg onto the stack.
+; CHECK-NEXT: push{{[lq]}} %[[AX]]
+; Restore result of the first cmpxchg from D, put it back in EFLAGS.
+; CHECK-NEXT: mov{{[lq]}} %[[D]], %[[AX]]
+; CHECK-NEXT: addb $127, %al
+; CHECK-NEXT: sahf
+; Restore result of the second cmpxchg from the stack.
+; CHECK-NEXT: pop{{[lq]}} %[[AX]]
+; Test the EFLAGS value restored from the first cmpxchg; jump if it failed.
+; CHECK-NEXT: jne
+; Fallthrough to test the second cmpxchg's result.
+; CHECK: testb %al, %al
+; CHECK-NEXT: je
+define i64 @test_two_live_flags(
+ i64* %foo0, i64 %bar0, i64 %baz0,
+ i64* %foo1, i64 %bar1, i64 %baz1) {
+ %cx0 = cmpxchg i64* %foo0, i64 %bar0, i64 %baz0 seq_cst seq_cst
+ %p0 = extractvalue { i64, i1 } %cx0, 1
+ %cx1 = cmpxchg i64* %foo1, i64 %bar1, i64 %baz1 seq_cst seq_cst
+ %p1 = extractvalue { i64, i1 } %cx1, 1
+ %flag = and i1 %p0, %p1
+ br i1 %flag, label %t, label %f
+
+t:
+ ret i64 42
+
+f:
+ ret i64 0
+}
+
+; CHECK-LABEL: asm_clobbering_flags:
+; CHECK: test
+; CHECK-NEXT: setg
+; CHECK-NEXT: #APP
+; CHECK-NEXT: bsfl
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl
+; CHECK-NEXT: ret
+define i1 @asm_clobbering_flags(i32* %mem) {
+ %val = load i32, i32* %mem, align 4
+ %cmp = icmp sgt i32 %val, 0
+ %res = tail call i32 asm "bsfl $1,$0", "=r,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i32 %val)
+ store i32 %res, i32* %mem, align 4
+ ret i1 %cmp
+}