CodeGen peephole: fold redundant phys reg copies

author JF Bastien <jfb@google.com>

Thu, 3 Dec 2015 23:43:56 +0000 (23:43 +0000)

committer JF Bastien <jfb@google.com>

Thu, 3 Dec 2015 23:43:56 +0000 (23:43 +0000)
author JF Bastien <jfb@google.com>
Thu, 3 Dec 2015 23:43:56 +0000 (23:43 +0000)
committer JF Bastien <jfb@google.com>
Thu, 3 Dec 2015 23:43:56 +0000 (23:43 +0000)
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp

index 4fd1c4bda433ac1996fbc34ce957859b010ed45c..0fabc40b64e058bdc6fc5624e07e832cc02b620e 100644 (file)
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -98,6 +98,10 @@ static cl::opt<bool>
  DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false),
                    cl::desc("Disable advanced copy optimization"));
  
+static cl::opt<bool> DisableNAPhysCopyOpt(
+    "disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false),
+    cl::desc("Disable non-allocatable physical register copy optimization"));
+
  // Limit the number of PHI instructions to process
  // in PeepholeOptimizer::getNextSource.
  static cl::opt<unsigned> RewritePHILimit(
@@ -111,6 +115,7 @@ STATISTIC(NumLoadFold,   "Number of loads folded");
  STATISTIC(NumSelects,    "Number of selects optimized");
  STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized");
  STATISTIC(NumRewrittenCopies, "Number of copies rewritten");
+STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed");
  
  namespace {
    class ValueTrackerResult;
@@ -162,12 +167,24 @@ namespace {
                         DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
  
      /// \brief If copy instruction \p MI is a virtual register copy, track it in
-    /// the set \p CopiedFromRegs and \p CopyMIs. If this virtual register was
+    /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
      /// previously seen as a copy, replace the uses of this copy with the
      /// previously seen copy's destination register.
      bool foldRedundantCopy(MachineInstr *MI,
-                           SmallSet<unsigned, 4> &CopiedFromRegs,
-                           DenseMap<unsigned, MachineInstr*> &CopyMIs);
+                           SmallSet<unsigned, 4> &CopySrcRegs,
+                           DenseMap<unsigned, MachineInstr *> &CopyMIs);
+
+    /// \brief Is the register \p Reg a non-allocatable physical register?
+    bool isNAPhysCopy(unsigned Reg);
+
+    /// \brief If copy instruction \p MI is a non-allocatable virtual<->physical
+    /// register copy, track it in the \p NAPhysToVirtMIs map. If this
+    /// non-allocatable physical register was previously copied to a virtual
+    /// registered and hasn't been clobbered, the virt->phys copy can be
+    /// deleted.
+    bool foldRedundantNAPhysCopy(
+        MachineInstr *MI,
+        DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs);
  
      bool isLoadFoldable(MachineInstr *MI,
                          SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
@@ -1332,7 +1349,7 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
      if (ImmDefRegs.count(Reg) == 0)
        continue;
      DenseMap<unsigned, MachineInstr*>::iterator II = ImmDefMIs.find(Reg);
-    assert(II != ImmDefMIs.end());
+    assert(II != ImmDefMIs.end() && "couldn't find immediate definition");
      if (TII->FoldImmediate(MI, II->second, Reg, MRI)) {
        ++NumImmFold;
        return true;
@@ -1356,10 +1373,10 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
  //
  // Should replace %vreg2 uses with %vreg1:sub1
  bool PeepholeOptimizer::foldRedundantCopy(
-  MachineInstr *MI,
-  SmallSet<unsigned, 4> &CopySrcRegs,
-  DenseMap<unsigned, MachineInstr *> &CopyMIs) {
-  assert(MI->isCopy());
+    MachineInstr *MI,
+    SmallSet<unsigned, 4> &CopySrcRegs,
+    DenseMap<unsigned, MachineInstr *> &CopyMIs) {
+  assert(MI->isCopy() && "expected a COPY machine instruction");
  
    unsigned SrcReg = MI->getOperand(1).getReg();
    if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
@@ -1400,6 +1417,59 @@ bool PeepholeOptimizer::foldRedundantCopy(
    return true;
  }
  
+bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) {
+  return TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         !MRI->isAllocatable(Reg);
+}
+
+bool PeepholeOptimizer::foldRedundantNAPhysCopy(
+    MachineInstr *MI, DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs) {
+  assert(MI->isCopy() && "expected a COPY machine instruction");
+
+  if (DisableNAPhysCopyOpt)
+    return false;
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) {
+    // %vreg = COPY %PHYSREG
+    // Avoid using a datastructure which can track multiple live non-allocatable
+    // phys->virt copies since LLVM doesn't seem to do this.
+    NAPhysToVirtMIs.insert({SrcReg, MI});
+    return false;
+  }
+
+  if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg)))
+    return false;
+
+  // %PHYSREG = COPY %vreg
+  auto PrevCopy = NAPhysToVirtMIs.find(DstReg);
+  if (PrevCopy == NAPhysToVirtMIs.end()) {
+    // We can't remove the copy: there was an intervening clobber of the
+    // non-allocatable physical register after the copy to virtual.
+    DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << *MI
+                 << '\n');
+    return false;
+  }
+
+  unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg();
+  if (PrevDstReg == SrcReg) {
+    // Remove the virt->phys copy: we saw the virtual register definition, and
+    // the non-allocatable physical register's state hasn't changed since then.
+    DEBUG(dbgs() << "NAPhysCopy: erasing " << *MI << '\n');
+    ++NumNAPhysCopies;
+    return true;
+  }
+
+  // Potential missed optimization opportunity: we saw a different virtual
+  // register get a copy of the non-allocatable physical register, and we only
+  // track one such copy. Avoid getting confused by this new non-allocatable
+  // physical register definition, and remove it from the tracked copies.
+  DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << *MI << '\n');
+  NAPhysToVirtMIs.erase(PrevCopy);
+  return false;
+}
+
  bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
    if (skipOptnoneFunction(*MF.getFunction()))
      return false;
@@ -1433,6 +1503,13 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
      DenseMap<unsigned, MachineInstr*> ImmDefMIs;
      SmallSet<unsigned, 16> FoldAsLoadDefCandidates;
  
+    // Track when a non-allocatable physical register is copied to a virtual
+    // register so that useless moves can be removed.
+    //
+    // %PHYSREG is the map index; MI is the last valid `%vreg = COPY %PHYSREG`
+    // without any intervening re-definition of %PHYSREG.
+    DenseMap<unsigned, MachineInstr *> NAPhysToVirtMIs;
+
      // Set of virtual registers that are copied from.
      SmallSet<unsigned, 4> CopySrcRegs;
      DenseMap<unsigned, MachineInstr *> CopySrcMIs;
@@ -1453,10 +1530,51 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
        if (MI->isLoadFoldBarrier())
          FoldAsLoadDefCandidates.clear();
  
-      if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() ||
-          MI->isKill() || MI->isInlineAsm() ||
-          MI->hasUnmodeledSideEffects())
+      if (MI->isPosition() || MI->isPHI())
+        continue;
+
+      if (!MI->isCopy()) {
+        for (const auto &Op : MI->operands()) {
+          // Visit all operands: definitions can be implicit or explicit.
+          if (Op.isReg()) {
+            unsigned Reg = Op.getReg();
+            if (Op.isDef() && isNAPhysCopy(Reg)) {
+              const auto &Def = NAPhysToVirtMIs.find(Reg);
+              if (Def != NAPhysToVirtMIs.end()) {
+                // A new definition of the non-allocatable physical register
+                // invalidates previous copies.
+                DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
+                             << '\n');
+                NAPhysToVirtMIs.erase(Def);
+              }
+            }
+          } else if (Op.isRegMask()) {
+            const uint32_t *RegMask = Op.getRegMask();
+            for (auto &RegMI : NAPhysToVirtMIs) {
+              unsigned Def = RegMI.first;
+              if (MachineOperand::clobbersPhysReg(RegMask, Def)) {
+                DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
+                             << '\n');
+                NAPhysToVirtMIs.erase(Def);
+              }
+            }
+          }
+        }
+      }
+
+      if (MI->isImplicitDef() || MI->isKill())
+        continue;
+
+      if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) {
+        // Blow away all non-allocatable physical registers knowledge since we
+        // don't know what's correct anymore.
+        //
+        // FIXME: handle explicit asm clobbers.
+        DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI
+                     << '\n');
+        NAPhysToVirtMIs.clear();
          continue;
+      }
  
        if ((isUncoalescableCopy(*MI) &&
             optimizeUncoalescableCopy(MI, LocalMIs)) ||
@@ -1479,7 +1597,9 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
          continue;
        }
  
-      if (MI->isCopy() && foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs)) {
+      if (MI->isCopy() &&
+          (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) ||
+           foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) {
          LocalMIs.erase(MI);
          MI->eraseFromParent();
          Changed = true;
diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll

new file mode 100644 (file)

index 0000000..438bf8d
--- /dev/null
+++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -0,0 +1,190 @@
+; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+
+; The peephole optimizer can elide some physical register copies such as
+; EFLAGS. Make sure the flags are used directly, instead of needlessly using
+; lahf, when possible.
+
+@L = external global i32
+@M = external global i8
+declare i32 @bar(i64)
+
+; CHECK-LABEL: plus_one
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: incl L
+define i1 @plus_one() {
+entry:
+  %loaded_L = load i32, i32* @L
+  %val = add nsw i32 %loaded_L, 1 ; N.B. will emit inc.
+  store i32 %val, i32* @L
+  %loaded_M = load i8, i8* @M
+  %masked = and i8 %loaded_M, 8
+  %M_is_true = icmp ne i8 %masked, 0
+  %L_is_false = icmp eq i32 %val, 0
+  %cond = and i1 %L_is_false, %M_is_true
+  br i1 %cond, label %exit2, label %exit
+
+exit:
+  ret i1 true
+
+exit2:
+  ret i1 false
+}
+
+; CHECK-LABEL: plus_forty_two
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: addl $42,
+define i1 @plus_forty_two() {
+entry:
+  %loaded_L = load i32, i32* @L
+  %val = add nsw i32 %loaded_L, 42 ; N.B. won't emit inc.
+  store i32 %val, i32* @L
+  %loaded_M = load i8, i8* @M
+  %masked = and i8 %loaded_M, 8
+  %M_is_true = icmp ne i8 %masked, 0
+  %L_is_false = icmp eq i32 %val, 0
+  %cond = and i1 %L_is_false, %M_is_true
+  br i1 %cond, label %exit2, label %exit
+
+exit:
+  ret i1 true
+
+exit2:
+  ret i1 false
+}
+
+; CHECK-LABEL: minus_one
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: decl L
+define i1 @minus_one() {
+entry:
+  %loaded_L = load i32, i32* @L
+  %val = add nsw i32 %loaded_L, -1 ; N.B. will emit dec.
+  store i32 %val, i32* @L
+  %loaded_M = load i8, i8* @M
+  %masked = and i8 %loaded_M, 8
+  %M_is_true = icmp ne i8 %masked, 0
+  %L_is_false = icmp eq i32 %val, 0
+  %cond = and i1 %L_is_false, %M_is_true
+  br i1 %cond, label %exit2, label %exit
+
+exit:
+  ret i1 true
+
+exit2:
+  ret i1 false
+}
+
+; CHECK-LABEL: minus_forty_two
+; CHECK-NOT: seto
+; CHECK-NOT: lahf
+; CHECK-NOT: sahf
+; CHECK-NOT: pushf
+; CHECK-NOT: popf
+; CHECK: addl $-42,
+define i1 @minus_forty_two() {
+entry:
+  %loaded_L = load i32, i32* @L
+  %val = add nsw i32 %loaded_L, -42 ; N.B. won't emit dec.
+  store i32 %val, i32* @L
+  %loaded_M = load i8, i8* @M
+  %masked = and i8 %loaded_M, 8
+  %M_is_true = icmp ne i8 %masked, 0
+  %L_is_false = icmp eq i32 %val, 0
+  %cond = and i1 %L_is_false, %M_is_true
+  br i1 %cond, label %exit2, label %exit
+
+exit:
+  ret i1 true
+
+exit2:
+  ret i1 false
+}
+
+; CHECK-LABEL: test_intervening_call:
+; CHECK:       cmpxchg
+; CHECK:       seto %al
+; CHECK-NEXT:  lahf
+; CHECK:       call{{[lq]}} bar
+; CHECK:       addb $127, %al
+; CHECK-NEXT:  sahf
+define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
+  ; cmpxchg sets EFLAGS, call clobbers it, then br uses EFLAGS.
+  %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
+  %v = extractvalue { i64, i1 } %cx, 0
+  %p = extractvalue { i64, i1 } %cx, 1
+  call i32 @bar(i64 %v)
+  br i1 %p, label %t, label %f
+
+t:
+  ret i64 42
+
+f:
+  ret i64 0
+}
+
+; CHECK-LABEL: test_two_live_flags:
+; CHECK:       cmpxchg
+; CHECK-NEXT:  seto %al
+; CHECK-NEXT:  lahf
+; Save result of the first cmpxchg into D.
+; CHECK-NEXT:  mov{{[lq]}} %[[AX:[er]ax]], %[[D:[re]d[xi]]]
+; CHECK:       cmpxchg
+; CHECK-NEXT:  sete %al
+; Save result of the second cmpxchg onto the stack.
+; CHECK-NEXT:  push{{[lq]}} %[[AX]]
+; Restore result of the first cmpxchg from D, put it back in EFLAGS.
+; CHECK-NEXT:  mov{{[lq]}} %[[D]], %[[AX]]
+; CHECK-NEXT:  addb $127, %al
+; CHECK-NEXT:  sahf
+; Restore result of the second cmpxchg from the stack.
+; CHECK-NEXT:  pop{{[lq]}} %[[AX]]
+; Test from EFLAGS restored from first cmpxchg, jump if that fails.
+; CHECK-NEXT:  jne
+; Fallthrough to test the second cmpxchg's result.
+; CHECK:       testb %al, %al
+; CHECK-NEXT:  je
+define i64 @test_two_live_flags(
+       i64* %foo0, i64 %bar0, i64 %baz0,
+       i64* %foo1, i64 %bar1, i64 %baz1) {
+  %cx0 = cmpxchg i64* %foo0, i64 %bar0, i64 %baz0 seq_cst seq_cst
+  %p0 = extractvalue { i64, i1 } %cx0, 1
+  %cx1 = cmpxchg i64* %foo1, i64 %bar1, i64 %baz1 seq_cst seq_cst
+  %p1 = extractvalue { i64, i1 } %cx1, 1
+  %flag = and i1 %p0, %p1
+  br i1 %flag, label %t, label %f
+
+t:
+  ret i64 42
+
+f:
+  ret i64 0
+}
+
+; CHECK-LABEL: asm_clobbering_flags:
+; CHECK:       test
+; CHECK-NEXT:  setg
+; CHECK-NEXT:  #APP
+; CHECK-NEXT:  bsfl
+; CHECK-NEXT:  #NO_APP
+; CHECK-NEXT:  movl
+; CHECK-NEXT:  ret
+define i1 @asm_clobbering_flags(i32* %mem) {
+  %val = load i32, i32* %mem, align 4
+  %cmp = icmp sgt i32 %val, 0
+  %res = tail call i32 asm "bsfl $1,$0", "=r,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i32 %val)
+  store i32 %res, i32* %mem, align 4
+  ret i1 %cmp
+}
author	JF Bastien <jfb@google.com>
	Thu, 3 Dec 2015 23:43:56 +0000 (23:43 +0000)
committer	JF Bastien <jfb@google.com>
	Thu, 3 Dec 2015 23:43:56 +0000 (23:43 +0000)
lib/CodeGen/PeepholeOptimizer.cpp		patch \| blob \| history
test/CodeGen/X86/peephole-na-phys-copy-folding.ll	[new file with mode: 0644]	patch \| blob