Micro-optimization:
authorBill Wendling <isanbard@gmail.com>
Fri, 5 Mar 2010 00:24:26 +0000 (00:24 +0000)
committerBill Wendling <isanbard@gmail.com>
Fri, 5 Mar 2010 00:24:26 +0000 (00:24 +0000)
This code:

float floatingPointComparison(float x, float y) {
    double product = (double)x * y;
    if (product == 0.0)
        return product;
    return product - 1.0;
}

produces this:

_floatingPointComparison:
0000000000000000        cvtss2sd        %xmm1,%xmm1
0000000000000004        cvtss2sd        %xmm0,%xmm0
0000000000000008        mulsd           %xmm1,%xmm0
000000000000000c        pxor            %xmm1,%xmm1
0000000000000010        ucomisd         %xmm1,%xmm0
0000000000000014        jne             0x00000004
0000000000000016        jp              0x00000002
0000000000000018        jmp             0x00000008
000000000000001a        addsd           0x00000006(%rip),%xmm0
0000000000000022        cvtsd2ss        %xmm0,%xmm0
0000000000000026        ret

The "jne/jp/jmp" sequence can be reduced to this instead:

_floatingPointComparison:
0000000000000000        cvtss2sd        %xmm1,%xmm1
0000000000000004        cvtss2sd        %xmm0,%xmm0
0000000000000008        mulsd           %xmm1,%xmm0
000000000000000c        pxor            %xmm1,%xmm1
0000000000000010        ucomisd         %xmm1,%xmm0
0000000000000014        jp              0x00000002
0000000000000016        je              0x00000008
0000000000000018        addsd           0x00000006(%rip),%xmm0
0000000000000020        cvtsd2ss        %xmm0,%xmm0
0000000000000024        ret

for a savings of 2 bytes.

This xform can happen when we recognize that jne and jp jump to the same "true"
MBB, the unconditional jump would jump to the "false" MBB, and the "true" branch
is the fall-through MBB.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@97766 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/X86/X86InstrInfo.cpp
test/CodeGen/X86/jump-opt.ll [new file with mode: 0644]

index 39bda04b4d13b648deb2cbc3d4a37d0a743ee69c..0d3b54fe8e229708319444526b89b70c7101d48e 100644 (file)
@@ -1786,6 +1786,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                            const SmallVectorImpl<MachineOperand> &Cond) const {
   // FIXME this should probably have a DebugLoc operand
   DebugLoc dl = DebugLoc::getUnknownLoc();
+
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
   assert((Cond.size() == 1 || Cond.size() == 0) &&
@@ -1799,34 +1800,72 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   }
 
   // Conditional branch.
+  const MachineBasicBlock *NextBB = next(&MBB);
   unsigned Count = 0;
   X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+
+  // In a two-way conditional branch, if the fall-through block is the
+  // "false" branch of the conditional jumps, we can cut out the
+  // unconditional jump by rearranging the conditional jumps. This saves a
+  // few bytes and improves performance. I.e., for COND_NE_OR_P:
+  //
+  //     JNE L1
+  //     JP  L1
+  //     JMP L2
+  // L1:
+  //     ...
+  // L2:
+  //     ...
+  //
+  // to:
+  // 
+  //     JP  L1
+  //     JE  L2
+  // L1:
+  //     ...
+  // L2:
+  //     ...
+  //
+  // Similarly for COND_NP_OR_E.
   switch (CC) {
+  default:
+    BuildMI(&MBB, dl, get(GetCondBranchFromCond(CC))).addMBB(TBB);
+    ++Count;
+    break;
   case X86::COND_NP_OR_E:
     // Synthesize NP_OR_E with two branches.
-    BuildMI(&MBB, dl, get(X86::JNP_4)).addMBB(TBB);
-    ++Count;
-    BuildMI(&MBB, dl, get(X86::JE_4)).addMBB(TBB);
-    ++Count;
+    if (FBB && FBB == NextBB) {
+      BuildMI(&MBB, dl, get(X86::JNP_4)).addMBB(TBB);
+      BuildMI(&MBB, dl, get(X86::JNE_4)).addMBB(FBB);
+      FBB = 0;
+    } else {
+      BuildMI(&MBB, dl, get(X86::JNP_4)).addMBB(TBB);
+      BuildMI(&MBB, dl, get(X86::JE_4)).addMBB(TBB);
+    }
+
+    Count += 2;
     break;
   case X86::COND_NE_OR_P:
     // Synthesize NE_OR_P with two branches.
-    BuildMI(&MBB, dl, get(X86::JNE_4)).addMBB(TBB);
-    ++Count;
-    BuildMI(&MBB, dl, get(X86::JP_4)).addMBB(TBB);
-    ++Count;
+    if (FBB && FBB == NextBB) {
+      BuildMI(&MBB, dl, get(X86::JP_4)).addMBB(TBB);
+      BuildMI(&MBB, dl, get(X86::JE_4)).addMBB(FBB);
+      FBB = 0;
+    } else {
+      BuildMI(&MBB, dl, get(X86::JNE_4)).addMBB(TBB);
+      BuildMI(&MBB, dl, get(X86::JP_4)).addMBB(TBB);
+    }
+
+    Count += 2;
     break;
-  default: {
-    unsigned Opc = GetCondBranchFromCond(CC);
-    BuildMI(&MBB, dl, get(Opc)).addMBB(TBB);
-    ++Count;
-  }
   }
+
   if (FBB) {
     // Two-way Conditional branch. Insert the second branch.
     BuildMI(&MBB, dl, get(X86::JMP_4)).addMBB(FBB);
     ++Count;
   }
+
   return Count;
 }
 
diff --git a/test/CodeGen/X86/jump-opt.ll b/test/CodeGen/X86/jump-opt.ll
new file mode 100644 (file)
index 0000000..dc32f66
--- /dev/null
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
+
+; <rdar://problem/7598384>
+define float @test1(float %x, float %y) nounwind readnone optsize ssp {
+; CHECK:      jp
+; CHECK-NEXT: je
+entry:
+  %0 = fpext float %x to double
+  %1 = fpext float %y to double
+  %2 = fmul double %0, %1
+  %3 = fcmp oeq double %2, 0.000000e+00
+  br i1 %3, label %bb2, label %bb1
+
+bb1:
+  %4 = fadd double %2, -1.000000e+00
+  br label %bb2
+
+bb2:
+  %.0.in = phi double [ %4, %bb1 ], [ %2, %entry ]
+  %.0 = fptrunc double %.0.in to float
+  ret float %.0
+}