[X86] Combine (cmov (and/or (setcc) (setcc))) into (cmov (cmov)).

author Ahmed Bougacha <ahmed.bougacha@gmail.com>

Tue, 3 Mar 2015 01:09:14 +0000 (01:09 +0000)

committer Ahmed Bougacha <ahmed.bougacha@gmail.com>

Tue, 3 Mar 2015 01:09:14 +0000 (01:09 +0000)
author Ahmed Bougacha <ahmed.bougacha@gmail.com>
Tue, 3 Mar 2015 01:09:14 +0000 (01:09 +0000)
committer Ahmed Bougacha <ahmed.bougacha@gmail.com>
Tue, 3 Mar 2015 01:09:14 +0000 (01:09 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 0439f0b7a2e0e2668e867c12ffac4e7b06dca85a..a8e166c589104629aee339b609a5a7264aef8ee3 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -21006,6 +21006,49 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
    return SDValue();
  }
  
+/// Check whether Cond is an AND/OR of two SETCCs off of the same EFLAGS.
+/// On a match, return true and fill CC0, CC1, Flags and isAnd. Match:
+///   (X86or (X86setcc) (X86setcc))
+///   (X86cmp (and (X86setcc) (X86setcc)), 0)
+static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
+                                           X86::CondCode &CC1, SDValue &Flags,
+                                           bool &isAnd) {
+  if (Cond->getOpcode() == X86ISD::CMP) {
+    ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1));
+    if (!CondOp1C || !CondOp1C->isNullValue())
+      return false; // Only a compare against constant zero is looked through.
+
+    Cond = Cond->getOperand(0); // Keep matching on the compared value.
+  }
+
+  isAnd = false; // Written before matching, so it can change even on failure.
+
+  SDValue SetCC0, SetCC1;
+  switch (Cond->getOpcode()) {
+  default: return false;
+  case ISD::AND:
+  case X86ISD::AND:
+    isAnd = true; // AND case; isAnd stays false for the OR opcodes.
+    // Fall through: AND and OR share the operand extraction below.
+  case ISD::OR:
+  case X86ISD::OR:
+    SetCC0 = Cond->getOperand(0);
+    SetCC1 = Cond->getOperand(1);
+    break;
+  };
+
+  // Both operands must be SETCC nodes with the same flags value (operand 1).
+  if (SetCC0.getOpcode() != X86ISD::SETCC ||
+      SetCC1.getOpcode() != X86ISD::SETCC ||
+      SetCC0->getOperand(1) != SetCC1->getOperand(1))
+    return false;
+
+  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); // Operand 0: cond code.
+  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
+  Flags = SetCC0->getOperand(1); // The flags value shared by both SETCCs.
+  return true; // CC0, CC1 and Flags are only written on this success path.
+}
+
  /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
  static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
@@ -21175,6 +21218,44 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
+  // Fold and/or of setcc's to double CMOV:
+  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
+  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
+  //
+  // This combine lets us generate:
+  //   cmovcc1 (jcc1 if we don't have CMOV)
+  //   cmovcc2 (same)
+  // instead of:
+  //   setcc1
+  //   setcc2
+  //   and/or
+  //   cmovne (jne if we don't have CMOV)
+  // When we can't use the CMOV instruction, it might increase branch
+  // mispredicts.
+  // When we can use CMOV, or when there is no mispredict, this improves
+  // throughput and reduces register pressure.
+  //
+  if (CC == X86::COND_NE) {
+    SDValue Flags;
+    X86::CondCode CC0, CC1;
+    bool isAndSetCC;
+    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
+      if (isAndSetCC) {
+        std::swap(FalseOp, TrueOp);
+        CC0 = X86::GetOppositeBranchCondition(CC0);
+        CC1 = X86::GetOppositeBranchCondition(CC1);
+      }
+
+      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, MVT::i8),
+        Flags};
+      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
+      SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, MVT::i8), Flags};
+      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
+      return CMOV;
+    }
+  }
+
    return SDValue();
  }
  
diff --git a/test/CodeGen/X86/cmovcmov.ll b/test/CodeGen/X86/cmovcmov.ll

new file mode 100644 (file)

index 0000000..e9ac86f
--- /dev/null
+++ b/test/CodeGen/X86/cmovcmov.ll
@@ -0,0 +1,264 @@
+; RUN: llc < %s -asm-verbose=false -mtriple=x86_64-unknown-linux | FileCheck %s --check-prefix=CHECK --check-prefix=CMOV
+; RUN: llc < %s -asm-verbose=false -mtriple=i686-unknown-linux | FileCheck %s --check-prefix=CHECK --check-prefix=NOCMOV
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test 2xCMOV patterns exposed after legalization.
+; One way to do that is with (select (fcmp une/oeq)), which gets
+; legalized to setp/setne.
+
+; CHECK-LABEL: test_select_fcmp_oeq_i32:
+
+; CMOV-NEXT: ucomiss  %xmm1, %xmm0
+; CMOV-NEXT: cmovnel  %esi, %edi
+; CMOV-NEXT: cmovpl  %esi, %edi
+; CMOV-NEXT: movl  %edi, %eax
+; CMOV-NEXT: retq
+
+; NOCMOV-NEXT:   flds  8(%esp)
+; NOCMOV-NEXT:   flds  4(%esp)
+; NOCMOV-NEXT:   fucompp
+; NOCMOV-NEXT:   fnstsw  %ax
+; NOCMOV-NEXT:   sahf
+; NOCMOV-NEXT:   leal  16(%esp), %eax
+; NOCMOV-NEXT:   movl  %eax, %ecx
+; NOCMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   leal  12(%esp), %ecx
+; NOCMOV-NEXT: [[TBB1]]:
+; NOCMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   movl  %ecx, %eax
+; NOCMOV-NEXT: [[TBB2]]:
+; NOCMOV-NEXT:   movl  (%eax), %eax
+; NOCMOV-NEXT:   retl
+define i32 @test_select_fcmp_oeq_i32(float %a, float %b, i32 %c, i32 %d) #0 {
+entry:
+  %cmp = fcmp oeq float %a, %b
+  %r = select i1 %cmp, i32 %c, i32 %d
+  ret i32 %r
+}
+
+; CHECK-LABEL: test_select_fcmp_oeq_i64:
+
+; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
+; CMOV-NEXT:   cmovneq  %rsi, %rdi
+; CMOV-NEXT:   cmovpq  %rsi, %rdi
+; CMOV-NEXT:   movq  %rdi, %rax
+; CMOV-NEXT:   retq
+
+; NOCMOV-NEXT:   flds  8(%esp)
+; NOCMOV-NEXT:   flds  4(%esp)
+; NOCMOV-NEXT:   fucompp
+; NOCMOV-NEXT:   fnstsw  %ax
+; NOCMOV-NEXT:   sahf
+; NOCMOV-NEXT:   leal  20(%esp), %ecx
+; NOCMOV-NEXT:   movl  %ecx, %eax
+; NOCMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   leal  12(%esp), %eax
+; NOCMOV-NEXT: [[TBB1]]:
+; NOCMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   movl  %eax, %ecx
+; NOCMOV-NEXT: [[TBB2]]:
+; NOCMOV-NEXT:   movl  (%ecx), %eax
+; NOCMOV-NEXT:   orl  $4, %ecx
+; NOCMOV-NEXT:   movl  (%ecx), %edx
+; NOCMOV-NEXT:   retl
+define i64 @test_select_fcmp_oeq_i64(float %a, float %b, i64 %c, i64 %d) #0 {
+entry:
+  %cmp = fcmp oeq float %a, %b
+  %r = select i1 %cmp, i64 %c, i64 %d
+  ret i64 %r
+}
+
+; CHECK-LABEL: test_select_fcmp_une_i64:
+
+; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
+; CMOV-NEXT:   cmovneq  %rdi, %rsi
+; CMOV-NEXT:   cmovpq  %rdi, %rsi
+; CMOV-NEXT:   movq  %rsi, %rax
+; CMOV-NEXT:   retq
+
+; NOCMOV-NEXT:   flds  8(%esp)
+; NOCMOV-NEXT:   flds  4(%esp)
+; NOCMOV-NEXT:   fucompp
+; NOCMOV-NEXT:   fnstsw  %ax
+; NOCMOV-NEXT:   sahf
+; NOCMOV-NEXT:   leal  12(%esp), %ecx
+; NOCMOV-NEXT:   movl  %ecx, %eax
+; NOCMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   leal  20(%esp), %eax
+; NOCMOV-NEXT: [[TBB1]]:
+; NOCMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   movl  %eax, %ecx
+; NOCMOV-NEXT: [[TBB2]]:
+; NOCMOV-NEXT:   movl  (%ecx), %eax
+; NOCMOV-NEXT:   orl  $4, %ecx
+; NOCMOV-NEXT:   movl  (%ecx), %edx
+; NOCMOV-NEXT:   retl
+define i64 @test_select_fcmp_une_i64(float %a, float %b, i64 %c, i64 %d) #0 {
+entry:
+  %cmp = fcmp une float %a, %b
+  %r = select i1 %cmp, i64 %c, i64 %d
+  ret i64 %r
+}
+
+; CHECK-LABEL: test_select_fcmp_oeq_f64:
+
+; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
+; CMOV-NEXT:   movaps  %xmm3, %xmm0
+; CMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; CMOV-NEXT:   movaps  %xmm2, %xmm0
+; CMOV-NEXT: [[TBB1]]:
+; CMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; CMOV-NEXT:   movaps  %xmm0, %xmm3
+; CMOV-NEXT: [[TBB2]]:
+; CMOV-NEXT:   movaps  %xmm3, %xmm0
+; CMOV-NEXT:   retq
+
+; NOCMOV-NEXT:   flds  8(%esp)
+; NOCMOV-NEXT:   flds  4(%esp)
+; NOCMOV-NEXT:   fucompp
+; NOCMOV-NEXT:   fnstsw  %ax
+; NOCMOV-NEXT:   sahf
+; NOCMOV-NEXT:   leal  20(%esp), %eax
+; NOCMOV-NEXT:   movl  %eax, %ecx
+; NOCMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   leal  12(%esp), %ecx
+; NOCMOV-NEXT: [[TBB1]]:
+; NOCMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   movl  %ecx, %eax
+; NOCMOV-NEXT: [[TBB2]]:
+; NOCMOV-NEXT:   fldl  (%eax)
+; NOCMOV-NEXT:   retl
+define double @test_select_fcmp_oeq_f64(float %a, float %b, double %c, double %d) #0 {
+entry:
+  %cmp = fcmp oeq float %a, %b
+  %r = select i1 %cmp, double %c, double %d
+  ret double %r
+}
+
+; CHECK-LABEL: test_select_fcmp_oeq_v4i32:
+
+; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
+; CMOV-NEXT:   movaps  %xmm3, %xmm0
+; CMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; CMOV-NEXT:   movaps  %xmm2, %xmm0
+; CMOV-NEXT: [[TBB1]]:
+; CMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; CMOV-NEXT:   movaps  %xmm0, %xmm3
+; CMOV-NEXT: [[TBB2]]:
+; CMOV-NEXT:   movaps  %xmm3, %xmm0
+; CMOV-NEXT:   retq
+
+; NOCMOV-NEXT:   pushl  %ebx
+; NOCMOV-NEXT:   pushl  %edi
+; NOCMOV-NEXT:   pushl  %esi
+; NOCMOV-NEXT:   flds  24(%esp)
+; NOCMOV-NEXT:   flds  20(%esp)
+; NOCMOV-NEXT:   fucompp
+; NOCMOV-NEXT:   fnstsw  %ax
+; NOCMOV-NEXT:   sahf
+; NOCMOV-NEXT:   leal  44(%esp), %eax
+; NOCMOV-NEXT:   movl  %eax, %ecx
+; NOCMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   leal  28(%esp), %ecx
+; NOCMOV-NEXT: [[TBB1]]:
+; NOCMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   movl  %ecx, %eax
+; NOCMOV-NEXT: [[TBB2]]:
+; NOCMOV-NEXT:   movl  (%eax), %eax
+; NOCMOV-NEXT:   leal  48(%esp), %ecx
+; NOCMOV-NEXT:   movl  %ecx, %edx
+; NOCMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   leal  32(%esp), %edx
+; NOCMOV-NEXT: [[TBB1]]:
+; NOCMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   movl  %edx, %ecx
+; NOCMOV-NEXT: [[TBB2]]:
+; NOCMOV-NEXT:   movl  (%ecx), %ecx
+; NOCMOV-NEXT:   leal  52(%esp), %edx
+; NOCMOV-NEXT:   movl  %edx, %esi
+; NOCMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   leal  36(%esp), %esi
+; NOCMOV-NEXT: [[TBB1]]:
+; NOCMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   movl  %esi, %edx
+; NOCMOV-NEXT: [[TBB2]]:
+; NOCMOV-NEXT:   movl  (%edx), %edx
+; NOCMOV-NEXT:   leal  56(%esp), %esi
+; NOCMOV-NEXT:   movl  %esi, %ebx
+; NOCMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   leal  40(%esp), %ebx
+; NOCMOV-NEXT: [[TBB1]]:
+; NOCMOV-NEXT:   movl  16(%esp), %edi
+; NOCMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; NOCMOV-NEXT:   movl  %ebx, %esi
+; NOCMOV-NEXT: [[TBB2]]:
+; NOCMOV-NEXT:   movl  (%esi), %esi
+; NOCMOV-NEXT:   movl  %esi, 12(%edi)
+; NOCMOV-NEXT:   movl  %edx, 8(%edi)
+; NOCMOV-NEXT:   movl  %ecx, 4(%edi)
+; NOCMOV-NEXT:   movl  %eax, (%edi)
+; NOCMOV-NEXT:   popl  %esi
+; NOCMOV-NEXT:   popl  %edi
+; NOCMOV-NEXT:   popl  %ebx
+; NOCMOV-NEXT:   retl  $4
+define <4 x i32> @test_select_fcmp_oeq_v4i32(float %a, float %b, <4 x i32> %c, <4 x i32> %d) #0 {
+entry:
+  %cmp = fcmp oeq float %a, %b
+  %r = select i1 %cmp, <4 x i32> %c, <4 x i32> %d
+  ret <4 x i32> %r
+}
+
+; Also make sure we catch the original code-sequence of interest:
+
+; CMOV: [[ONE_F32_LCPI:.LCPI.*]]:
+; CMOV-NEXT:   .long  1065353216
+
+; CHECK-LABEL: test_zext_fcmp_une:
+; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
+; CMOV-NEXT:   movss  [[ONE_F32_LCPI]](%rip), %xmm0
+; CMOV-NEXT:   movaps  %xmm0, %xmm1
+; CMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; CMOV-NEXT:   xorps  %xmm1, %xmm1
+; CMOV-NEXT: [[TBB1]]:
+; CMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; CMOV-NEXT:   movaps  %xmm1, %xmm0
+; CMOV-NEXT: [[TBB2]]:
+; CMOV-NEXT:   retq
+
+; NOCMOV:     jne
+; NOCMOV:     jp
+define float @test_zext_fcmp_une(float %a, float %b) #0 {
+entry:
+  %cmp = fcmp une float %a, %b
+  %conv1 = zext i1 %cmp to i32
+  %conv2 = sitofp i32 %conv1 to float
+  ret float %conv2
+}
+
+; CMOV: [[ONE_F32_LCPI:.LCPI.*]]:
+; CMOV-NEXT:   .long  1065353216
+
+; CHECK-LABEL: test_zext_fcmp_oeq:
+; CMOV-NEXT:   ucomiss  %xmm1, %xmm0
+; CMOV-NEXT:   xorps  %xmm0, %xmm0
+; CMOV-NEXT:   xorps  %xmm1, %xmm1
+; CMOV-NEXT:   jne  [[TBB1:.LBB[0-9_]+]]
+; CMOV-NEXT:   movss  [[ONE_F32_LCPI]](%rip), %xmm1
+; CMOV-NEXT: [[TBB1]]:
+; CMOV-NEXT:   jp  [[TBB2:.LBB[0-9_]+]]
+; CMOV-NEXT:   movaps  %xmm1, %xmm0
+; CMOV-NEXT: [[TBB2]]:
+; CMOV-NEXT:   retq
+
+; NOCMOV:   jne
+; NOCMOV:   jp
+define float @test_zext_fcmp_oeq(float %a, float %b) #0 {
+entry:
+  %cmp = fcmp oeq float %a, %b
+  %conv1 = zext i1 %cmp to i32
+  %conv2 = sitofp i32 %conv1 to float
+  ret float %conv2
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/fast-isel-select-cmov2.ll b/test/CodeGen/X86/fast-isel-select-cmov2.ll

index 0de94f1e2403696df9f04848c212b65202175270..8556ff21021a54be6d1b9898fc6a45fa14ce6a9f 100644 (file)
--- a/test/CodeGen/X86/fast-isel-select-cmov2.ll
+++ b/test/CodeGen/X86/fast-isel-select-cmov2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10                             | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10                               | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
  
  ; Test all the cmp predicates that can feed an integer conditional move.
  
@@ -15,10 +15,13 @@ define i64 @select_fcmp_false_cmov(double %a, double %b, i64 %c, i64 %d) {
  define i64 @select_fcmp_oeq_cmov(double %a, double %b, i64 %c, i64 %d) {
  ; CHECK-LABEL: select_fcmp_oeq_cmov
  ; CHECK:       ucomisd %xmm1, %xmm0
-; CHECK-NEXT:  setnp %al
-; CHECK-NEXT:  sete %cl
-; CHECK-NEXT:  testb %al, %cl
-; CHECK-NEXT:  cmoveq %rsi, %rdi
+; SDAG-NEXT:   cmovneq %rsi, %rdi
+; SDAG-NEXT:   cmovpq %rsi, %rdi
+; SDAG-NEXT:   movq %rdi, %rax
+; FAST-NEXT:   setnp %al
+; FAST-NEXT:   sete %cl
+; FAST-NEXT:   testb %al, %cl
+; FAST-NEXT:   cmoveq %rsi, %rdi
    %1 = fcmp oeq double %a, %b
    %2 = select i1 %1, i64 %c, i64 %d
    ret i64 %2
@@ -135,10 +138,13 @@ define i64 @select_fcmp_ule_cmov(double %a, double %b, i64 %c, i64 %d) {
  define i64 @select_fcmp_une_cmov(double %a, double %b, i64 %c, i64 %d) {
  ; CHECK-LABEL: select_fcmp_une_cmov
  ; CHECK:       ucomisd %xmm1, %xmm0
-; CHECK-NEXT:  setp %al
-; CHECK-NEXT:  setne %cl
-; CHECK-NEXT:  orb %al, %cl
-; CHECK-NEXT:  cmoveq %rsi, %rdi
+; SDAG-NEXT:   cmovneq %rdi, %rsi
+; SDAG-NEXT:   cmovpq %rdi, %rsi
+; SDAG-NEXT:   movq %rsi, %rax
+; FAST-NEXT:   setp %al
+; FAST-NEXT:   setne %cl
+; FAST-NEXT:   orb %al, %cl
+; FAST-NEXT:   cmoveq %rsi, %rdi
    %1 = fcmp une double %a, %b
    %2 = select i1 %1, i64 %c, i64 %d
    ret i64 %2
author	Ahmed Bougacha <ahmed.bougacha@gmail.com>
	Tue, 3 Mar 2015 01:09:14 +0000 (01:09 +0000)
committer	Ahmed Bougacha <ahmed.bougacha@gmail.com>
	Tue, 3 Mar 2015 01:09:14 +0000 (01:09 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/cmovcmov.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/X86/fast-isel-select-cmov2.ll		patch \| blob \| history