[CodeGenPrepare] Teach when it is profitable to speculate calls to @llvm.cttz/ctlz.

author Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>

Sun, 28 Dec 2014 11:07:35 +0000 (11:07 +0000)

committer Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>

Sun, 28 Dec 2014 11:07:35 +0000 (11:07 +0000)
author Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Sun, 28 Dec 2014 11:07:35 +0000 (11:07 +0000)
committer Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Sun, 28 Dec 2014 11:07:35 +0000 (11:07 +0000)
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h

index 9c0189782249d8d959077e9872db6f9b3b8ceb3a..9a04d3c6c679f50c98dddb296e2551c18492b047 100644 (file)
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -249,6 +249,16 @@ public:
      return true;
    }
  
+  /// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
+  virtual bool isCheapToSpeculateCttz() const {
+    return false;
+  }
+  
+  /// \brief Return true if it is cheap to speculate a call to intrinsic ctlz.
+  virtual bool isCheapToSpeculateCtlz() const {
+    return false;
+  }
+
    /// \brief Return if the target supports combining a
    /// chain like:
    /// \code
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp

index aba0d580916f060f54b871f913d7642f907b03e4..55a463775294c9994c37c5d650d5ce0c206c29b3 100644 (file)
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -3978,6 +3978,119 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
    Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
  }
  
+// See if we can speculate calls to intrinsic cttz/ctlz.
+//
+// Example:
+// entry:
+//   ...
+//   %cmp = icmp eq i64 %val, 0
+//   br i1 %cmp, label %end.bb, label %then.bb
+//
+// then.bb:
+//   %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 true)
+//   br label %EndBB
+//
+// end.bb:
+//   %cond = phi i64 [ %c, %then.bb ], [ 64, %entry ]
+//
+// ==>
+//
+// entry:
+//   ...
+//   %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 false)
+//
+static bool OptimizeBranchInst(BranchInst *BrInst, const TargetLowering &TLI) {
+  assert(BrInst->isConditional() && "Expected a conditional branch!");
+  BasicBlock *ThenBB = BrInst->getSuccessor(1);
+  BasicBlock *EndBB = BrInst->getSuccessor(0);
+
+  // See if ThenBB contains only one instruction (excluding the
+  // terminator and DbgInfoIntrinsic calls).
+  IntrinsicInst *II = nullptr;
+  for (BasicBlock::iterator I = ThenBB->begin(),
+                            E = std::prev(ThenBB->end()); I != E; ++I) {
+    // Skip debug info.
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    if (II)
+      // Avoid speculating more than one instruction.
+      return false;
+
+    // See if this is a call to intrinsic cttz/ctlz.
+    if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::cttz>())) {
+      // Avoid speculating expensive intrinsic calls.
+      if (!TLI.isCheapToSpeculateCttz())
+        return false;
+    }
+    else if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::ctlz>())) {
+      // Avoid speculating expensive intrinsic calls.
+      if (!TLI.isCheapToSpeculateCtlz())
+        return false;
+    } else
+      return false;
+    
+    II = cast<IntrinsicInst>(I);
+  }
+
+  // Look for PHI nodes with 'II' as the incoming value from 'ThenBB'.
+  BasicBlock *EntryBB = BrInst->getParent();
+  for (BasicBlock::iterator I = EndBB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
+    Value *OrigV = PN->getIncomingValueForBlock(EntryBB);
+
+    if (!OrigV || ThenV != II)
+      return false;
+
+    if (ConstantInt *CInt = dyn_cast<ConstantInt>(OrigV)) {
+      unsigned BitWidth = ThenV->getType()->getIntegerBitWidth();
+
+      // Don't try to simplify this phi node if 'ThenV' is a cttz/ctlz
+      // intrinsic call, but 'OrigV' is not equal to the 'size-of' in bits
+      // of the value in input to the cttz/ctlz.
+      if (CInt->getValue() != BitWidth)
+        return false;
+
+      // Hoist the call to cttz/ctlz from ThenBB into EntryBB.
+      EntryBB->getInstList().splice(BrInst, ThenBB->getInstList(),
+                                    ThenBB->begin(), std::prev(ThenBB->end()));
+ 
+      // Update PN setting ThenV as the incoming value from both 'EntryBB'
+      // and 'ThenBB'. Eventually, method 'OptimizeInst' will fold this
+      // phi node if all the incoming values are the same.
+      PN->setIncomingValue(PN->getBasicBlockIndex(EntryBB), ThenV);
+      PN->setIncomingValue(PN->getBasicBlockIndex(ThenBB), ThenV);
+
+      // Clear the 'undef on zero' flag of the cttz/ctlz intrinsic call.
+      if (cast<ConstantInt>(II->getArgOperand(1))->isOne()) {
+        Type *Ty = II->getArgOperand(0)->getType();
+        Value *Args[] = { II->getArgOperand(0),
+                          ConstantInt::getFalse(II->getContext()) };
+        Module *M = EntryBB->getParent()->getParent();
+        Value *IF = Intrinsic::getDeclaration(M, II->getIntrinsicID(), Ty);
+        IRBuilder<> Builder(BrInst);
+        Instruction *NewI = Builder.CreateCall(IF, Args);
+
+        // Replace the old call to cttz/ctlz.
+        II->replaceAllUsesWith(NewI);
+        II->eraseFromParent();
+      }
+ 
+      // Update BrInst condition so that the branch to EndBB is always taken.
+      // Later on, method 'ConstantFoldTerminator' will simplify this branch
+      // replacing it with a direct branch to 'EndBB'.
+      // As a side effect, CodeGenPrepare will attempt to simplify the control
+      // flow graph by deleting basic block 'ThenBB' and merging 'EntryBB' into
+      // 'EndBB' (calling method 'EliminateFallThrough').
+      BrInst->setCondition(ConstantInt::getTrue(BrInst->getContext()));
+      return true;
+    }
+  }
+
+  return false;
+}
+
  /// Some targets can do store(extractelement) with one instruction.
  /// Try to push the extractelement towards the stores when the target
  /// has this feature and this is profitable.
@@ -4130,6 +4243,34 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
    if (isa<ExtractElementInst>(I))
      return OptimizeExtractElementInst(I);
  
+  if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+    if (TLI && BI->isConditional() && BI->getCondition()->hasOneUse()) {
+      // Check if the branch condition compares a value agaist zero.
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
+        if (ICI->getPredicate() == ICmpInst::ICMP_EQ &&
+            match(ICI->getOperand(1), m_Zero())) {
+          BasicBlock *ThenBB = BI->getSuccessor(1);
+          BasicBlock *EndBB = BI->getSuccessor(0);
+
+          // Check if ThenBB is only reachable from this basic block; also,
+          // check if EndBB has more than one predecessor.
+          if (ThenBB->getSinglePredecessor() &&
+              !EndBB->getSinglePredecessor()) {
+            TerminatorInst *TI = ThenBB->getTerminator();
+
+            if (TI->getNumSuccessors() == 1 && TI->getSuccessor(0) == EndBB &&
+                // Try to speculate calls to intrinsic cttz/ctlz from 'ThenBB'.
+                OptimizeBranchInst(BI, *TLI)) {
+              ModifiedDT = true;
+              return true;
+            }
+          }
+        }
+      }
+    }
+    return false;
+  }
+
    return false;
  }
  
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 77a1d6e69febe440ec063328ebaba000dd8f67e9..c966642ffff30ee7ebe7f76f6148db9a78090df9 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3882,6 +3882,16 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
    return (Index == 0 || Index == ResVT.getVectorNumElements());
  }
  
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+  // Don't try to speculate cttz if we can't directly use TZCNT.
+  return Subtarget->hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+  // Don't try to speculate ctlz if we can't directly use LZCNT.
+  return Subtarget->hasLZCNT();
+}
+
  /// isUndefOrInRange - Return true if Val is undef or if its value falls within
  /// the specified range (L, H].
  static bool isUndefOrInRange(int Val, int Low, int Hi) {
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h

index c5e3e14c545c5a0089eaefdf75662a716da902a4..0b01bae9e6734814fa6366496e99bc672b82b1f0 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -633,6 +633,10 @@ namespace llvm {
      /// This method returns the name of a target specific DAG node.
      const char *getTargetNodeName(unsigned Opcode) const override;
  
+    bool isCheapToSpeculateCttz() const override;
+
+    bool isCheapToSpeculateCtlz() const override;
+
      /// Return the value type to use for ISD::SETCC.
      EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
  
diff --git a/test/CodeGen/X86/cttz-ctlz.ll b/test/CodeGen/X86/cttz-ctlz.ll

new file mode 100644 (file)

index 0000000..88dcf65
--- /dev/null
+++ b/test/CodeGen/X86/cttz-ctlz.ll
@@ -0,0 +1,250 @@
+; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s --check-prefix=ALL --check-prefix=BMI
+; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+lzcnt < %s | FileCheck %s --check-prefix=ALL --check-prefix=LZCNT
+; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=GENERIC
+
+
+define i64 @test1(i64 %A) {
+; ALL-LABEL: @test1(
+; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i64 @llvm.ctlz.i64(i64 %A, i1 false)
+; LZCNT-NEXT: ret i64 [[CTLZ]]
+; BMI: icmp eq i64 %A, 0
+; BMI: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+; GENERIC: icmp eq i64 %A, 0
+; GENERIC: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2(i32 %A) {
+; ALL-LABEL: @test2(
+; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i32 @llvm.ctlz.i32(i32 %A, i1 false)
+; LZCNT-NEXT: ret i32 [[CTLZ]]
+; BMI: icmp eq i32 %A, 0
+; BMI: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+; GENERIC: icmp eq i32 %A, 0
+; GENERIC: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3(i16 signext %A) {
+; ALL-LABEL: @test3(
+; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i16 @llvm.ctlz.i16(i16 %A, i1 false)
+; LZCNT-NEXT: ret i16 [[CTLZ]]
+; BMI: icmp eq i16 %A, 0
+; BMI: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+; GENERIC: icmp eq i16 %A, 0
+; GENERIC: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1b(i64 %A) {
+; ALL-LABEL: @test1b(
+; LZCNT: icmp eq i64 %A, 0
+; LZCNT: call i64 @llvm.cttz.i64(i64 %A, i1 true)
+; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i64 @llvm.cttz.i64(i64 %A, i1 false)
+; BMI-NEXT: ret i64 [[CTTZ]]
+; GENERIC: icmp eq i64 %A, 0
+; GENERIC: call i64 @llvm.cttz.i64(i64 %A, i1 true)
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2b(i32 %A) {
+; ALL-LABEL: @test2b(
+; LZCNT: icmp eq i32 %A, 0
+; LZCNT: call i32 @llvm.cttz.i32(i32 %A, i1 true)
+; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %A, i1 false)
+; BMI-NEXT: ret i32 [[CTTZ]]
+; GENERIC: icmp eq i32 %A, 0
+; GENERIC: call i32 @llvm.cttz.i32(i32 %A, i1 true)
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3b(i16 signext %A) {
+; ALL-LABEL: @test3b(
+; LZCNT: icmp eq i16 %A, 0
+; LZCNT: call i16 @llvm.cttz.i16(i16 %A, i1 true)
+; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i16 @llvm.cttz.i16(i16 %A, i1 false)
+; BMI-NEXT: ret i16 [[CTTZ]]
+; GENERIC: icmp eq i16 %A, 0
+; GENERIC: call i16 @llvm.cttz.i16(i16 %A, i1 true)
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1c(i64 %A) {
+; ALL-LABEL: @test1c(
+; ALL: icmp eq i64 %A, 0
+; ALL: call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
+  ret i64 %cond
+}
+
+define i32 @test2c(i32 %A) {
+; ALL-LABEL: @test2c(
+; ALL: icmp eq i32 %A, 0
+; ALL: call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3c(i16 signext %A) {
+; ALL-LABEL: @test3c(
+; ALL: icmp eq i16 %A, 0
+; ALL: call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
+  ret i16 %cond
+}
+
+
+define i64 @test1d(i64 %A) {
+; ALL-LABEL: @test1d(
+; ALL: icmp eq i64 %A, 0
+; ALL: call i64 @llvm.cttz.i64(i64 %A, i1 true)
+entry:
+  %tobool = icmp eq i64 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ]
+  ret i64 %cond
+}
+
+
+define i32 @test2d(i32 %A) {
+; ALL-LABEL: @test2d(
+; ALL: icmp eq i32 %A, 0
+; ALL: call i32 @llvm.cttz.i32(i32 %A, i1 true)
+entry:
+  %tobool = icmp eq i32 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ]
+  ret i32 %cond
+}
+
+
+define signext i16 @test3d(i16 signext %A) {
+; ALL-LABEL: @test3d(
+; ALL: icmp eq i16 %A, 0
+; ALL: call i16 @llvm.cttz.i16(i16 %A, i1 true)
+entry:
+  %tobool = icmp eq i16 %A, 0
+  br i1 %tobool, label %cond.end, label %cond.true
+
+cond.true:                                        ; preds = %entry
+  %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true)
+  br label %cond.end
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ]
+  ret i16 %cond
+}
+
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
author	Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
	Sun, 28 Dec 2014 11:07:35 +0000 (11:07 +0000)
committer	Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
	Sun, 28 Dec 2014 11:07:35 +0000 (11:07 +0000)
include/llvm/Target/TargetLowering.h		patch \| blob \| history
lib/CodeGen/CodeGenPrepare.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
test/CodeGen/X86/cttz-ctlz.ll	[new file with mode: 0644]	patch \| blob