Add straight-line strength reduction to LLVM

author Jingyue Wu <jingyue@google.com>

Tue, 3 Feb 2015 19:37:06 +0000 (19:37 +0000)

committer Jingyue Wu <jingyue@google.com>

Tue, 3 Feb 2015 19:37:06 +0000 (19:37 +0000)
author Jingyue Wu <jingyue@google.com>
Tue, 3 Feb 2015 19:37:06 +0000 (19:37 +0000)
committer Jingyue Wu <jingyue@google.com>
Tue, 3 Feb 2015 19:37:06 +0000 (19:37 +0000)
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h

index 1085e0e942de4d5c526974d680460ef694d07cf5..7f66589fcd2c36ab2067c79839651bd466508747 100644 (file)
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -254,6 +254,7 @@ void initializeSpillPlacementPass(PassRegistry&);
  void initializeStackProtectorPass(PassRegistry&);
  void initializeStackColoringPass(PassRegistry&);
  void initializeStackSlotColoringPass(PassRegistry&);
+void initializeStraightLineStrengthReducePass(PassRegistry &);
  void initializeStripDeadDebugInfoPass(PassRegistry&);
  void initializeStripDeadPrototypesPassPass(PassRegistry&);
  void initializeStripDebugDeclarePass(PassRegistry&);
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h

index bce4c0679a100f35e522e5697369267d5f4e0247..0e5fb0d2609b95674042ed5e121b500a90a63560 100644 (file)
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -167,6 +167,7 @@ namespace {
        (void) llvm::createScalarizerPass();
        (void) llvm::createSeparateConstOffsetFromGEPPass();
        (void) llvm::createRewriteSymbolsPass();
+      (void) llvm::createStraightLineStrengthReducePass();
  
        (void)new llvm::IntervalPartition();
        (void)new llvm::ScalarEvolution();
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h

index 83158596a214c9fbcd9422d83140ff8ffe080223..c13d03f4e9f88c98d30901b704dd3405f0ae67c8 100644 (file)
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -412,6 +412,8 @@ createSeparateConstOffsetFromGEPPass(const TargetMachine *TM = nullptr,
  //
  BasicBlockPass *createLoadCombinePass();
  
+FunctionPass *createStraightLineStrengthReducePass();
+
  } // End llvm namespace
  
  #endif
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp

index 371d34d1fbd20403d1e6c3f4305e077b7715360c..5734d885be637770e24d7021ae2ed86e7fb2571d 100644 (file)
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -158,6 +158,7 @@ void NVPTXPassConfig::addIRPasses() {
    addPass(createNVPTXAssignValidGlobalNamesPass());
    addPass(createGenericToNVVMPass());
    addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+  addPass(createStraightLineStrengthReducePass());
    addPass(createSeparateConstOffsetFromGEPPass());
    // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used
    // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt

index 1279e83d865dd32d07d964d0d22e217e776e09dc..c549b5811b0495aba738cb880b44a45977d53a2a 100644 (file)
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -38,6 +38,7 @@ add_llvm_library(LLVMScalarOpts
    SeparateConstOffsetFromGEP.cpp
    SimplifyCFGPass.cpp
    Sink.cpp
+  StraightLineStrengthReduce.cpp
    StructurizeCFG.cpp
    TailRecursionElimination.cpp
    )
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp

index e08aa5c0499e9986f939a6ff6b1c6e7191e267bf..47554dd48a29428b93b9b9ecbafdb6062f8896c2 100644 (file)
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -69,6 +69,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
    initializeSinkingPass(Registry);
    initializeTailCallElimPass(Registry);
    initializeSeparateConstOffsetFromGEPPass(Registry);
+  initializeStraightLineStrengthReducePass(Registry);
    initializeLoadCombinePass(Registry);
  }
  
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp

new file mode 100644 (file)

index 0000000..a0e2182
--- /dev/null
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -0,0 +1,274 @@
+//===-- StraightLineStrengthReduce.cpp - ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidate in the form of
+//
+// (B + i) * S
+//
+// where B and S are integer constants or variables, and i is a constant
+// integer. If we found two such candidates
+//
+// S1: X = (B + i) * S S2: Y = (B + i') * S
+//
+// and S1 dominates S2, we call S1 a basis of S2, and can replace S2 with
+//
+// Y = X + (i' - i) * S
+//
+// where (i' - i) * S is folded to the extent possible. When S2 has multiple
+// bases, we pick the one that is closest to S2, or S2's "immediate" basis.
+//
+// TODO:
+//
+// - Handle candidates in the form of B + i * S
+//
+// - Handle candidates in the form of pointer arithmetics. e.g., B[i * S]
+//
+// - Floating point arithmetics when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+//   sensitive to ILP may want to disable it. Having SLSR to consider ILP is
+//   left as future work.
+#include <vector>
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+class StraightLineStrengthReduce : public FunctionPass {
+ public:
+  // SLSR candidate. Such a candidate must be in the form of
+  //   (Base + Index) * Stride
+  struct Candidate : public ilist_node<Candidate> {
+    Candidate(Value *B = nullptr, ConstantInt *Idx = nullptr,
+              Value *S = nullptr, Instruction *I = nullptr)
+        : Base(B), Index(Idx), Stride(S), Ins(I), Basis(nullptr) {}
+    Value *Base;
+    ConstantInt *Index;
+    Value *Stride;
+    // The instruction this candidate corresponds to. It helps us to rewrite a
+    // candidate with respect to its immediate basis. Note that one instruction
+    // can corresponds to multiple candidates depending on how you associate the
+    // expression. For instance,
+    //
+    // (a + 1) * (b + 2)
+    //
+    // can be treated as
+    //
+    // <Base: a, Index: 1, Stride: b + 2>
+    //
+    // or
+    //
+    // <Base: b, Index: 2, Stride: a + 1>
+    Instruction *Ins;
+    // Points to the immediate basis of this candidate, or nullptr if we cannot
+    // find any basis for this candidate.
+    Candidate *Basis;
+  };
+
+  static char ID;
+
+  StraightLineStrengthReduce() : FunctionPass(ID), DT(nullptr) {
+    initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    // We do not modify the shape of the CFG.
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override;
+
+ private:
+  // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+  // share the same base and stride.
+  bool isBasisFor(const Candidate &Basis, const Candidate &C);
+  // Checks whether I is in a candidate form. If so, adds all the matching forms
+  // to Candidates, and tries to find the immediate basis for each of them.
+  void allocateCandidateAndFindBasis(Instruction *I);
+  // Given that I is in the form of "(B + Idx) * S", adds this form to
+  // Candidates, and finds its immediate basis.
+  void allocateCandidateAndFindBasis(Value *B, ConstantInt *Idx, Value *S,
+                                     Instruction *I);
+  // Rewrites candidate C with respect to Basis.
+  void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
+  DominatorTree *DT;
+  ilist<Candidate> Candidates;
+  // Temporarily holds all instructions that are unlinked (but not deleted) by
+  // rewriteCandidateWithBasis. These instructions will be actually removed
+  // after all rewriting finishes.
+  DenseSet<Instruction *> UnlinkedInstructions;
+};
+}  // anonymous namespace
+
+char StraightLineStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
+                      "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
+                    "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() {
+  return new StraightLineStrengthReduce();
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+                                            const Candidate &C) {
+  return (Basis.Ins != C.Ins && // skip the same instruction
+          // Basis must dominate C in order to rewrite C with respect to Basis.
+          DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+          // They share the same base and stride.
+          Basis.Base == C.Base &&
+          Basis.Stride == C.Stride);
+}
+
+// TODO: We currently implement an algorithm whose time complexity is linear to
+// the number of existing candidates. However, a better algorithm exists. We
+// could depth-first search the dominator tree, and maintain a hash table that
+// contains all candidates that dominate the node being traversed.  This hash
+// table is indexed by the base and the stride of a candidate.  Therefore,
+// finding the immediate basis of a candidate boils down to one hash-table look
+// up.
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B,
+                                                               ConstantInt *Idx,
+                                                               Value *S,
+                                                               Instruction *I) {
+  Candidate C(B, Idx, S, I);
+  // Try to compute the immediate basis of C.
+  unsigned NumIterations = 0;
+  // Limit the scan radius to avoid running forever.
+  static const int MaxNumIterations = 50;
+  for (auto Basis = Candidates.rbegin();
+       Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+       ++Basis, ++NumIterations) {
+    if (isBasisFor(*Basis, C)) {
+      C.Basis = &(*Basis);
+      break;
+    }
+  }
+  // Regardless of whether we find a basis for C, we need to push C to the
+  // candidate list.
+  Candidates.push_back(C);
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Instruction *I) {
+  Value *B = nullptr;
+  ConstantInt *Idx = nullptr;
+  // "(Base + Index) * Stride" must be a Mul instruction at the first hand.
+  if (I->getOpcode() == Instruction::Mul) {
+    if (IntegerType *ITy = dyn_cast<IntegerType>(I->getType())) {
+      Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+      for (unsigned Swapped = 0; Swapped < 2; ++Swapped) {
+        // Only handle the canonical operand ordering.
+        if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) {
+          // If LHS is in the form of "Base + Index", then I is in the form of
+          // "(Base + Index) * RHS".
+          allocateCandidateAndFindBasis(B, Idx, RHS, I);
+        } else {
+          // Otherwise, at least try the form (LHS + 0) * RHS.
+          allocateCandidateAndFindBasis(LHS, ConstantInt::get(ITy, 0), RHS, I);
+        }
+        // Swap LHS and RHS so that we also cover the cases where LHS is the
+        // stride.
+        if (LHS == RHS)
+          break;
+        std::swap(LHS, RHS);
+      }
+    }
+  }
+}
+
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+    const Candidate &C, const Candidate &Basis) {
+  // An instruction can correspond to multiple candidates. Therefore, instead of
+  // simply deleting an instruction when we rewrite it, we mark its parent as
+  // nullptr (i.e. unlink it) so that we can skip the candidates whose
+  // instruction is already rewritten.
+  if (!C.Ins->getParent())
+    return;
+  assert(C.Base == Basis.Base && C.Stride == Basis.Stride);
+  // Basis = (B + i) * S
+  // C     = (B + i') * S
+  //   ==>
+  // C     = Basis + (i' - i) * S
+  IRBuilder<> Builder(C.Ins);
+  ConstantInt *IndexOffset = ConstantInt::get(
+      C.Ins->getContext(), C.Index->getValue() - Basis.Index->getValue());
+  Value *Reduced;
+  // TODO: preserve nsw/nuw in some cases.
+  if (IndexOffset->isOne()) {
+    // If (i' - i) is 1, fold C into Basis + S.
+    Reduced = Builder.CreateAdd(Basis.Ins, C.Stride);
+  } else if (IndexOffset->isMinusOne()) {
+    // If (i' - i) is -1, fold C into Basis - S.
+    Reduced = Builder.CreateSub(Basis.Ins, C.Stride);
+  } else {
+    Value *Bump = Builder.CreateMul(C.Stride, IndexOffset);
+    Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+  }
+  Reduced->takeName(C.Ins);
+  C.Ins->replaceAllUsesWith(Reduced);
+  C.Ins->dropAllReferences();
+  // Unlink C.Ins so that we can skip other candidates also corresponding to
+  // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+  C.Ins->removeFromParent();
+  UnlinkedInstructions.insert(C.Ins);
+}
+
+bool StraightLineStrengthReduce::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  // Traverse the dominator tree in the depth-first order. This order makes sure
+  // all bases of a candidate are in Candidates when we process it.
+  for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
+       node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
+    BasicBlock *B = node->getBlock();
+    for (auto I = B->begin(); I != B->end(); ++I) {
+      allocateCandidateAndFindBasis(I);
+    }
+  }
+
+  // Rewrite candidates in the reverse depth-first order. This order makes sure
+  // a candidate being rewritten is not a basis for any other candidate.
+  while (!Candidates.empty()) {
+    const Candidate &C = Candidates.back();
+    if (C.Basis != nullptr) {
+      rewriteCandidateWithBasis(C, *C.Basis);
+    }
+    Candidates.pop_back();
+  }
+
+  // Delete all unlink instructions.
+  for (auto I : UnlinkedInstructions) {
+    delete I;
+  }
+  bool Ret = !UnlinkedInstructions.empty();
+  UnlinkedInstructions.clear();
+  return Ret;
+}
diff --git a/test/Transforms/StraightLineStrengthReduce/slsr.ll b/test/Transforms/StraightLineStrengthReduce/slsr.ll

new file mode 100644 (file)

index 0000000..951cbb0
--- /dev/null
+++ b/test/Transforms/StraightLineStrengthReduce/slsr.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -slsr -gvn -dce -S | FileCheck %s
+
+declare i32 @foo(i32 %a)
+
+define i32 @slsr1(i32 %b, i32 %s) {
+; CHECK-LABEL: @slsr1(
+  ; v0 = foo(b * s);
+  %mul0 = mul i32 %b, %s
+; CHECK: mul i32
+; CHECK-NOT: mul i32
+  %v0 = call i32 @foo(i32 %mul0)
+
+  ; v1 = foo((b + 1) * s);
+  %b1 = add i32 %b, 1
+  %mul1 = mul i32 %b1, %s
+  %v1 = call i32 @foo(i32 %mul1)
+
+  ; v2 = foo((b + 2) * s);
+  %b2 = add i32 %b, 2
+  %mul2 = mul i32 %b2, %s
+  %v2 = call i32 @foo(i32 %mul2)
+
+  ; return v0 + v1 + v2;
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  ret i32 %2
+}
+
+; v0 = foo(a * b)
+; v1 = foo((a + 1) * b)
+; v2 = foo(a * (b + 1))
+; v3 = foo((a + 1) * (b + 1))
+define i32 @slsr2(i32 %a, i32 %b) {
+; CHECK-LABEL: @slsr2(
+  %a1 = add i32 %a, 1
+  %b1 = add i32 %b, 1
+  %mul0 = mul i32 %a, %b
+; CHECK: mul i32
+; CHECK-NOT: mul i32
+  %mul1 = mul i32 %a1, %b
+  %mul2 = mul i32 %a, %b1
+  %mul3 = mul i32 %a1, %b1
+
+  %v0 = call i32 @foo(i32 %mul0)
+  %v1 = call i32 @foo(i32 %mul1)
+  %v2 = call i32 @foo(i32 %mul2)
+  %v3 = call i32 @foo(i32 %mul3)
+
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  %3 = add i32 %2, %v3
+  ret i32 %3
+}
+
+; The bump is a multiple of the stride.
+;
+; v0 = foo(b * s);
+; v1 = foo((b + 2) * s);
+; v2 = foo((b + 4) * s);
+; return v0 + v1 + v2;
+;
+; ==>
+;
+; mul0 = b * s;
+; v0 = foo(mul0);
+; bump = s * 2;
+; mul1 = mul0 + bump; // GVN ensures mul1 and mul2 use the same bump.
+; v1 = foo(mul1);
+; mul2 = mul1 + bump;
+; v2 = foo(mul2);
+; return v0 + v1 + v2;
+define i32 @slsr3(i32 %b, i32 %s) {
+; CHECK-LABEL: @slsr3(
+  %mul0 = mul i32 %b, %s
+; CHECK: mul i32
+  %v0 = call i32 @foo(i32 %mul0)
+
+  %b1 = add i32 %b, 2
+  %mul1 = mul i32 %b1, %s
+; CHECK: [[BUMP:%[a-zA-Z0-9]+]] = mul i32 %s, 2
+; CHECK: %mul1 = add i32 %mul0, [[BUMP]]
+  %v1 = call i32 @foo(i32 %mul1)
+
+  %b2 = add i32 %b, 4
+  %mul2 = mul i32 %b2, %s
+; CHECK: %mul2 = add i32 %mul1, [[BUMP]]
+  %v2 = call i32 @foo(i32 %mul2)
+
+  %1 = add i32 %v0, %v1
+  %2 = add i32 %1, %v2
+  ret i32 %2
+}
+
+; Do not rewrite a candidate if its potential basis does not dominate it.
+; v0 = 0;
+; if (cond)
+;   v0 = foo(a * b);
+; v1 = foo((a + 1) * b);
+; return v0 + v1;
+define i32 @not_dominate(i1 %cond, i32 %a, i32 %b) {
+; CHECK-LABEL: @not_dominate(
+entry:
+  %a1 = add i32 %a, 1
+  br i1 %cond, label %then, label %merge
+
+then:
+  %mul0 = mul i32 %a, %b
+; CHECK: %mul0 = mul i32 %a, %b
+  %v0 = call i32 @foo(i32 %mul0)
+  br label %merge
+
+merge:
+  %v0.phi = phi i32 [ 0, %entry ], [ %mul0, %then ]
+  %mul1 = mul i32 %a1, %b
+; CHECK: %mul1 = mul i32 %a1, %b
+  %v1 = call i32 @foo(i32 %mul1)
+  %sum = add i32 %v0.phi, %v1
+  ret i32 %sum
+}
author	Jingyue Wu <jingyue@google.com>
	Tue, 3 Feb 2015 19:37:06 +0000 (19:37 +0000)
committer	Jingyue Wu <jingyue@google.com>
	Tue, 3 Feb 2015 19:37:06 +0000 (19:37 +0000)
include/llvm/InitializePasses.h		patch \| blob \| history
include/llvm/LinkAllPasses.h		patch \| blob \| history
include/llvm/Transforms/Scalar.h		patch \| blob \| history
lib/Target/NVPTX/NVPTXTargetMachine.cpp		patch \| blob \| history
lib/Transforms/Scalar/CMakeLists.txt		patch \| blob \| history
lib/Transforms/Scalar/Scalar.cpp		patch \| blob \| history
lib/Transforms/Scalar/StraightLineStrengthReduce.cpp	[new file with mode: 0644]	patch \| blob
test/Transforms/StraightLineStrengthReduce/slsr.ll	[new file with mode: 0644]	patch \| blob