From eb653bd9d1acc484934bc10d078f6f1fdc08441d Mon Sep 17 00:00:00 2001
From: Jingyue Wu <jingyue@google.com>
Date: Thu, 23 Jul 2015 04:59:07 +0000
Subject: [PATCH] [NVPTX] run LSR before straight-line optimizations

Summary:
Straight-line optimizations can simplify the loop body and make LSR's
cost analysis more precise. This significantly improves several Eigen3
CUDA benchmarks.

With this change, EigenContractionKernel runs up to 40% faster
(https://bitbucket.org/eigen/eigen/src/753ceee5f206ff7dde9f6a41a5a420749fc9406f/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h?at=default#cl-502).
EigenConvolutionKernel2D runs up to 10% faster
(https://bitbucket.org/eigen/eigen/src/753ceee5f206ff7dde9f6a41a5a420749fc9406f/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h?at=default#cl-605).

I have some difficulties writing small tests that benefit from this
reordering due to a seemingly issue with LSR (being discussed at
http://lists.cs.uiuc.edu/pipermail/llvmdev/2015-July/088244.html).

See the review thread for the compilation time impact of GVN.

Reviewers: eliben, jholewinski

Subscribers: llvm-commits, jholewinski

Differential Revision: http://reviews.llvm.org/D11304

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242982 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/NVPTX/NVPTXTargetMachine.cpp | 37 +++++++++++++++++++++----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index c6944845a15..706314c9ed3 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -141,6 +141,10 @@ public:
   FunctionPass *createTargetRegisterAllocator(bool) override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
   void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+
+private:
+  // if the opt level is aggressive, add GVN; otherwise, add EarlyCSE.
+  void addEarlyCSEOrGVNPass();
 };
 } // end anonymous namespace
 
@@ -155,6 +159,13 @@ TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
   });
 }
 
+void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+}
+
 void NVPTXPassConfig::addIRPasses() {
   // The following passes are known to not play well with virtual regs hanging
   // around after register allocation (which in our case, is *all* registers).
@@ -166,9 +177,10 @@ void NVPTXPassConfig::addIRPasses() {
   disablePass(&TailDuplicateID);
 
   addPass(createNVPTXImageOptimizerPass());
-  TargetPassConfig::addIRPasses();
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
+
+  // === Propagate special address spaces ===
   addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
   // NVPTXLowerKernelArgs emits alloca for byval parameters which can often
   // be eliminated by SROA.
@@ -179,6 +191,8 @@ void NVPTXPassConfig::addIRPasses() {
   // them unused. We could remove dead code in an ad-hoc manner, but that
   // requires manual work and might be error-prone.
   addPass(createDeadCodeEliminationPass());
+
+  // === Straight-line scalar optimizations ===
   addPass(createSeparateConstOffsetFromGEPPass());
   addPass(createSpeculativeExecutionPass());
   // ReassociateGEPs exposes more opportunites for SLSR. See
@@ -187,15 +201,28 @@ void NVPTXPassConfig::addIRPasses() {
   // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
   // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
   // for some of our benchmarks.
-  if (getOptLevel() == CodeGenOpt::Aggressive)
-    addPass(createGVNPass());
-  else
-    addPass(createEarlyCSEPass());
+  addEarlyCSEOrGVNPass();
   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
   addPass(createNaryReassociatePass());
   // NaryReassociate on GEPs creates redundant common expressions, so run
   // EarlyCSE after it.
   addPass(createEarlyCSEPass());
+
+  // === LSR and other generic IR passes ===
+  TargetPassConfig::addIRPasses();
+  // EarlyCSE is not always strong enough to clean up what LSR produces. For
+  // example, GVN can combine
+  //
+  //   %0 = add %a, %b
+  //   %1 = add %b, %a
+  //
+  // and
+  //
+  //   %0 = shl nsw %a, 2
+  //   %1 = shl %a, 2
+  //
+  // but EarlyCSE can do neither of them.
+  addEarlyCSEOrGVNPass();
 }
 
 bool NVPTXPassConfig::addInstSelector() {
-- 
2.34.1