LSR: rewrite inner loops only.

author Andrew Trick <atrick@apple.com>

Thu, 29 Sep 2011 01:33:38 +0000 (01:33 +0000)

committer Andrew Trick <atrick@apple.com>

Thu, 29 Sep 2011 01:33:38 +0000 (01:33 +0000)
author Andrew Trick <atrick@apple.com>
Thu, 29 Sep 2011 01:33:38 +0000 (01:33 +0000)
committer Andrew Trick <atrick@apple.com>
Thu, 29 Sep 2011 01:33:38 +0000 (01:33 +0000)
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp

index f9c18c8ae2b27116f6d9992432af2596caa79326..49228a88cd32b7a0f75d4f9921be941a6cd25bbd 100644 (file)
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -78,6 +78,9 @@
  using namespace llvm;
  
  namespace llvm {
+cl::opt<bool> EnableNested(
+  "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops"));
+
  cl::opt<bool> EnableRetry(
      "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry"));
  }
@@ -723,11 +726,14 @@ void Cost::RateRegister(const SCEV *Reg,
      if (AR->getLoop() == L)
        AddRecCost += 1; /// TODO: This should be a function of the stride.
  
-    // If this is an addrec for a loop that's already been visited by LSR,
-    // don't second-guess its addrec phi nodes. LSR isn't currently smart
-    // enough to reason about more than one loop at a time. Consider these
-    // registers free and leave them alone.
-    else if (L->contains(AR->getLoop()) ||
+    // If this is an addrec for another loop, don't second-guess its addrec phi
+    // nodes. LSR isn't currently smart enough to reason about more than one
+    // loop at a time. LSR has already run on inner loops, will not run
+    // on other loops, and cannot be expected to change sibling loops. If the
+    // AddRec exists, consider its register free and leave it alone. Otherwise,
+    // do not consider this formula at all.
+    // FIXME: why do we need to generate such formulae?
+    else if (!EnableNested || L->contains(AR->getLoop()) ||
               (!AR->getLoop()->contains(L) &&
                DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
        for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
@@ -738,6 +744,10 @@ void Cost::RateRegister(const SCEV *Reg,
              SE.getSCEV(PN) == AR)
            return;
        }
+      if (!EnableNested) {
+        Loose();
+        return;
+      }
        // If this isn't one of the addrecs that the loop already has, it
        // would require a costly new phi and add. TODO: This isn't
        // precisely modeled right now.
@@ -3801,6 +3811,12 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
    // If loop preparation eliminates all interesting IV users, bail.
    if (IU.empty()) return;
  
+  // Skip nested loops until we can model them better with formulae.
+  if (!EnableNested && !L->empty()) {
+    DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+    return false;
+  }
+
    // Start collecting data and preparing for the solver.
    CollectInterestingTypesAndFactors();
    CollectFixupsAndInitialFormulae();
diff --git a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll

index 140b02cb39e0a2b202e8621418213bcfa99a7cca..47379016cf14946fa4404e412d58b9dc9cfb2915 100644 (file)
--- a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
+++ b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8  -enable-lsr-nested < %s | FileCheck %s
  
  ; LSR should recognize that this is an unrolled loop which can use
  ; constant offset addressing, so that each of the following stores
@@ -8,6 +8,9 @@
  ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #64]
  ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #96]
  
+; We can also save a register in the outer loop, but that requires
+; performing LSR on the outer loop.
+
  target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
  
  %0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* }
diff --git a/test/CodeGen/X86/change-compare-stride-0.ll b/test/CodeGen/X86/change-compare-stride-0.ll

index 3a383ee9c1d6bca7f41c172ef9e7a49057d7887a..439f7b0d4f6c70472c043a21acc493021f6072dc 100644 (file)
--- a/test/CodeGen/X86/change-compare-stride-0.ll
+++ b/test/CodeGen/X86/change-compare-stride-0.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -enable-lsr-nested | FileCheck %s
+;
+; Nested LSR is required to optimize this case.
+; We do not expect to see this form of IR without -enable-iv-rewrite.
  
  define void @borf(i8* nocapture %in, i8* nocapture %out) nounwind {
  ; CHECK: borf:
diff --git a/test/CodeGen/X86/change-compare-stride-1.ll b/test/CodeGen/X86/change-compare-stride-1.ll

index eee3b79acfafbee6b48a8cff9bab86918b2c0fd1..8b53ae2817c643f6310e0f6be4da818212d6c271 100644 (file)
--- a/test/CodeGen/X86/change-compare-stride-1.ll
+++ b/test/CodeGen/X86/change-compare-stride-1.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -enable-lsr-nested | FileCheck %s
+;
+; Nested LSR is required to optimize this case.
+; We do not expect to see this form of IR without -enable-iv-rewrite.
  
  define void @borf(i8* nocapture %in, i8* nocapture %out) nounwind {
  ; CHECK: borf:
diff --git a/test/CodeGen/X86/iv-users-in-other-loops.ll b/test/CodeGen/X86/iv-users-in-other-loops.ll

index 8385a29fa22bb085f497012161bcfd462b6a52ef..8f79fb8cde27b1fb7933107d78a70c8f33878657 100644 (file)
--- a/test/CodeGen/X86/iv-users-in-other-loops.ll
+++ b/test/CodeGen/X86/iv-users-in-other-loops.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -o %t
+; RUN: llc < %s -march=x86-64 -enable-lsr-nested -o %t
  ; RUN: not grep inc %t
  ; RUN: grep dec %t | count 2
  ; RUN: grep addq %t | count 12
@@ -11,6 +11,10 @@
  ; to insert new induction variables. Previously it would create a
  ; flood of new induction variables.
  ; Also, the loop reversal should kick in once.
+;
+; In this example, performing LSR on the entire loop nest,
+; as opposed to only the inner loop, can further reduce induction variables,
+; and their related instructions and registers.
  
  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
  target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/loop-strength-reduce3.ll b/test/CodeGen/X86/loop-strength-reduce3.ll

index c45a37411194aeba09f6385b45e560cf520c3447..d6c265f329a1235643e4816541b6b08028ba4a79 100644 (file)
--- a/test/CodeGen/X86/loop-strength-reduce3.ll
+++ b/test/CodeGen/X86/loop-strength-reduce3.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | grep cmp | grep 240
-; RUN: llc < %s -march=x86 | grep inc | count 1
+; RUN: llc < %s -march=x86 -enable-lsr-nested | grep cmp | grep 240
+; RUN: llc < %s -march=x86 -enable-lsr-nested | grep inc | count 1
  
  define i32 @foo(i32 %A, i32 %B, i32 %C, i32 %D) nounwind {
  entry:
diff --git a/test/CodeGen/X86/pr3495.ll b/test/CodeGen/X86/pr3495.ll

index 3dcd6da0bb0efeb496210e7734264c9b007c621c..7efd35b8b6d02eed1c21e913f4afaeb559b3cdc5 100644 (file)
--- a/test/CodeGen/X86/pr3495.ll
+++ b/test/CodeGen/X86/pr3495.ll
@@ -1,7 +1,9 @@
-; RUN: llc < %s -march=x86 -stats -regalloc=linearscan |& grep {Number of loads added} | grep 2
-; RUN: llc < %s -march=x86 -stats -regalloc=linearscan |& grep {Number of spill slots allocated} | grep 1
-; RUN: llc < %s -march=x86 -stats -regalloc=linearscan |& grep {Number of machine instrs printed} | grep 34
+; RUN: llc < %s -march=x86 -stats -regalloc=linearscan -enable-lsr-nested |& grep {Number of loads added} | grep 2
+; RUN: llc < %s -march=x86 -stats -regalloc=linearscan -enable-lsr-nested |& grep {Number of spill slots allocated} | grep 1
+; RUN: llc < %s -march=x86 -stats -regalloc=linearscan -enable-lsr-nested |& grep {Number of machine instrs printed} | grep 34
  ; PR3495
+;
+; Note: this should not spill at all with either good LSR or good regalloc.
  
  target triple = "i386-pc-linux-gnu"
  @x = external global [8 x i32], align 32               ; <[8 x i32]*> [#uses=1]
author	Andrew Trick <atrick@apple.com>
	Thu, 29 Sep 2011 01:33:38 +0000 (01:33 +0000)
committer	Andrew Trick <atrick@apple.com>
	Thu, 29 Sep 2011 01:33:38 +0000 (01:33 +0000)
lib/Transforms/Scalar/LoopStrengthReduce.cpp		patch \| blob \| history
test/CodeGen/ARM/lsr-on-unrolled-loops.ll		patch \| blob \| history
test/CodeGen/X86/change-compare-stride-0.ll		patch \| blob \| history
test/CodeGen/X86/change-compare-stride-1.ll		patch \| blob \| history
test/CodeGen/X86/iv-users-in-other-loops.ll		patch \| blob \| history
test/CodeGen/X86/loop-strength-reduce3.ll		patch \| blob \| history
test/CodeGen/X86/pr3495.ll		patch \| blob \| history