SLPVectorizer: limit the scheduling region size per basic block.

author Erik Eckstein <eeckstein@apple.com>

Wed, 30 Sep 2015 17:00:44 +0000 (17:00 +0000)

committer Erik Eckstein <eeckstein@apple.com>

Wed, 30 Sep 2015 17:00:44 +0000 (17:00 +0000)
author Erik Eckstein <eeckstein@apple.com>
Wed, 30 Sep 2015 17:00:44 +0000 (17:00 +0000)
committer Erik Eckstein <eeckstein@apple.com>
Wed, 30 Sep 2015 17:00:44 +0000 (17:00 +0000)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp

index 477273135f1a7815c9f501603b1975d6537b2fc1..fd8818c1ca92ff59b24f0b15217611982ffb7cfe 100644 (file)
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -73,6 +73,14 @@ static cl::opt<int>
  MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
      cl::desc("Attempt to vectorize for this register size in bits"));
  
+/// Limits the size of scheduling regions in a block.
+/// It avoids long compile times for _very_ large blocks where vector
+/// instructions are spread over a wide range.
+/// This limit is way higher than needed by real-world functions.
+static cl::opt<int>
+ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+    cl::desc("Limit the size of the SLP scheduling region per block"));
+
  namespace {
  
  // FIXME: Set this via cl::opt to allow overriding.
@@ -89,6 +97,10 @@ static const unsigned AliasedCheckLimit = 10;
  // This limit is useful for very large basic blocks.
  static const unsigned MaxMemDepDistance = 160;
  
+/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
+/// regions to be handled.
+static const int MinScheduleRegionSize = 16;
+
  /// \brief Predicate for the element types that the SLP vectorizer supports.
  ///
  /// The most important thing to filter here are types which are invalid in LLVM
@@ -720,6 +732,8 @@ private:
          : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
            ScheduleStart(nullptr), ScheduleEnd(nullptr),
            FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
+          ScheduleRegionSize(0),
+          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
            // Make sure that the initial SchedulingRegionID is greater than the
            // initial SchedulingRegionID in ScheduleData (which is 0).
            SchedulingRegionID(1) {}
@@ -731,6 +745,13 @@ private:
        FirstLoadStoreInRegion = nullptr;
        LastLoadStoreInRegion = nullptr;
  
+      // Reduce the maximum schedule region size by the size of the
+      // previous scheduling run.
+      ScheduleRegionSizeLimit -= ScheduleRegionSize;
+      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
+        ScheduleRegionSizeLimit = MinScheduleRegionSize;
+      ScheduleRegionSize = 0;
+
        // Make a new scheduling region, i.e. all existing ScheduleData is not
        // in the new region yet.
        ++SchedulingRegionID;
@@ -807,7 +828,8 @@ private:
      void cancelScheduling(ArrayRef<Value *> VL);
  
      /// Extends the scheduling region so that V is inside the region.
-    void extendSchedulingRegion(Value *V);
+    /// \returns true if the region size is within the limit.
+    bool extendSchedulingRegion(Value *V);
  
      /// Initialize the ScheduleData structures for new instructions in the
      /// scheduling region.
@@ -861,6 +883,12 @@ private:
      /// (can be null).
      ScheduleData *LastLoadStoreInRegion;
  
+    /// The current size of the scheduling region.
+    int ScheduleRegionSize;
+    
+    /// The maximum size allowed for the scheduling region.
+    int ScheduleRegionSizeLimit;
+
      /// The ID of the scheduling region. For a new vectorization iteration this
      /// is incremented which "removes" all ScheduleData from the region.
      int SchedulingRegionID;
@@ -1080,7 +1108,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  
    if (!BS.tryScheduleBundle(VL, this)) {
      DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
-    BS.cancelScheduling(VL);
+    assert((!BS.getScheduleData(VL[0]) ||
+            !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
+           "tryScheduleBundle should cancelScheduling on failure");
      newTreeEntry(VL, false);
      return;
    }
@@ -2686,8 +2716,15 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
    ScheduleData *Bundle = nullptr;
    bool ReSchedule = false;
    DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
+
+  // Make sure that the scheduling region contains all
+  // instructions of the bundle.
+  for (Value *V : VL) {
+    if (!extendSchedulingRegion(V))
+      return false;
+  }
+
    for (Value *V : VL) {
-    extendSchedulingRegion(V);
      ScheduleData *BundleMember = getScheduleData(V);
      assert(BundleMember &&
             "no ScheduleData for bundle member (maybe not in same basic block)");
@@ -2748,7 +2785,11 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
        schedule(pickedSD, ReadyInsts);
      }
    }
-  return Bundle->isReady();
+  if (!Bundle->isReady()) {
+    cancelScheduling(VL);
+    return false;
+  }
+  return true;
  }
  
  void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
@@ -2777,9 +2818,9 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
    }
  }
  
-void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
    if (getScheduleData(V))
-    return;
+    return true;
    Instruction *I = dyn_cast<Instruction>(V);
    assert(I && "bundle member must be an instruction");
    assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
@@ -2790,7 +2831,7 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
      ScheduleEnd = I->getNextNode();
      assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
      DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
-    return;
+    return true;
    }
    // Search up and down at the same time, because we don't know if the new
    // instruction is above or below the existing scheduling region.
@@ -2799,12 +2840,17 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
    BasicBlock::iterator DownIter(ScheduleEnd);
    BasicBlock::iterator LowerEnd = BB->end();
    for (;;) {
+    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
+      DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
+      return false;
+    }
+
      if (UpIter != UpperEnd) {
        if (&*UpIter == I) {
          initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
          ScheduleStart = I;
          DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
-        return;
+        return true;
        }
        UpIter++;
      }
@@ -2815,13 +2861,14 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
          ScheduleEnd = I->getNextNode();
          assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
          DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
-        return;
+        return true;
        }
        DownIter++;
      }
      assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
             "instruction not found in block");
    }
+  return true;
  }
  
  void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
diff --git a/test/Transforms/SLPVectorizer/X86/schedule_budget.ll b/test/Transforms/SLPVectorizer/X86/schedule_budget.ll

new file mode 100644 (file)

index 0000000..348c954
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S  -slp-schedule-budget=16 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Test if the budget for the scheduling region size works.
+; We test with a reduced budget of 16 which should prevent vectorizing the loads.
+
+declare void @unknown()
+
+; CHECK-LABEL: @test
+; CHECK: load float
+; CHECK: load float
+; CHECK: load float
+; CHECK: load float
+; CHECK: call void @unknown
+define void @test(float * %a, float * %b) {
+entry:
+  %l0 = load float, float* %a
+  %a1 = getelementptr inbounds float, float* %a, i64 1
+  %l1 = load float, float* %a1
+  %a2 = getelementptr inbounds float, float* %a, i64 2
+  %l2 = load float, float* %a2
+  %a3 = getelementptr inbounds float, float* %a, i64 3
+  %l3 = load float, float* %a3
+
+  ; some unrelated instructions in between to enlarge the scheduling region
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+  call void @unknown()
+
+  store float %l0, float* %b
+  %b1 = getelementptr inbounds float, float* %b, i64 1
+  store float %l1, float* %b1
+  %b2 = getelementptr inbounds float, float* %b, i64 2
+  store float %l2, float* %b2
+  %b3 = getelementptr inbounds float, float* %b, i64 3
+  store float %l3, float* %b3
+  ret void
+}
+
author	Erik Eckstein <eeckstein@apple.com>
	Wed, 30 Sep 2015 17:00:44 +0000 (17:00 +0000)
committer	Erik Eckstein <eeckstein@apple.com>
	Wed, 30 Sep 2015 17:00:44 +0000 (17:00 +0000)
lib/Transforms/Vectorize/SLPVectorizer.cpp		patch \| blob \| history
test/Transforms/SLPVectorizer/X86/schedule_budget.ll	[new file with mode: 0644]	patch \| blob