Revert the revision 251592 as it fails a test on some platforms.

author Cong Hou <congh@google.com>

Thu, 29 Oct 2015 05:35:22 +0000 (05:35 +0000)

committer Cong Hou <congh@google.com>

Thu, 29 Oct 2015 05:35:22 +0000 (05:35 +0000)
author Cong Hou <congh@google.com>
Thu, 29 Oct 2015 05:35:22 +0000 (05:35 +0000)
committer Cong Hou <congh@google.com>
Thu, 29 Oct 2015 05:35:22 +0000 (05:35 +0000)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 7a28473c25c8c2182cebe9e5588f6069075c612d..ae5ec8cb88a84411c7c22f81b98464daf431860c 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -126,11 +126,6 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                                        "trip count that is smaller than this "
                                        "value."));
  
-static cl::opt<bool> MaximizeBandwidth(
-    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
-    cl::desc("Maximize bandwidth when selecting vectorization factor which "
-             "will be determined by the smallest type in loop."));
-
  /// This enables versioning on the strides of symbolically striding memory
  /// accesses in code like the following.
  ///   for (i = 0; i < N; ++i)
@@ -1382,10 +1377,10 @@ public:
    /// possible.
    VectorizationFactor selectVectorizationFactor(bool OptForSize);
  
-  /// \return The size (in bits) of the smallest and widest types in the code
-  /// that needs to be vectorized. We ignore values that remain scalar such as
+  /// \return The size (in bits) of the widest type in the code that
+  /// needs to be vectorized. We ignore values that remain scalar such as
    /// 64 bit loop indices.
-  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
+  unsigned getWidestType();
  
    /// \return The desired interleave count.
    /// If interleave count has been specified by metadata it will be returned.
@@ -1412,10 +1407,8 @@ public:
      unsigned NumInstructions;
    };
  
-  /// \return Returns information about the register usages of the loop for the
-  /// given vectorization factors.
-  SmallVector<RegisterUsage, 8>
-  calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
+  /// \return Information about the register usage of the loop.
+  RegisterUsage calculateRegisterUsage();
  
  private:
    /// Returns the expected execution cost. The unit of the cost does
@@ -4714,8 +4707,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
    DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  
    MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
-  unsigned SmallestType, WidestType;
-  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+  unsigned WidestType = getWidestType();
    unsigned WidestRegister = TTI.getRegisterBitWidth(true);
    unsigned MaxSafeDepDist = -1U;
    if (Legal->getMaxSafeDepDistBytes() != -1U)
@@ -4723,9 +4715,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
    WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
                      WidestRegister : MaxSafeDepDist);
    unsigned MaxVectorSize = WidestRegister / WidestType;
-
-  DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
-               << WidestType << " bits.\n");
+  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
    DEBUG(dbgs() << "LV: The Widest register is: "
            << WidestRegister << " bits.\n");
  
@@ -4738,26 +4728,6 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
           " into one vector!");
  
    unsigned VF = MaxVectorSize;
-  if (MaximizeBandwidth && !OptForSize) {
-    // Collect all viable vectorization factors.
-    SmallVector<unsigned, 8> VFs;
-    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
-    for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
-      VFs.push_back(VS);
-
-    // For each VF calculate its register usage.
-    auto RUs = calculateRegisterUsage(VFs);
-
-    // Select the largest VF which doesn't require more registers than existing
-    // ones.
-    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
-    for (int i = RUs.size() - 1; i >= 0; --i) {
-      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
-        VF = VFs[i];
-        break;
-      }
-    }
-  }
  
    // If we optimize the program for size, avoid creating the tail loop.
    if (OptForSize) {
@@ -4833,9 +4803,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
    return Factor;
  }
  
-std::pair<unsigned, unsigned>
-LoopVectorizationCostModel::getSmallestAndWidestTypes() {
-  unsigned MinWidth = -1U;
+unsigned LoopVectorizationCostModel::getWidestType() {
    unsigned MaxWidth = 8;
    const DataLayout &DL = TheFunction->getParent()->getDataLayout();
  
@@ -4875,14 +4843,12 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
        if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))
          continue;
  
-      MinWidth = std::min(MinWidth,
-                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
        MaxWidth = std::max(MaxWidth,
                            (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      }
    }
  
-  return {MinWidth, MaxWidth};
+  return MaxWidth;
  }
  
  unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
@@ -4928,7 +4894,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
  
-  RegisterUsage R = calculateRegisterUsage({VF})[0];
+  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
    // We divide by these constants so assume that we have at least one
    // instruction that uses at least one register.
    R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
@@ -5038,9 +5004,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
    return 1;
  }
  
-SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
-LoopVectorizationCostModel::calculateRegisterUsage(
-    const SmallVector<unsigned, 8> &VFs) {
+LoopVectorizationCostModel::RegisterUsage
+LoopVectorizationCostModel::calculateRegisterUsage() {
    // This function calculates the register usage by measuring the highest number
    // of values that are alive at a single location. Obviously, this is a very
    // rough estimation. We scan the loop in a topological order in order and
@@ -5061,8 +5026,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(
    LoopBlocksDFS DFS(TheLoop);
    DFS.perform(LI);
  
-  RegisterUsage RU;
-  RU.NumInstructions = 0;
+  RegisterUsage R;
+  R.NumInstructions = 0;
  
    // Each 'key' in the map opens a new interval. The values
    // of the map are the index of the 'last seen' usage of the
@@ -5081,7 +5046,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(
    unsigned Index = 0;
    for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
         be = DFS.endRPO(); bb != be; ++bb) {
-    RU.NumInstructions += (*bb)->size();
+    R.NumInstructions += (*bb)->size();
      for (Instruction &I : **bb) {
        IdxToInstr[Index++] = &I;
  
@@ -5116,20 +5081,10 @@ LoopVectorizationCostModel::calculateRegisterUsage(
      TransposeEnds[it->second].push_back(it->first);
  
    SmallSet<Instruction*, 8> OpenIntervals;
+  unsigned MaxUsage = 0;
  
-  // Get the size of the widest register.
-  unsigned MaxSafeDepDist = -1U;
-  if (Legal->getMaxSafeDepDistBytes() != -1U)
-    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
-  unsigned WidestRegister =
-      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
-  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
-
-  SmallVector<RegisterUsage, 8> RUs(VFs.size());
-  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
  
    DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
-
    for (unsigned int i = 0; i < Index; ++i) {
      Instruction *I = IdxToInstr[i];
      // Ignore instructions that are never used within the loop.
@@ -5141,47 +5096,27 @@ LoopVectorizationCostModel::calculateRegisterUsage(
  
      // Remove all of the instructions that end at this location.
      InstrList &List = TransposeEnds[i];
-    for (unsigned int j = 0, e = List.size(); j < e; ++j)
+    for (unsigned int j=0, e = List.size(); j < e; ++j)
        OpenIntervals.erase(List[j]);
  
-    // For each VF find the maximum usage of registers.
-    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
-      // Count the number of live interals.
-      unsigned RegUsage = 0;
-      for (auto Inst : OpenIntervals) {
-        unsigned TypeSize =
-            DL.getTypeSizeInBits(Inst->getType()->getScalarType());
-        RegUsage += std::max<unsigned>(1, VFs[j] * TypeSize / WidestRegister);
-      }
-      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
-    }
+    // Count the number of live intervals.
+    MaxUsage = std::max(MaxUsage, OpenIntervals.size());
  
-    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
-                 << OpenIntervals.size() << '\n');
+    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
+          OpenIntervals.size() << '\n');
  
      // Add the current instruction to the list of open intervals.
      OpenIntervals.insert(I);
    }
  
-  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
-    unsigned Invariant = 0;
-    for (auto Inst : LoopInvariants) {
-      unsigned TypeSize =
-          DL.getTypeSizeInBits(Inst->getType()->getScalarType());
-      Invariant += std::max<unsigned>(1, VFs[i] * TypeSize / WidestRegister);
-    }
-
-    DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] <<  '\n');
-    DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
-    DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
-    DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
-
-    RU.LoopInvariantRegs = Invariant;
-    RU.MaxLocalUsers = MaxUsages[i];
-    RUs[i] = RU;
-  }
+  unsigned Invariant = LoopInvariants.size();
+  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
+  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
  
-  return RUs;
+  R.LoopInvariantRegs = Invariant;
+  R.MaxLocalUsers = MaxUsage;
+  return R;
  }
  
  unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
diff --git a/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll b/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll

deleted file mode 100644 (file)

index e6dc39c..0000000
--- a/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
-; REQUIRES: asserts
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@a = global [1000 x i8] zeroinitializer, align 16
-@b = global [1000 x i8] zeroinitializer, align 16
-@c = global [1000 x i8] zeroinitializer, align 16
-@u = global [1000 x i32] zeroinitializer, align 16
-@v = global [1000 x i32] zeroinitializer, align 16
-@w = global [1000 x i32] zeroinitializer, align 16
-
-; Tests that the vectorization factor is determined by the smallest instead of
-; widest type in the loop for maximum bandwidth when
-; -vectorizer-maximize-bandwidth is indicated.
-;
-; CHECK-label: foo
-; CHECK: LV: Selecting VF: 16.
-define void @foo() {
-entry:
-  br label %for.body
-
-for.cond.cleanup:
-  ret void
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv
-  %0 = load i8, i8* %arrayidx, align 1
-  %arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv
-  %1 = load i8, i8* %arrayidx2, align 1
-  %add = add i8 %1, %0
-  %arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv
-  store i8 %add, i8* %arrayidx6, align 1
-  %arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv
-  %2 = load i32, i32* %arrayidx8, align 4
-  %arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv
-  %3 = load i32, i32* %arrayidx10, align 4
-  %add11 = add nsw i32 %3, %2
-  %arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv
-  store i32 %add11, i32* %arrayidx13, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 1000
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
-}
diff --git a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll

index cca829b9457e0a80828e60c4b1d615187550629e..6cd3c9c3bc01193af4b2537aa18b940ac89ace38 100644 (file)
--- a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
+++ b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
@@ -17,7 +17,7 @@ target triple = "x86_64-apple-macosx10.8.0"
  ; widest vector count.
  ;
  ; CHECK: test_consecutive_store
-; CHECK: The Smallest and Widest types: 64 / 64 bits.
+; CHECK: The Widest type: 64 bits
  define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 {
    %4 = load %0*, %0** %2, align 8
    %5 = icmp eq %0** %0, %1
@@ -51,7 +51,7 @@ define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwt
  ;       p[i][y] = (int*) (1 + q[i]);
  ;     }
  ; CHECK: test_nonconsecutive_store
-; CHECK: The Smallest and Widest types: 16 / 16 bits.
+; CHECK: The Widest type: 16 bits
  define void @test_nonconsecutive_store() nounwind ssp uwtable {
    br label %1
  
@@ -93,7 +93,7 @@ define void @test_nonconsecutive_store() nounwind ssp uwtable {
  ;; Now we check the same rules for loads. We should take consecutive loads of
  ;; pointer types into account.
  ; CHECK: test_consecutive_ptr_load
-; CHECK: The Smallest and Widest types: 8 / 64 bits.
+; CHECK: The Widest type: 64 bits
  define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable {
    br label %1
  
@@ -117,7 +117,7 @@ define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable {
  
  ;; However, we should not take unconsecutive loads of pointers into account.
  ; CHECK: test_nonconsecutive_ptr_load
-; CHECK: LV: The Smallest and Widest types: 16 / 16 bits.
+; CHECK: The Widest type: 16 bits
  define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable {
    br label %1
author	Cong Hou <congh@google.com>
	Thu, 29 Oct 2015 05:35:22 +0000 (05:35 +0000)
committer	Cong Hou <congh@google.com>
	Thu, 29 Oct 2015 05:35:22 +0000 (05:35 +0000)
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll	[deleted file]	patch \| blob \| history
test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll		patch \| blob \| history