fix timeWaiting computation in TimeUtil

author Adam Simpkins <simpkins@fb.com>

Tue, 6 Jun 2017 21:09:39 +0000 (14:09 -0700)

committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>

Tue, 6 Jun 2017 21:21:27 +0000 (14:21 -0700)
author Adam Simpkins <simpkins@fb.com>
Tue, 6 Jun 2017 21:09:39 +0000 (14:09 -0700)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Tue, 6 Jun 2017 21:21:27 +0000 (14:21 -0700)
diff --git a/folly/io/async/test/TimeUtil.cpp b/folly/io/async/test/TimeUtil.cpp

index 74ac884c1dfc2b10ced58ce8379df5b28d705fb2..11187fbe3ad7e6051b4354eceef15e54b1db2b43 100644 (file)
--- a/folly/io/async/test/TimeUtil.cpp
+++ b/folly/io/async/test/TimeUtil.cpp
@@ -32,6 +32,7 @@
  #include <stdexcept>
  
  #include <folly/Conv.h>
+#include <folly/ScopeGuard.h>
  #include <folly/portability/SysSyscall.h>
  #include <folly/portability/Unistd.h>
  #include <folly/portability/Windows.h>
@@ -54,18 +55,35 @@ static pid_t gettid() {
  static pid_t gettid() {
    return syscall(FOLLY_SYS_gettid);
  }
+#endif
+
+static int getLinuxVersion(StringPiece release) {
+  auto dot1 = release.find('.');
+  if (dot1 == StringPiece::npos) {
+    throw std::invalid_argument("could not find first dot");
+  }
+  auto v1 = folly::to<int>(release.subpiece(0, dot1));
  
+  auto dot2 = release.find('.', dot1 + 1);
+  if (dot2 == StringPiece::npos) {
+    throw std::invalid_argument("could not find second dot");
+  }
+  auto v2 = folly::to<int>(release.subpiece(dot1 + 1, dot2 - (dot1 + 1)));
+
+  int dash = release.find('-', dot2 + 1);
+  auto v3 = folly::to<int>(release.subpiece(dot2 + 1, dash - (dot2 + 1)));
+
+  return ((v1 * 1000 + v2) * 1000) + v3;
+}
+
+#ifdef __linux__
  /**
- * The /proc/<pid>/schedstat file reports time values in jiffies.
+ * Determine the time units used in /proc/<pid>/schedstat
   *
- * Determine how many jiffies are in a second.
- * Returns -1 if the number of jiffies/second cannot be determined.
+ * Returns the number of nanoseconds per time unit,
+ * or -1 if we cannot determine the units.
   */
-static int64_t determineJiffiesHZ() {
-  // It seems like the only real way to figure out the CONFIG_HZ value used by
-  // this kernel is to look it up in the config file.
-  //
-  // Look in /boot/config-<kernel_release>
+static int64_t determineSchedstatUnits() {
    struct utsname unameInfo;
    if (uname(&unameInfo) != 0) {
      LOG(ERROR) << "unable to determine jiffies/second: uname failed: %s"
@@ -73,6 +91,35 @@ static int64_t determineJiffiesHZ() {
      return -1;
    }
  
+  // In Linux version 2.6.23 and later, time time values are always
+  // reported in nanoseconds.
+  //
+  // This change appears to have been made in commit 425e0968a25f, which
+  // moved some of the sched stats code to a new file.  Despite the commit
+  // message claiming "no code changes are caused by this patch", it changed
+  // the task.sched_info.cpu_time and task.sched_info.run_delay counters to be
+  // computed using sched_clock() rather than jiffies.
+  int linuxVersion;
+  try {
+    linuxVersion = getLinuxVersion(unameInfo.release);
+  } catch (const std::exception&) {
+    LOG(ERROR) << "unable to determine jiffies/second: failed to parse "
+               << "kernel release string \"" << unameInfo.release << "\"";
+    return -1;
+  }
+  if (linuxVersion >= 2006023) {
+    // The units are nanoseconds
+    return 1;
+  }
+
+  // In Linux versions prior to 2.6.23, the time values are reported in
+  // jiffies.  This is somewhat unfortunate, as the number of jiffies per
+  // second is configurable.  We have to determine the units being used.
+  //
+  // It seems like the only real way to figure out the CONFIG_HZ value used by
+  // this kernel is to look it up in the config file.
+  //
+  // Look in /boot/config-<kernel_release>
    char configPath[256];
    snprintf(configPath, sizeof(configPath), "/boot/config-%s",
             unameInfo.release);
@@ -83,18 +130,16 @@ static int64_t determineJiffiesHZ() {
        "cannot open kernel config file %s" << configPath;
      return -1;
    }
+  SCOPE_EXIT {
+    fclose(f);
+  };
  
    int64_t hz = -1;
    char buf[1024];
    while (fgets(buf, sizeof(buf), f) != nullptr) {
      if (strcmp(buf, "CONFIG_NO_HZ=y\n") == 0) {
-      // schedstat info seems to be reported in nanoseconds on tickless
-      // kernels.
-      //
-      // The CONFIG_HZ value doesn't matter for our purposes,
-      // so return as soon as we see CONFIG_NO_HZ.
-      fclose(f);
-      return 1000000000;
+      LOG(ERROR) << "unable to determine jiffies/second: tickless kernel";
+      return -1;
      } else if (strcmp(buf, "CONFIG_HZ=1000\n") == 0) {
        hz = 1000;
      } else if (strcmp(buf, "CONFIG_HZ=300\n") == 0) {
@@ -105,7 +150,6 @@ static int64_t determineJiffiesHZ() {
        hz = 100;
      }
    }
-  fclose(f);
  
    if (hz == -1) {
      LOG(ERROR) << "unable to determine jiffies/second: no CONFIG_HZ setting "
@@ -125,15 +169,12 @@ static int64_t determineJiffiesHZ() {
   * time cannot be determined.
   */
  static milliseconds getTimeWaitingMS(pid_t tid) {
-#ifdef _MSC_VER
+#ifndef __linux__
+  (void)tid;
    return milliseconds(0);
  #else
-  static int64_t jiffiesHZ = 0;
-  if (jiffiesHZ == 0) {
-    jiffiesHZ = determineJiffiesHZ();
-  }
-
-  if (jiffiesHZ < 0) {
+  static int64_t timeUnits = determineSchedstatUnits();
+  if (timeUnits < 0) {
      // We couldn't figure out how many jiffies there are in a second.
      // Don't bother reading the schedstat info if we can't interpret it.
      return milliseconds(0);
@@ -174,7 +215,7 @@ static milliseconds getTimeWaitingMS(pid_t tid) {
      }
  
      close(fd);
-    return milliseconds((waitingJiffies * 1000) / jiffiesHZ);
+    return duration_cast<milliseconds>(nanoseconds(waitingJiffies * timeUnits));
    } catch (const std::runtime_error& e) {
      if (fd >= 0) {
        close(fd);
@@ -209,12 +250,12 @@ bool
  checkTimeout(const TimePoint& start, const TimePoint& end,
               milliseconds expectedMS, bool allowSmaller,
               milliseconds tolerance) {
-  auto elapsedMS = end.getTimeStart() - start.getTimeEnd();
+  auto elapsedTime = end.getTimeStart() - start.getTimeEnd();
  
    if (!allowSmaller) {
      // Timeouts should never fire before the time was up.
      // Allow 1ms of wiggle room for rounding errors.
-    if (elapsedMS < expectedMS - milliseconds(1)) {
+    if (elapsedTime < (expectedMS - milliseconds(1))) {
        return false;
      }
    }
@@ -233,12 +274,12 @@ checkTimeout(const TimePoint& start, const TimePoint& end,
      excludedMS = end.getTimeWaiting() - start.getTimeWaiting();
      assert(end.getTimeWaiting() >= start.getTimeWaiting());
      // Add a tolerance here due to precision issues on linux, see below note.
-    assert( (elapsedMS + tolerance) >= excludedMS);
+    assert((elapsedTime + tolerance) >= excludedMS);
    }
  
    milliseconds effectiveElapsedMS = milliseconds(0);
-  if (elapsedMS > excludedMS) {
-    effectiveElapsedMS =  duration_cast<milliseconds>(elapsedMS) - excludedMS;
+  if (elapsedTime > excludedMS) {
+    effectiveElapsedMS = duration_cast<milliseconds>(elapsedTime) - excludedMS;
    }
  
    // On x86 Linux, sleep calls generally have precision only to the nearest
diff --git a/folly/io/async/test/TimeUtilTest.cpp b/folly/io/async/test/TimeUtilTest.cpp

new file mode 100644 (file)

index 0000000..2945f1f
--- /dev/null
+++ b/folly/io/async/test/TimeUtilTest.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2004-present Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <thread>
+
+#include <glog/logging.h>
+
+#include <folly/io/async/test/TimeUtil.h>
+#include <folly/portability/GTest.h>
+#include <folly/portability/Unistd.h>
+
+using folly::TimePoint;
+using namespace std::literals::chrono_literals;
+using std::chrono::duration_cast;
+using std::chrono::milliseconds;
+using std::chrono::nanoseconds;
+using std::chrono::steady_clock;
+
+// Define a PrintTo() function for std::chrono::nanoseconds so that these
+// will be printed nicely on EXPECT*() failures.
+// Define this in std::chrono so that argument-dependent lookup works.
+namespace std {
+namespace chrono {
+static inline void PrintTo(nanoseconds ns, ::std::ostream* os) {
+  *os << ns.count() << "ns";
+}
+}
+}
+
+#ifdef __linux__
+void runThread(nanoseconds duration, nanoseconds* timeWaiting) {
+  TimePoint start;
+
+  // Loop consuming CPU until the duration has expired.
+  while (true) {
+    TimePoint now;
+    if (now.getTimeStart() - start.getTimeStart() > duration) {
+      // Time to quit
+      // Report how long we spent waiting to be scheduled on the CPU.
+      *timeWaiting = (now.getTimeWaiting() - start.getTimeWaiting());
+      VLOG(1) << "thread " << start.getTid() << ": elapsed "
+              << duration_cast<milliseconds>(
+                     now.getTimeStart() - start.getTimeStart())
+                     .count()
+              << "ms, time waiting: "
+              << duration_cast<milliseconds>(*timeWaiting).count() << "ms";
+      break;
+    }
+  }
+}
+
+// Test to make sure that TimePoint computes sane values for time
+// spent waiting on CPU.
+TEST(TimeUtil, getTimeWaiting) {
+  TimePoint tp;
+
+  // Run twice as many threads as CPU cores, to ensure that some of
+  // them should be waiting sometime.
+  auto numThreads = sysconf(_SC_NPROCESSORS_CONF) * 2;
+
+  std::vector<std::thread> threads;
+  std::vector<nanoseconds> timeWaiting;
+  timeWaiting.resize(numThreads, 0ns);
+
+  auto start = steady_clock::now();
+  for (int n = 0; n < numThreads; ++n) {
+    threads.emplace_back(runThread, 1s, &timeWaiting[n]);
+  }
+
+  for (auto& thread : threads) {
+    thread.join();
+  }
+  auto end = steady_clock::now();
+
+  auto timeSpent = end - start;
+  nanoseconds max{0};
+  for (int n = 0; n < numThreads; ++n) {
+    max = std::max(max, timeWaiting[n]);
+    // No thread could possibly have been waiting for longer than
+    // the test actually took to run.
+    EXPECT_LT(timeWaiting[n], timeSpent);
+  }
+  // Make sure that at least one thread spent some time waiting
+  EXPECT_GE(max, 1ns);
+}
+#endif
author	Adam Simpkins <simpkins@fb.com>
	Tue, 6 Jun 2017 21:09:39 +0000 (14:09 -0700)
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
	Tue, 6 Jun 2017 21:21:27 +0000 (14:21 -0700)
folly/io/async/test/TimeUtil.cpp		patch \| blob \| history
folly/io/async/test/TimeUtilTest.cpp	[new file with mode: 0644]	patch \| blob