From 68f55ed0e97dea446cf66d429e8b46331cfb9f12 Mon Sep 17 00:00:00 2001
From: Adam Simpkins <simpkins@fb.com>
Date: Tue, 6 Jun 2017 14:09:39 -0700
Subject: [PATCH] fix timeWaiting computation in TimeUtil

Summary:
Some of the code in TimeUtil.cpp computes how long the current thread has spent
waiting to be schedule on the CPU, so we can discount this time when testing
timeout behavior.

It appears that this code has been broken for a fairly long time.  Older Linux
kernels reported values in /proc/<pid>/schedstat using jiffies.  However, it
appears that this was changed in Linux 2.6.23 to report values using
nanoseconds.  (It looks to me like this was changed in commit 425e0968a25f,
which purports to simply be moving code from sched.c to sched_stats.h,
but it appears to also have also changed this behavior.)

This updates TimeUtil.cpp to compute the value correctly in modern Linux
kernels.

Reviewed By: yfeldblum

Differential Revision: D5173598

fbshipit-source-id: 98436d3a687400bf68661156ffce93eea7232632
---
 folly/io/async/test/TimeUtil.cpp     | 99 ++++++++++++++++++++--------
 folly/io/async/test/TimeUtilTest.cpp | 98 +++++++++++++++++++++++++++
 2 files changed, 168 insertions(+), 29 deletions(-)
 create mode 100644 folly/io/async/test/TimeUtilTest.cpp
diff --git a/folly/io/async/test/TimeUtil.cpp b/folly/io/async/test/TimeUtil.cpp
index 74ac884c..11187fbe 100644
--- a/folly/io/async/test/TimeUtil.cpp
+++ b/folly/io/async/test/TimeUtil.cpp
@@ -32,6 +32,7 @@
 #include <stdexcept>
 
 #include <folly/Conv.h>
+#include <folly/ScopeGuard.h>
 #include <folly/portability/SysSyscall.h>
 #include <folly/portability/Unistd.h>
 #include <folly/portability/Windows.h>
@@ -54,18 +55,35 @@ static pid_t gettid() {
 static pid_t gettid() {
   return syscall(FOLLY_SYS_gettid);
 }
+#endif
+
+static int getLinuxVersion(StringPiece release) {
+  auto dot1 = release.find('.');
+  if (dot1 == StringPiece::npos) {
+    throw std::invalid_argument("could not find first dot");
+  }
+  auto v1 = folly::to<int>(release.subpiece(0, dot1));
 
+  auto dot2 = release.find('.', dot1 + 1);
+  if (dot2 == StringPiece::npos) {
+    throw std::invalid_argument("could not find second dot");
+  }
+  auto v2 = folly::to<int>(release.subpiece(dot1 + 1, dot2 - (dot1 + 1)));
+
+  int dash = release.find('-', dot2 + 1);
+  auto v3 = folly::to<int>(release.subpiece(dot2 + 1, dash - (dot2 + 1)));
+
+  return ((v1 * 1000 + v2) * 1000) + v3;
+}
+
+#ifdef __linux__
 /**
- * The /proc/<pid>/schedstat file reports time values in jiffies.
+ * Determine the time units used in /proc/<pid>/schedstat
  *
- * Determine how many jiffies are in a second.
- * Returns -1 if the number of jiffies/second cannot be determined.
+ * Returns the number of nanoseconds per time unit,
+ * or -1 if we cannot determine the units.
  */
-static int64_t determineJiffiesHZ() {
-  // It seems like the only real way to figure out the CONFIG_HZ value used by
-  // this kernel is to look it up in the config file.
-  //
-  // Look in /boot/config-<kernel_release>
+static int64_t determineSchedstatUnits() {
   struct utsname unameInfo;
   if (uname(&unameInfo) != 0) {
     LOG(ERROR) << "unable to determine jiffies/second: uname failed: %s"
@@ -73,6 +91,35 @@ static int64_t determineJiffiesHZ() {
     return -1;
   }
 
+  // In Linux version 2.6.23 and later, time time values are always
+  // reported in nanoseconds.
+  //
+  // This change appears to have been made in commit 425e0968a25f, which
+  // moved some of the sched stats code to a new file.  Despite the commit
+  // message claiming "no code changes are caused by this patch", it changed
+  // the task.sched_info.cpu_time and task.sched_info.run_delay counters to be
+  // computed using sched_clock() rather than jiffies.
+  int linuxVersion;
+  try {
+    linuxVersion = getLinuxVersion(unameInfo.release);
+  } catch (const std::exception&) {
+    LOG(ERROR) << "unable to determine jiffies/second: failed to parse "
+               << "kernel release string \"" << unameInfo.release << "\"";
+    return -1;
+  }
+  if (linuxVersion >= 2006023) {
+    // The units are nanoseconds
+    return 1;
+  }
+
+  // In Linux versions prior to 2.6.23, the time values are reported in
+  // jiffies.  This is somewhat unfortunate, as the number of jiffies per
+  // second is configurable.  We have to determine the units being used.
+  //
+  // It seems like the only real way to figure out the CONFIG_HZ value used by
+  // this kernel is to look it up in the config file.
+  //
+  // Look in /boot/config-<kernel_release>
   char configPath[256];
   snprintf(configPath, sizeof(configPath), "/boot/config-%s",
            unameInfo.release);
@@ -83,18 +130,16 @@ static int64_t determineJiffiesHZ() {
       "cannot open kernel config file %s" << configPath;
     return -1;
   }
+  SCOPE_EXIT {
+    fclose(f);
+  };
 
   int64_t hz = -1;
   char buf[1024];
   while (fgets(buf, sizeof(buf), f) != nullptr) {
     if (strcmp(buf, "CONFIG_NO_HZ=y\n") == 0) {
-      // schedstat info seems to be reported in nanoseconds on tickless
-      // kernels.
-      //
-      // The CONFIG_HZ value doesn't matter for our purposes,
-      // so return as soon as we see CONFIG_NO_HZ.
-      fclose(f);
-      return 1000000000;
+      LOG(ERROR) << "unable to determine jiffies/second: tickless kernel";
+      return -1;
     } else if (strcmp(buf, "CONFIG_HZ=1000\n") == 0) {
       hz = 1000;
     } else if (strcmp(buf, "CONFIG_HZ=300\n") == 0) {
@@ -105,7 +150,6 @@ static int64_t determineJiffiesHZ() {
       hz = 100;
     }
   }
-  fclose(f);
 
   if (hz == -1) {
     LOG(ERROR) << "unable to determine jiffies/second: no CONFIG_HZ setting "
@@ -125,15 +169,12 @@ static int64_t determineJiffiesHZ() {
  * time cannot be determined.
  */
 static milliseconds getTimeWaitingMS(pid_t tid) {
-#ifdef _MSC_VER
+#ifndef __linux__
+  (void)tid;
   return milliseconds(0);
 #else
-  static int64_t jiffiesHZ = 0;
-  if (jiffiesHZ == 0) {
-    jiffiesHZ = determineJiffiesHZ();
-  }
-
-  if (jiffiesHZ < 0) {
+  static int64_t timeUnits = determineSchedstatUnits();
+  if (timeUnits < 0) {
     // We couldn't figure out how many jiffies there are in a second.
     // Don't bother reading the schedstat info if we can't interpret it.
     return milliseconds(0);
@@ -174,7 +215,7 @@ static milliseconds getTimeWaitingMS(pid_t tid) {
     }
 
     close(fd);
-    return milliseconds((waitingJiffies * 1000) / jiffiesHZ);
+    return duration_cast<milliseconds>(nanoseconds(waitingJiffies * timeUnits));
   } catch (const std::runtime_error& e) {
     if (fd >= 0) {
       close(fd);
@@ -209,12 +250,12 @@ bool
 checkTimeout(const TimePoint& start, const TimePoint& end,
              milliseconds expectedMS, bool allowSmaller,
              milliseconds tolerance) {
-  auto elapsedMS = end.getTimeStart() - start.getTimeEnd();
+  auto elapsedTime = end.getTimeStart() - start.getTimeEnd();
 
   if (!allowSmaller) {
     // Timeouts should never fire before the time was up.
     // Allow 1ms of wiggle room for rounding errors.
-    if (elapsedMS < expectedMS - milliseconds(1)) {
+    if (elapsedTime < (expectedMS - milliseconds(1))) {
       return false;
     }
   }
@@ -233,12 +274,12 @@ checkTimeout(const TimePoint& start, const TimePoint& end,
     excludedMS = end.getTimeWaiting() - start.getTimeWaiting();
     assert(end.getTimeWaiting() >= start.getTimeWaiting());
     // Add a tolerance here due to precision issues on linux, see below note.
-    assert( (elapsedMS + tolerance) >= excludedMS);
+    assert((elapsedTime + tolerance) >= excludedMS);
   }
 
   milliseconds effectiveElapsedMS = milliseconds(0);
-  if (elapsedMS > excludedMS) {
-    effectiveElapsedMS =  duration_cast<milliseconds>(elapsedMS) - excludedMS;
+  if (elapsedTime > excludedMS) {
+    effectiveElapsedMS = duration_cast<milliseconds>(elapsedTime) - excludedMS;
   }
 
   // On x86 Linux, sleep calls generally have precision only to the nearest
diff --git a/folly/io/async/test/TimeUtilTest.cpp b/folly/io/async/test/TimeUtilTest.cpp
new file mode 100644
index 00000000..2945f1fb
--- /dev/null
+++ b/folly/io/async/test/TimeUtilTest.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2004-present Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <thread>
+
+#include <glog/logging.h>
+
+#include <folly/io/async/test/TimeUtil.h>
+#include <folly/portability/GTest.h>
+#include <folly/portability/Unistd.h>
+
+using folly::TimePoint;
+using namespace std::literals::chrono_literals;
+using std::chrono::duration_cast;
+using std::chrono::milliseconds;
+using std::chrono::nanoseconds;
+using std::chrono::steady_clock;
+
+// Define a PrintTo() function for std::chrono::nanoseconds so that these
+// will be printed nicely on EXPECT*() failures.
+// Define this in std::chrono so that argument-dependent lookup works.
+namespace std {
+namespace chrono {
+static inline void PrintTo(nanoseconds ns, ::std::ostream* os) {
+  *os << ns.count() << "ns";
+}
+}
+}
+
+#ifdef __linux__
+void runThread(nanoseconds duration, nanoseconds* timeWaiting) {
+  TimePoint start;
+
+  // Loop consuming CPU until the duration has expired.
+  while (true) {
+    TimePoint now;
+    if (now.getTimeStart() - start.getTimeStart() > duration) {
+      // Time to quit
+      // Report how long we spent waiting to be scheduled on the CPU.
+      *timeWaiting = (now.getTimeWaiting() - start.getTimeWaiting());
+      VLOG(1) << "thread " << start.getTid() << ": elapsed "
+              << duration_cast<milliseconds>(
+                     now.getTimeStart() - start.getTimeStart())
+                     .count()
+              << "ms, time waiting: "
+              << duration_cast<milliseconds>(*timeWaiting).count() << "ms";
+      break;
+    }
+  }
+}
+
+// Test to make sure that TimePoint computes sane values for time
+// spent waiting on CPU.
+TEST(TimeUtil, getTimeWaiting) {
+  TimePoint tp;
+
+  // Run twice as many threads as CPU cores, to ensure that some of
+  // them should be waiting sometime.
+  auto numThreads = sysconf(_SC_NPROCESSORS_CONF) * 2;
+
+  std::vector<std::thread> threads;
+  std::vector<nanoseconds> timeWaiting;
+  timeWaiting.resize(numThreads, 0ns);
+
+  auto start = steady_clock::now();
+  for (int n = 0; n < numThreads; ++n) {
+    threads.emplace_back(runThread, 1s, &timeWaiting[n]);
+  }
+
+  for (auto& thread : threads) {
+    thread.join();
+  }
+  auto end = steady_clock::now();
+
+  auto timeSpent = end - start;
+  nanoseconds max{0};
+  for (int n = 0; n < numThreads; ++n) {
+    max = std::max(max, timeWaiting[n]);
+    // No thread could possibly have been waiting for longer than
+    // the test actually took to run.
+    EXPECT_LT(timeWaiting[n], timeSpent);
+  }
+  // Make sure that at least one thread spent some time waiting
+  EXPECT_GE(max, 1ns);
+}
+#endif
-- 
2.34.1