rcu: Rework detection of use of RCU by offline CPUs

[firefly-linux-kernel-4.4.55.git] / kernel / rcutree.c
diff --git a/kernel/rcutree.c b/kernel/rcutree.c

index 61adb351d2c735306d547c1b34bc2a6c6174b54f..708469a06860b7935ea750f7fb7fb5fe44c410e7 100644 (file)
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -50,6 +50,8 @@
  #include <linux/wait.h>
  #include <linux/kthread.h>
  #include <linux/prefetch.h>
+#include <linux/delay.h>
+#include <linux/stop_machine.h>
  
  #include "rcutree.h"
  #include <trace/events/rcu.h>
@@ -208,8 +210,11 @@ module_param(blimit, int, 0);
  module_param(qhimark, int, 0);
  module_param(qlowmark, int, 0);
  
-int rcu_cpu_stall_suppress __read_mostly;
+int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+
  module_param(rcu_cpu_stall_suppress, int, 0644);
+module_param(rcu_cpu_stall_timeout, int, 0644);
  
  static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
  static int rcu_pending(int cpu);
@@ -315,25 +320,18 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
  static int rcu_implicit_offline_qs(struct rcu_data *rdp)
  {
         /*
-        * If the CPU is offline, it is in a quiescent state.  We can
-        * trust its state not to change because interrupts are disabled.
+        * If the CPU is offline and the current grace period started more
+        * than two jiffies ago, treat it as being in a quiescent state.
+        * We can trust its state not to change because interrupts are
+        * disabled.  The slack is to handle CPUs initializing on the way
+        * up and finding their way to the idle loop on the way down.
          */
-       if (cpu_is_offline(rdp->cpu)) {
+       if (cpu_is_offline(rdp->cpu) &&
+           ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
                 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
                 rdp->offline_fqs++;
                 return 1;
         }
-
-       /*
-        * The CPU is online, so send it a reschedule IPI.  This forces
-        * it through the scheduler, and (inefficiently) also handles cases
-        * where idle loops fail to inform RCU about the CPU being idle.
-        */
-       if (rdp->cpu != smp_processor_id())
-               smp_send_reschedule(rdp->cpu);
-       else
-               set_need_resched();
-       rdp->resched_ipi++;
         return 0;
  }
  
@@ -588,6 +586,49 @@ int rcu_is_cpu_idle(void)
  }
  EXPORT_SYMBOL(rcu_is_cpu_idle);
  
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Is the current CPU online?  Disable preemption to avoid false positives
+ * that could otherwise happen due to the current CPU number being sampled,
+ * this task being preempted, its old CPU being taken offline, resuming
+ * on some other CPU, then determining that its old CPU is now offline.
+ * It is OK to use RCU on an offline processor during initial boot, hence
+ * the check for rcu_scheduler_fully_active.  Note also that it is OK
+ * for a CPU coming online to use RCU for one jiffy prior to marking itself
+ * online in the cpu_online_mask.  Similarly, it is OK for a CPU going
+ * offline to continue to use RCU for one jiffy after marking itself
+ * offline in the cpu_online_mask.  This leniency is necessary given the
+ * non-atomic nature of the online and offline processing, for example,
+ * the fact that a CPU enters the scheduler after completing the CPU_DYING
+ * notifiers.
+ *
+ * This is also why RCU internally marks CPUs online during the
+ * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
+ *
+ * Disable checking if in an NMI handler because we cannot safely report
+ * errors from NMI handlers anyway.
+ */
+bool rcu_lockdep_current_cpu_online(void)
+{
+       struct rcu_data *rdp;
+       struct rcu_node *rnp;
+       bool ret;
+
+       if (in_nmi())
+               return true;    /* No safe way to report from NMI. */
+       preempt_disable();      /* Keep the sampled CPU number stable. */
+       rdp = &__get_cpu_var(rcu_sched_data);
+       rnp = rdp->mynode;
+       ret = (rdp->grpmask & rnp->qsmaskinit) ||
+             !rcu_scheduler_fully_active;
+       preempt_enable();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
  #endif /* #ifdef CONFIG_PROVE_RCU */
  
  /**
@@ -645,10 +686,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
         return rcu_implicit_offline_qs(rdp);
  }
  
+static int jiffies_till_stall_check(void)
+{
+       int timeout_secs = ACCESS_ONCE(rcu_cpu_stall_timeout);
+
+       /*
+        * Clamp to [3, 300], writing any correction back.  These bounds
+        * must stay in step with those of CONFIG_RCU_CPU_STALL_TIMEOUT.
+        */
+       if (timeout_secs > 300) {
+               ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+               timeout_secs = 300;
+       } else if (timeout_secs < 3) {
+               ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+               timeout_secs = 3;
+       }
+       return RCU_STALL_DELAY_DELTA + timeout_secs * HZ;
+}
+
  static void record_gp_stall_check_time(struct rcu_state *rsp)
  {
         rsp->gp_start = jiffies;
-       rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+       rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
  }
  
  static void print_other_cpu_stall(struct rcu_state *rsp)
@@ -667,13 +726,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
                 return;
         }
-       rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
-
-       /*
-        * Now rat on any tasks that got kicked up to the root rcu_node
-        * due to CPU offlining.
-        */
-       ndetected = rcu_print_task_stall(rnp);
+       rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  
         /*
@@ -681,8 +734,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
          * See Documentation/RCU/stallwarn.txt for info on how to debug
          * RCU CPU stall warnings.
          */
-       printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
+       printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",
                rsp->name);
+       print_cpu_stall_info_begin();
         rcu_for_each_leaf_node(rsp, rnp) {
                 raw_spin_lock_irqsave(&rnp->lock, flags);
                 ndetected += rcu_print_task_stall(rnp);
@@ -691,11 +745,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
                         continue;
                 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
                         if (rnp->qsmask & (1UL << cpu)) {
-                               printk(" %d", rnp->grplo + cpu);
+                               print_cpu_stall_info(rsp, rnp->grplo + cpu);
                                 ndetected++;
                         }
         }
-       printk("} (detected by %d, t=%ld jiffies)\n",
+
+       /*
+        * Now rat on any tasks that got kicked up to the root rcu_node
+        * due to CPU offlining, adding them to the leaf-level tally.
+        */
+       rnp = rcu_get_root(rsp);
+       raw_spin_lock_irqsave(&rnp->lock, flags);
+       ndetected += rcu_print_task_stall(rnp);
+       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+       print_cpu_stall_info_end();
+       printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n",
                smp_processor_id(), (long)(jiffies - rsp->gp_start));
         if (ndetected == 0)
                 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
@@ -719,15 +784,18 @@ static void print_cpu_stall(struct rcu_state *rsp)
          * See Documentation/RCU/stallwarn.txt for info on how to debug
          * RCU CPU stall warnings.
          */
-       printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
-              rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
+       printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name);
+       print_cpu_stall_info_begin();
+       print_cpu_stall_info(rsp, smp_processor_id());
+       print_cpu_stall_info_end();
+       printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start);
         if (!trigger_all_cpu_backtrace())
                 dump_stack();
  
         raw_spin_lock_irqsave(&rnp->lock, flags);
         if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
-               rsp->jiffies_stall =
-                       jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+               rsp->jiffies_stall = jiffies +
+                                    3 * jiffies_till_stall_check() + 3;
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  
         set_need_resched();  /* kick ourselves to get things going. */
@@ -810,6 +878,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
                         rdp->passed_quiesce = 0;
                 } else
                         rdp->qs_pending = 0;
+               zero_cpu_stall_ticks(rdp);
         }
  }
  
@@ -1246,14 +1315,12 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
   */
  static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
  {
-       unsigned long flags;
         int i;
         unsigned long mask;
-       int need_report;
         int receive_cpu = cpumask_any(cpu_online_mask);
         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
         struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
-       struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */
+       RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
  
         /* First, adjust the counts. */
         if (rdp->nxtlist != NULL) {
@@ -1319,32 +1386,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
                                "cpuofl");
         rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
         /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
-
-       /*
-        * Remove the dying CPU from the bitmasks in the rcu_node
-        * hierarchy.  Because we are in stop_machine() context, we
-        * automatically exclude ->onofflock critical sections.
-        */
-       do {
-               raw_spin_lock_irqsave(&rnp->lock, flags);
-               rnp->qsmaskinit &= ~mask;
-               if (rnp->qsmaskinit != 0) {
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                       break;
-               }
-               if (rnp == rdp->mynode) {
-                       need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
-                       if (need_report & RCU_OFL_TASKS_NORM_GP)
-                               rcu_report_unblock_qs_rnp(rnp, flags);
-                       else
-                               raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                       if (need_report & RCU_OFL_TASKS_EXP_GP)
-                               rcu_report_exp_rnp(rsp, rnp, true);
-               } else
-                       raw_spin_unlock_irqrestore(&rnp->lock, flags);
-               mask = rnp->grpmask;
-               rnp = rnp->parent;
-       } while (rnp != NULL);
  }
  
  /*
@@ -1355,11 +1396,53 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
   */
  static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
  {
+       unsigned long flags;
+       unsigned long mask;
+       int need_report = 0;
         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-       struct rcu_node *rnp = rdp->mynode;
+       struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rnp. */
  
+       /* Adjust any no-longer-needed kthreads. */
         rcu_stop_cpu_kthread(cpu);
         rcu_node_kthread_setaffinity(rnp, -1);
+
+       /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
+
+       /* Exclude any attempts to start a new grace period. */
+       raw_spin_lock_irqsave(&rsp->onofflock, flags);
+
+       /* Clear ->qsmaskinit bits rootward until some mask stays nonzero. */
+       mask = rdp->grpmask;    /* rnp->grplo is constant. */
+       do {
+               raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
+               rnp->qsmaskinit &= ~mask;
+               if (rnp->qsmaskinit != 0) {
+                       if (rnp != rdp->mynode)
+                               raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+                       break;
+               }
+               if (rnp == rdp->mynode)
+                       need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
+               else
+                       raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+               mask = rnp->grpmask;
+               rnp = rnp->parent;
+       } while (rnp != NULL);
+
+       /*
+        * We still hold the leaf rcu_node structure lock here, and
+        * irqs are still disabled.  The reason for this subterfuge is
+        * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
+        * held leads to deadlock.
+        */
+       raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
+       rnp = rdp->mynode;
+       if (need_report & RCU_OFL_TASKS_NORM_GP)
+               rcu_report_unblock_qs_rnp(rnp, flags);
+       else
+               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+       if (need_report & RCU_OFL_TASKS_EXP_GP)
+               rcu_report_exp_rnp(rsp, rnp, true);
  }
  
  #else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1475,6 +1558,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
  void rcu_check_callbacks(int cpu, int user)
  {
         trace_rcu_utilization("Start scheduler-tick");
+       increment_cpu_stall_ticks();
         if (user || rcu_is_cpu_rrupt_from_idle()) {
  
                 /*
@@ -1857,6 +1941,121 @@ void synchronize_rcu_bh(void)
  }
  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
  
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+       /*
+        * Callback handed to try_stop_cpus(); @data is unused and the
+        * return value is always zero.
+        *
+        * There must be a full memory barrier on each affected CPU
+        * between the time that try_stop_cpus() is called and the
+        * time that it returns.  The present cpu_stop implementation
+        * reportedly provides this before control reaches this point,
+        * so the smp_mb() below is not strictly necessary.  Keep it for
+        * documentation and robustness against future changes.
+        */
+       smp_mb(); /* See above comment block. */
+       return 0;
+}
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly.  This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier.  Failing to
+ * observe this restriction will result in deadlock.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word.  Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs.  If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period.  We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done.  If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot.  In this case, our work is
+ * done for us, and we can simply return.  Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+       int firstsnap, s, snap, trycount = 0;
+
+       /* Note that atomic_inc_return() implies full memory barrier. */
+       firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
+       get_online_cpus();
+       /* Callers may be preemptible: raw_ avoids a DEBUG_PREEMPT splat. */
+       WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
+
+       /*
+        * Each pass attempts to force a context switch on each CPU.
+        */
+       while (try_stop_cpus(cpu_online_mask,
+                            synchronize_sched_expedited_cpu_stop,
+                            NULL) == -EAGAIN) {
+               put_online_cpus();
+
+               /* No joy, try again later.  Or just synchronize_sched(). */
+               if (trycount++ < 10) {
+                       udelay(trycount * num_online_cpus());
+               } else {
+                       synchronize_sched();
+                       return;
+               }
+
+               /* Check to see if someone else did our work for us. */
+               s = atomic_read(&sync_sched_expedited_done);
+               if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       return;
+               }
+
+               /*
+                * Refetching sync_sched_expedited_started allows later
+                * callers to piggyback on our grace period.  We subtract
+                * 1 to get the same token that the last incrementer got.
+                * We retry after they started, so our grace period works
+                * for them, and they started after our first try, so their
+                * grace period works for us.
+                */
+               get_online_cpus();
+               snap = atomic_read(&sync_sched_expedited_started);
+               smp_mb(); /* ensure read is before try_stop_cpus(). */
+       }
+
+       /*
+        * Everyone up to our most recent fetch is covered by our grace
+        * period.  Update the counter, but only if our work is still
+        * relevant -- which it won't be if someone who started later
+        * than we did beat us to the punch.
+        */
+       do {
+               s = atomic_read(&sync_sched_expedited_done);
+               if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       break;
+               }
+       } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
+       put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
  /*
   * Check to see if there is any immediate RCU-related work to be done
   * by the current CPU, for the specified type of RCU, returning 1 if so.