Merge branch 'timers/posix-cpu-timers-for-tglx' of
git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks into timers/core

author    Thomas Gleixner <tglx@linutronix.de>    Thu, 4 Jul 2013 21:11:22 +0000 (23:11 +0200)
committer Thomas Gleixner <tglx@linutronix.de>    Thu, 4 Jul 2013 21:11:22 +0000 (23:11 +0200)

Frederic said: "Most of these patches have been hanging around for
several months now, in -mmotm for a significant chunk. They already
missed a few releases."

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
arch/arm/Kconfig
arch/x86/kernel/kvmclock.c
arch/x86/platform/efi/efi.c
init/Kconfig
kernel/time/tick-broadcast.c
kernel/time/timekeeping.c

diff --combined arch/arm/Kconfig
index 53d3a356f61f877acebc12a22fc7b3d884a7cfc8,136f263ed47b79d010cf4ca06e7b1e2a07e4fd73..b02e6bbc1b46e9a73f555f291f884d953b9a44aa
@@@ -14,7 -14,6 +14,7 @@@ config AR
        select GENERIC_IRQ_PROBE
        select GENERIC_IRQ_SHOW
        select GENERIC_PCI_IOMAP
 +      select GENERIC_SCHED_CLOCK
        select GENERIC_SMP_IDLE_THREAD
        select GENERIC_IDLE_POLL_SETUP
        select GENERIC_STRNCPY_FROM_USER
@@@ -1088,6 -1087,20 +1088,20 @@@ if !MM
  source "arch/arm/Kconfig-nommu"
  endif
  
+ config PJ4B_ERRATA_4742
+       bool "PJ4B Errata 4742: IDLE Wake Up Commands can Cause the CPU Core to Cease Operation"
+       depends on CPU_PJ4B && MACH_ARMADA_370
+       default y
+       help
+         When coming out of either a Wait for Interrupt (WFI) or a Wait for
+         Event (WFE) IDLE state, a specific timing sensitivity exists between
+         the retiring WFI/WFE instructions and the newly issued subsequent
+         instructions.  This sensitivity can result in a CPU hang scenario.
+         Workaround:
+         The software must insert either a Data Synchronization Barrier (DSB)
+         or Data Memory Barrier (DMB) command immediately after the WFI/WFE
+         instruction.
+
  config ARM_ERRATA_326103
        bool "ARM errata: FSR write bit incorrect on a SWP to read-only memory"
        depends on CPU_V6
@@@ -1190,6 -1203,16 +1204,16 @@@ config PL310_ERRATA_58836
           is not correctly implemented in PL310 as clean lines are not
           invalidated as a result of these operations.
  
+ config ARM_ERRATA_643719
+       bool "ARM errata: LoUIS bit field in CLIDR register is incorrect"
+       depends on CPU_V7 && SMP
+       help
+         This option enables the workaround for the 643719 Cortex-A9 (prior to
+         r1p0) erratum. On affected cores the LoUIS bit field of the CLIDR
+         register returns zero when it should return one. The workaround
+         corrects this value, ensuring cache maintenance operations which use
+         it behave as intended and avoiding data corruption.
+
  config ARM_ERRATA_720789
        bool "ARM errata: TLBIASIDIS and TLBIMVAIS operations can broadcast a faulty ASID"
        depends on CPU_V7
@@@ -2007,7 -2030,7 +2031,7 @@@ config XIP_PHYS_ADD
  
  config KEXEC
        bool "Kexec system call (EXPERIMENTAL)"
-       depends on (!SMP || HOTPLUG_CPU)
+       depends on (!SMP || PM_SLEEP_SMP)
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
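The arch/arm/Kconfig hunk above only adds the option that gates the
erratum 4742 workaround; the fix itself, as the help text says, is a
barrier placed directly after the idle instruction. As a minimal sketch
(illustrative only, not part of this merge; the helper name is
hypothetical):

    /*
     * PJ4B erratum 4742: issue a DSB immediately after WFI/WFE so that
     * newly issued instructions cannot race the retiring idle
     * instruction and hang the core.
     */
    static inline void pj4b_do_wfi(void)
    {
            asm volatile(
                    "wfi\n\t"       /* enter idle (wait for interrupt) */
                    "dsb\n\t"       /* erratum 4742: barrier after wake-up */
                    ::: "memory");
    }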
diff --combined arch/x86/kernel/kvmclock.c
index 0db81ab511cc64f74b3b2e9a1443d7594a35e2f5,3dd37ebd591b36db493d449506d33a6b8915841f..1f354f4b602be9e251dbdc02dcf607c1c4463f89
@@@ -48,9 -48,10 +48,9 @@@ static struct pvclock_wall_clock wall_c
   * have elapsed since the hypervisor wrote the data. So we try to account for
   * that with system time
   */
 -static unsigned long kvm_get_wallclock(void)
 +static void kvm_get_wallclock(struct timespec *now)
  {
        struct pvclock_vcpu_time_info *vcpu_time;
 -      struct timespec ts;
        int low, high;
        int cpu;
  
        cpu = smp_processor_id();
  
        vcpu_time = &hv_clock[cpu].pvti;
 -      pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
 +      pvclock_read_wallclock(&wall_clock, vcpu_time, now);
  
        preempt_enable();
 -
 -      return ts.tv_sec;
  }
  
 -static int kvm_set_wallclock(unsigned long now)
 +static int kvm_set_wallclock(const struct timespec *now)
  {
        return -1;
  }
@@@ -239,6 -242,7 +239,7 @@@ void __init kvmclock_init(void
        if (!mem)
                return;
        hv_clock = __va(mem);
+       memset(hv_clock, 0, size);
  
        if (kvm_register_clock("boot clock")) {
                hv_clock = NULL;
diff --combined arch/x86/platform/efi/efi.c
index dd3b82530145a2d25b0897b5b0f532637e313fbc,d2fbcedcf6eaf2fd77179075c1be2c5c4d3d8347..90f6ed127096566ab06c0a9e6f25a355ccdfc048
@@@ -42,7 -42,6 +42,6 @@@
  #include <linux/io.h>
  #include <linux/reboot.h>
  #include <linux/bcd.h>
- #include <linux/ucs2_string.h>
  
  #include <asm/setup.h>
  #include <asm/efi.h>
  
  #define EFI_DEBUG     1
  
- /*
-  * There's some additional metadata associated with each
-  * variable. Intel's reference implementation is 60 bytes - bump that
-  * to account for potential alignment constraints
-  */
- #define VAR_METADATA_SIZE 64
+ #define EFI_MIN_RESERVE 5120
+ #define EFI_DUMMY_GUID \
+       EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
+ static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
  
  struct efi __read_mostly efi = {
        .mps        = EFI_INVALID_TABLE_ADDR,
@@@ -79,13 -78,6 +78,6 @@@ struct efi_memory_map memmap
  static struct efi efi_phys __initdata;
  static efi_system_table_t efi_systab __initdata;
  
- static u64 efi_var_store_size;
- static u64 efi_var_remaining_size;
- static u64 efi_var_max_var_size;
- static u64 boot_used_size;
- static u64 boot_var_size;
- static u64 active_size;
  unsigned long x86_efi_facility;
  
  /*
@@@ -188,53 -180,8 +180,8 @@@ static efi_status_t virt_efi_get_next_v
                                               efi_char16_t *name,
                                               efi_guid_t *vendor)
  {
-       efi_status_t status;
-       static bool finished = false;
-       static u64 var_size;
-       status = efi_call_virt3(get_next_variable,
-                               name_size, name, vendor);
-       if (status == EFI_NOT_FOUND) {
-               finished = true;
-               if (var_size < boot_used_size) {
-                       boot_var_size = boot_used_size - var_size;
-                       active_size += boot_var_size;
-               } else {
-                       printk(KERN_WARNING FW_BUG  "efi: Inconsistent initial sizes\n");
-               }
-       }
-       if (boot_used_size && !finished) {
-               unsigned long size;
-               u32 attr;
-               efi_status_t s;
-               void *tmp;
-               s = virt_efi_get_variable(name, vendor, &attr, &size, NULL);
-               if (s != EFI_BUFFER_TOO_SMALL || !size)
-                       return status;
-               tmp = kmalloc(size, GFP_ATOMIC);
-               if (!tmp)
-                       return status;
-               s = virt_efi_get_variable(name, vendor, &attr, &size, tmp);
-               if (s == EFI_SUCCESS && (attr & EFI_VARIABLE_NON_VOLATILE)) {
-                       var_size += size;
-                       var_size += ucs2_strsize(name, 1024);
-                       active_size += size;
-                       active_size += VAR_METADATA_SIZE;
-                       active_size += ucs2_strsize(name, 1024);
-               }
-               kfree(tmp);
-       }
-       return status;
+       return efi_call_virt3(get_next_variable,
+                             name_size, name, vendor);
  }
  
  static efi_status_t virt_efi_set_variable(efi_char16_t *name,
                                          unsigned long data_size,
                                          void *data)
  {
-       efi_status_t status;
-       u32 orig_attr = 0;
-       unsigned long orig_size = 0;
-       status = virt_efi_get_variable(name, vendor, &orig_attr, &orig_size,
-                                      NULL);
-       if (status != EFI_BUFFER_TOO_SMALL)
-               orig_size = 0;
-       status = efi_call_virt5(set_variable,
-                               name, vendor, attr,
-                               data_size, data);
-       if (status == EFI_SUCCESS) {
-               if (orig_size) {
-                       active_size -= orig_size;
-                       active_size -= ucs2_strsize(name, 1024);
-                       active_size -= VAR_METADATA_SIZE;
-               }
-               if (data_size) {
-                       active_size += data_size;
-                       active_size += ucs2_strsize(name, 1024);
-                       active_size += VAR_METADATA_SIZE;
-               }
-       }
-       return status;
+       return efi_call_virt5(set_variable,
+                             name, vendor, attr,
+                             data_size, data);
  }
  
  static efi_status_t virt_efi_query_variable_info(u32 attr,
@@@ -352,9 -274,8 +274,9 @@@ static efi_status_t __init phys_efi_get
        return status;
  }
  
 -int efi_set_rtc_mmss(unsigned long nowtime)
 +int efi_set_rtc_mmss(const struct timespec *now)
  {
 +      unsigned long nowtime = now->tv_sec;
        efi_status_t    status;
        efi_time_t      eft;
        efi_time_cap_t  cap;
        return 0;
  }
  
 -unsigned long efi_get_time(void)
 +void efi_get_time(struct timespec *now)
  {
        efi_status_t status;
        efi_time_t eft;
        if (status != EFI_SUCCESS)
                pr_err("Oops: efitime: can't read time!\n");
  
 -      return mktime(eft.year, eft.month, eft.day, eft.hour,
 -                    eft.minute, eft.second);
 +      now->tv_sec = mktime(eft.year, eft.month, eft.day, eft.hour,
 +                           eft.minute, eft.second);
 +      now->tv_nsec = 0;
  }
  
  /*
@@@ -788,9 -708,6 +710,6 @@@ void __init efi_init(void
        char vendor[100] = "unknown";
        int i = 0;
        void *tmp;
-       struct setup_data *data;
-       struct efi_var_bootdata *efi_var_data;
-       u64 pa_data;
  
  #ifdef CONFIG_X86_32
        if (boot_params.efi_info.efi_systab_hi ||
        if (efi_systab_init(efi_phys.systab))
                return;
  
-       pa_data = boot_params.hdr.setup_data;
-       while (pa_data) {
-               data = early_ioremap(pa_data, sizeof(*efi_var_data));
-               if (data->type == SETUP_EFI_VARS) {
-                       efi_var_data = (struct efi_var_bootdata *)data;
-                       efi_var_store_size = efi_var_data->store_size;
-                       efi_var_remaining_size = efi_var_data->remaining_size;
-                       efi_var_max_var_size = efi_var_data->max_var_size;
-               }
-               pa_data = data->next;
-               early_iounmap(data, sizeof(*efi_var_data));
-       }
-       boot_used_size = efi_var_store_size - efi_var_remaining_size;
        set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility);
  
        /*
@@@ -1087,6 -988,13 +990,13 @@@ void __init efi_enter_virtual_mode(void
                runtime_code_page_mkexec();
  
        kfree(new_memmap);
+       /* clean DUMMY object */
+       efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+                        EFI_VARIABLE_NON_VOLATILE |
+                        EFI_VARIABLE_BOOTSERVICE_ACCESS |
+                        EFI_VARIABLE_RUNTIME_ACCESS,
+                        0, NULL);
  }
  
  /*
@@@ -1138,33 -1046,70 +1048,70 @@@ efi_status_t efi_query_variable_store(u
        efi_status_t status;
        u64 storage_size, remaining_size, max_size;
  
+       if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
+               return 0;
        status = efi.query_variable_info(attributes, &storage_size,
                                         &remaining_size, &max_size);
        if (status != EFI_SUCCESS)
                return status;
  
-       if (!max_size && remaining_size > size)
-               printk_once(KERN_ERR FW_BUG "Broken EFI implementation"
-                           " is returning MaxVariableSize=0\n");
        /*
         * Some firmware implementations refuse to boot if there's insufficient
         * space in the variable store. We account for that by refusing the
         * write if permitting it would reduce the available space to under
-        * 50%. However, some firmware won't reclaim variable space until
-        * after the used (not merely the actively used) space drops below
-        * a threshold. We can approximate that case with the value calculated
-        * above. If both the firmware and our calculations indicate that the
-        * available space would drop below 50%, refuse the write.
+        * 5KB. This figure was provided by Samsung, so should be safe.
         */
+       if ((remaining_size - size < EFI_MIN_RESERVE) &&
+               !efi_no_storage_paranoia) {
+               /*
+                * Triggering garbage collection may require that the firmware
+                * generate a real EFI_OUT_OF_RESOURCES error. We can force
+                * that by attempting to use more space than is available.
+                */
+               unsigned long dummy_size = remaining_size + 1024;
+               void *dummy = kzalloc(dummy_size, GFP_ATOMIC);
+               if (!dummy)
+                       return EFI_OUT_OF_RESOURCES;
+               status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+                                         EFI_VARIABLE_NON_VOLATILE |
+                                         EFI_VARIABLE_BOOTSERVICE_ACCESS |
+                                         EFI_VARIABLE_RUNTIME_ACCESS,
+                                         dummy_size, dummy);
+               if (status == EFI_SUCCESS) {
+                       /*
+                        * This should have failed, so if it didn't make sure
+                        * that we delete it...
+                        */
+                       efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+                                        EFI_VARIABLE_NON_VOLATILE |
+                                        EFI_VARIABLE_BOOTSERVICE_ACCESS |
+                                        EFI_VARIABLE_RUNTIME_ACCESS,
+                                        0, dummy);
+               }
+               kfree(dummy);
  
-       if (!storage_size || size > remaining_size ||
-           (max_size && size > max_size))
-               return EFI_OUT_OF_RESOURCES;
+               /*
+                * The runtime code may now have triggered a garbage collection
+                * run, so check the variable info again
+                */
+               status = efi.query_variable_info(attributes, &storage_size,
+                                                &remaining_size, &max_size);
  
-       if (!efi_no_storage_paranoia &&
-           ((active_size + size + VAR_METADATA_SIZE > storage_size / 2) &&
-            (remaining_size - size < storage_size / 2)))
-               return EFI_OUT_OF_RESOURCES;
+               if (status != EFI_SUCCESS)
+                       return status;
+               /*
+                * There still isn't enough room, so return an error
+                */
+               if (remaining_size - size < EFI_MIN_RESERVE)
+                       return EFI_OUT_OF_RESOURCES;
+       }
  
        return EFI_SUCCESS;
  }
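To make the reserve policy above concrete: with, say, 6144 bytes of
variable store remaining, a 2048-byte SetVariable would leave 4096
bytes, below EFI_MIN_RESERVE (5120). The code then writes an oversized
DUMMY variable to coax the firmware into garbage collection, deletes it
if that write unexpectedly succeeds, and re-queries the variable info;
only if less than 5120 bytes would still remain free does it return
EFI_OUT_OF_RESOURCES.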
diff --combined init/Kconfig
index 1a3f93329a67b285410d5bdb4516594323bb0f89,2d9b83104dcf715197f1a84d34b892783ff79fe2..68174a5b831728187a696023c9ae07a80238b563
@@@ -431,6 -431,7 +431,7 @@@ choic
  config TREE_RCU
        bool "Tree-based hierarchical RCU"
        depends on !PREEMPT && SMP
+       select IRQ_WORK
        help
          This option selects the RCU implementation that is
          designed for very large SMP system with hundreds or
@@@ -757,9 -758,6 +758,9 @@@ config LOG_BUF_SHIF
  config HAVE_UNSTABLE_SCHED_CLOCK
        bool
  
 +config GENERIC_SCHED_CLOCK
 +      bool
 +
  #
  # For architectures that want to enable the support for NUMA-affine scheduler
  # balancing logic:
diff --combined kernel/time/tick-broadcast.c
index 4430fa695b48edbeadc632d06be8b922d6426ba6,20d6fba70652094324c13ef6f8e2fcf9bba26a50..6d3f91631de62cd94e2c216e96ac56092332e126
@@@ -19,7 -19,6 +19,7 @@@
  #include <linux/profile.h>
  #include <linux/sched.h>
  #include <linux/smp.h>
 +#include <linux/module.h>
  
  #include "tick-internal.h"
  
@@@ -30,7 -29,6 +30,7 @@@
  
  static struct tick_device tick_broadcast_device;
  static cpumask_var_t tick_broadcast_mask;
 +static cpumask_var_t tick_broadcast_on;
  static cpumask_var_t tmpmask;
  static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
  static int tick_broadcast_force;
@@@ -66,34 -64,17 +66,34 @@@ static void tick_broadcast_start_period
  /*
   * Check, if the device can be utilized as broadcast device:
   */
 -int tick_check_broadcast_device(struct clock_event_device *dev)
 +static bool tick_check_broadcast_device(struct clock_event_device *curdev,
 +                                      struct clock_event_device *newdev)
 +{
 +      if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
 +          (newdev->features & CLOCK_EVT_FEAT_C3STOP))
 +              return false;
 +
 +      if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
 +          !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
 +              return false;
 +
 +      return !curdev || newdev->rating > curdev->rating;
 +}
 +
 +/*
 + * Conditionally install/replace broadcast device
 + */
 +void tick_install_broadcast_device(struct clock_event_device *dev)
  {
        struct clock_event_device *cur = tick_broadcast_device.evtdev;
  
 -      if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
 -          (tick_broadcast_device.evtdev &&
 -           tick_broadcast_device.evtdev->rating >= dev->rating) ||
 -           (dev->features & CLOCK_EVT_FEAT_C3STOP))
 -              return 0;
 +      if (!tick_check_broadcast_device(cur, dev))
 +              return;
  
 -      clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
 +      if (!try_module_get(dev->owner))
 +              return;
 +
 +      clockevents_exchange_device(cur, dev);
        if (cur)
                cur->event_handler = clockevents_handle_noop;
        tick_broadcast_device.evtdev = dev;
         */
        if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
                tick_clock_notify();
 -      return 1;
  }
  
  /*
@@@ -141,9 -123,8 +141,9 @@@ static void tick_device_setup_broadcast
   */
  int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
  {
 +      struct clock_event_device *bc = tick_broadcast_device.evtdev;
        unsigned long flags;
 -      int ret = 0;
 +      int ret;
  
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
  
                dev->event_handler = tick_handle_periodic;
                tick_device_setup_broadcast_func(dev);
                cpumask_set_cpu(cpu, tick_broadcast_mask);
 -              tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
 +              tick_broadcast_start_periodic(bc);
                ret = 1;
        } else {
                /*
 -               * When the new device is not affected by the stop
 -               * feature and the cpu is marked in the broadcast mask
 -               * then clear the broadcast bit.
 +               * Clear the broadcast bit for this cpu if the
 +               * device is not power state affected.
                 */
 -              if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
 -                      int cpu = smp_processor_id();
 +              if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
                        cpumask_clear_cpu(cpu, tick_broadcast_mask);
 -                      tick_broadcast_clear_oneshot(cpu);
 -              } else {
 +              else
                        tick_device_setup_broadcast_func(dev);
 +
 +              /*
 +               * Clear the broadcast bit if the CPU is not in
 +               * periodic broadcast on state.
 +               */
 +              if (!cpumask_test_cpu(cpu, tick_broadcast_on))
 +                      cpumask_clear_cpu(cpu, tick_broadcast_mask);
 +
 +              switch (tick_broadcast_device.mode) {
 +              case TICKDEV_MODE_ONESHOT:
 +                      /*
 +                       * If the system is in oneshot mode we can
 +                       * unconditionally clear the oneshot mask bit,
 +                       * because the CPU is running and therefore
 +                       * not in an idle state which causes the power
 +                       * state affected device to stop. Let the
 +                       * caller initialize the device.
 +                       */
 +                      tick_broadcast_clear_oneshot(cpu);
 +                      ret = 0;
 +                      break;
 +
 +              case TICKDEV_MODE_PERIODIC:
 +                      /*
 +                       * If the system is in periodic mode, check
 +                       * whether the broadcast device can be
 +                       * switched off now.
 +                       */
 +                      if (cpumask_empty(tick_broadcast_mask) && bc)
 +                              clockevents_shutdown(bc);
 +                      /*
 +                       * If we kept the cpu in the broadcast mask,
 +                       * tell the caller to leave the per cpu device
 +                       * in shutdown state. The periodic interrupt
 +                       * is delivered by the broadcast device.
 +                       */
 +                      ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
 +                      break;
 +              default:
 +                      /* Nothing to do */
 +                      ret = 0;
 +                      break;
                }
        }
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@@ -339,7 -281,6 +339,7 @@@ static void tick_do_broadcast_on_off(un
        switch (*reason) {
        case CLOCK_EVT_NOTIFY_BROADCAST_ON:
        case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
 +              cpumask_set_cpu(cpu, tick_broadcast_on);
                if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
                        if (tick_broadcast_device.mode ==
                            TICKDEV_MODE_PERIODIC)
                        tick_broadcast_force = 1;
                break;
        case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
 -              if (!tick_broadcast_force &&
 -                  cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
 +              if (tick_broadcast_force)
 +                      break;
 +              cpumask_clear_cpu(cpu, tick_broadcast_on);
 +              if (!tick_device_is_functional(dev))
 +                      break;
 +              if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
                        if (tick_broadcast_device.mode ==
                            TICKDEV_MODE_PERIODIC)
                                tick_setup_periodic(dev, 0);
@@@ -412,7 -349,6 +412,7 @@@ void tick_shutdown_broadcast(unsigned i
  
        bc = tick_broadcast_device.evtdev;
        cpumask_clear_cpu(cpu, tick_broadcast_mask);
 +      cpumask_clear_cpu(cpu, tick_broadcast_on);
  
        if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
                if (bc && cpumask_empty(tick_broadcast_mask))
@@@ -539,15 -475,7 +539,15 @@@ void tick_check_oneshot_broadcast(int c
        if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
                struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
  
 -              clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
 +              /*
 +               * We might be in the middle of switching over from
 +               * periodic to oneshot. If the CPU has not yet
 +               * switched over, leave the device alone.
 +               */
 +              if (td->mode == TICKDEV_MODE_ONESHOT) {
 +                      clockevents_set_mode(td->evtdev,
 +                                           CLOCK_EVT_MODE_ONESHOT);
 +              }
        }
  }
  
@@@ -583,17 -511,16 +583,23 @@@ again
                }
        }
  
+       /*
+        * Remove the current cpu from the pending mask. The event is
+        * delivered immediately in tick_do_broadcast() !
+        */
+       cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);
        /* Take care of enforced broadcast requests */
        cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
        cpumask_clear(tick_broadcast_force_mask);
  
 +      /*
 +       * Sanity check. Catch the case where we try to broadcast to
 +       * offline cpus.
 +       */
 +      if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
 +              cpumask_and(tmpmask, tmpmask, cpu_online_mask);
 +
        /*
         * Wakeup the cpus which have an expired event.
         */
@@@ -654,8 -581,8 +660,8 @@@ void tick_broadcast_oneshot_control(uns
  
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
-               WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
                if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
+                       WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
                        clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
                        /*
                         * We only reprogram the broadcast timer if we
        } else {
                if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
                        clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
-                       if (dev->next_event.tv64 == KTIME_MAX)
-                               goto out;
                        /*
                         * The cpu which was handling the broadcast
                         * timer marked this cpu in the broadcast
                                       tick_broadcast_pending_mask))
                                goto out;
  
+                       /*
+                        * Bail out if there is no next event.
+                        */
+                       if (dev->next_event.tv64 == KTIME_MAX)
+                               goto out;
                        /*
                         * If the pending bit is not set, then we are
                         * either the CPU handling the broadcast
@@@ -771,10 -701,6 +780,6 @@@ void tick_broadcast_setup_oneshot(struc
  
                bc->event_handler = tick_handle_oneshot_broadcast;
  
-               /* Take the do_timer update */
-               if (!tick_nohz_full_cpu(cpu))
-                       tick_do_timer_cpu = cpu;
                /*
                 * We must be careful here. There might be other CPUs
                 * waiting for periodic broadcast. We need to set the
@@@ -835,12 -761,10 +840,12 @@@ void tick_shutdown_broadcast_oneshot(un
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
  
        /*
 -       * Clear the broadcast mask flag for the dead cpu, but do not
 -       * stop the broadcast device!
 +       * Clear the broadcast masks for the dead cpu, but do not stop
 +       * the broadcast device!
         */
        cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
 +      cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
 +      cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
  
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
  }
@@@ -868,7 -792,6 +873,7 @@@ bool tick_broadcast_oneshot_available(v
  void __init tick_broadcast_init(void)
  {
        zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
 +      zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
        zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
  #ifdef CONFIG_TICK_ONESHOT
        zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
diff --combined kernel/time/timekeeping.c
index 846d0a1f235e2ccf21e99a9170c97e26865cc439,baeeb5c87cf142a818122fac50140a0657f027bd..48b9fffabdc294a4bea16b9b78083a161e8eec42
  
  #include "tick-internal.h"
  #include "ntp_internal.h"
 +#include "timekeeping_internal.h"
 +
 +#define TK_CLEAR_NTP          (1 << 0)
 +#define TK_MIRROR             (1 << 1)
 +#define TK_CLOCK_WAS_SET      (1 << 2)
  
  static struct timekeeper timekeeper;
  static DEFINE_RAW_SPINLOCK(timekeeper_lock);
@@@ -205,9 -200,9 +205,9 @@@ static inline s64 timekeeping_get_ns_ra
  
  static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
  
 -static void update_pvclock_gtod(struct timekeeper *tk)
 +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
  {
 -      raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
 +      raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
  }
  
  /**
@@@ -221,7 -216,7 +221,7 @@@ int pvclock_gtod_register_notifier(stru
  
        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
 -      update_pvclock_gtod(tk);
 +      update_pvclock_gtod(tk, true);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
  
        return ret;
@@@ -246,16 -241,16 +246,16 @@@ int pvclock_gtod_unregister_notifier(st
  EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
  
  /* must hold timekeeper_lock */
 -static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror)
 +static void timekeeping_update(struct timekeeper *tk, unsigned int action)
  {
 -      if (clearntp) {
 +      if (action & TK_CLEAR_NTP) {
                tk->ntp_error = 0;
                ntp_clear();
        }
        update_vsyscall(tk);
 -      update_pvclock_gtod(tk);
 +      update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
  
 -      if (mirror)
 +      if (action & TK_MIRROR)
                memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
  }
  
@@@ -513,7 -508,7 +513,7 @@@ int do_settimeofday(const struct timesp
  
        tk_set_xtime(tk, tv);
  
 -      timekeeping_update(tk, true, true);
 +      timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
  
        write_seqcount_end(&timekeeper_seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@@ -557,7 -552,7 +557,7 @@@ int timekeeping_inject_offset(struct ti
        tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
  
  error: /* even if we error out, we forwarded the time, so call update */
 -      timekeeping_update(tk, true, true);
 +      timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
  
        write_seqcount_end(&timekeeper_seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@@ -632,22 -627,13 +632,22 @@@ static int change_clocksource(void *dat
        write_seqcount_begin(&timekeeper_seq);
  
        timekeeping_forward_now(tk);
 -      if (!new->enable || new->enable(new) == 0) {
 -              old = tk->clock;
 -              tk_setup_internals(tk, new);
 -              if (old->disable)
 -                      old->disable(old);
 +      /*
 +       * If the cs is in module, get a module reference. Succeeds
 +       * for built-in code (owner == NULL) as well.
 +       */
 +      if (try_module_get(new->owner)) {
 +              if (!new->enable || new->enable(new) == 0) {
 +                      old = tk->clock;
 +                      tk_setup_internals(tk, new);
 +                      if (old->disable)
 +                              old->disable(old);
 +                      module_put(old->owner);
 +              } else {
 +                      module_put(new->owner);
 +              }
        }
 -      timekeeping_update(tk, true, true);
 +      timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
  
        write_seqcount_end(&timekeeper_seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
   * This function is called from clocksource.c after a new, better clock
   * source has been registered. The caller holds the clocksource_mutex.
   */
 -void timekeeping_notify(struct clocksource *clock)
 +int timekeeping_notify(struct clocksource *clock)
  {
        struct timekeeper *tk = &timekeeper;
  
        if (tk->clock == clock)
 -              return;
 +              return 0;
        stop_machine(change_clocksource, clock, NULL);
        tick_clock_notify();
 +      return tk->clock == clock ? 0 : -1;
  }
  
  /**
@@@ -856,7 -841,6 +856,7 @@@ static void __timekeeping_inject_sleept
        tk_xtime_add(tk, delta);
        tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
        tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
 +      tk_debug_account_sleep_time(delta);
  }
  
  /**
@@@ -888,7 -872,7 +888,7 @@@ void timekeeping_inject_sleeptime(struc
  
        __timekeeping_inject_sleeptime(tk, delta);
  
 -      timekeeping_update(tk, true, true);
 +      timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
  
        write_seqcount_end(&timekeeper_seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@@ -970,7 -954,7 +970,7 @@@ static void timekeeping_resume(void
        tk->cycle_last = clock->cycle_last = cycle_now;
        tk->ntp_error = 0;
        timekeeping_suspended = 0;
 -      timekeeping_update(tk, false, true);
 +      timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
        write_seqcount_end(&timekeeper_seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
  
@@@ -991,6 -975,14 +991,14 @@@ static int timekeeping_suspend(void
  
        read_persistent_clock(&timekeeping_suspend_time);
  
+       /*
+        * On some systems the persistent_clock can not be detected at
+        * timekeeping_init by its return value, so if we see a valid
+        * value returned, update the persistent_clock_exists flag.
+        */
+       if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
+               persistent_clock_exist = true;
        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&timekeeper_seq);
        timekeeping_forward_now(tk);
@@@ -1244,10 -1236,9 +1252,10 @@@ out_adjust
   * It also calls into the NTP code to handle leapsecond processing.
   *
   */
 -static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
 +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
  {
        u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
 +      unsigned int action = 0;
  
        while (tk->xtime_nsec >= nsecps) {
                int leap;
                        __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
  
                        clock_was_set_delayed();
 +                      action = TK_CLOCK_WAS_SET;
                }
        }
 +      return action;
  }
  
  /**
@@@ -1358,7 -1347,6 +1366,7 @@@ static void update_wall_time(void
        struct timekeeper *tk = &shadow_timekeeper;
        cycle_t offset;
        int shift = 0, maxshift;
 +      unsigned int action;
        unsigned long flags;
  
        raw_spin_lock_irqsave(&timekeeper_lock, flags);
         * Finally, make sure that after the rounding
         * xtime_nsec isn't larger than NSEC_PER_SEC
         */
 -      accumulate_nsecs_to_secs(tk);
 +      action = accumulate_nsecs_to_secs(tk);
  
        write_seqcount_begin(&timekeeper_seq);
        /* Update clock->cycle_last with the new value */
         * updating.
         */
        memcpy(real_tk, tk, sizeof(*tk));
 -      timekeeping_update(real_tk, false, false);
 +      timekeeping_update(real_tk, action);
        write_seqcount_end(&timekeeper_seq);
  out:
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@@ -1689,7 -1677,6 +1697,7 @@@ int do_adjtimex(struct timex *txc
  
        if (tai != orig_tai) {
                __timekeeping_set_tai_offset(tk, tai);
 +              update_pvclock_gtod(tk, true);
                clock_was_set_delayed();
        }
        write_seqcount_end(&timekeeper_seq);
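One consequence of the update_pvclock_gtod() change earlier in this
file: pvclock_gtod notifiers now receive a non-zero action when the
clock was set, instead of always 0. A hypothetical listener (sketch
only; the callback name and body are ours), registered via
pvclock_gtod_register_notifier(), might consume it like this:

    static int my_pvclock_gtod_notify(struct notifier_block *nb,
                                      unsigned long was_set, void *priv)
    {
            /* priv is the struct timekeeper the chain was run with */
            if (was_set) {
                    /* wall clock jumped: resynchronize any cached wall time */
            }
            return NOTIFY_OK;
    }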