[PATCH] cpu state clean after hot remove
authorLi Shaohua <shaohua.li@intel.com>
Sat, 25 Jun 2005 21:54:56 +0000 (14:54 -0700)
committerLinus Torvalds <torvalds@ppc970.osdl.org>
Sat, 25 Jun 2005 23:24:30 +0000 (16:24 -0700)
Clean CPU states in order to reuse smp boot code for CPU hotplug.

Signed-off-by: Li Shaohua<shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
arch/i386/kernel/cpu/common.c
arch/i386/kernel/irq.c
arch/i386/kernel/process.c
arch/i386/kernel/smpboot.c
drivers/base/cpu.c
include/asm-i386/irq.h
include/asm-i386/smp.h

index aac74758caf498a423ad0734d4a8682cad58e739..2203a9d2021251a59776109064d10d558b898574 100644 (file)
@@ -651,3 +651,15 @@ void __devinit cpu_init(void)
        clear_used_math();
        mxcsr_feature_mask_init();
 }
+
+#ifdef CONFIG_HOTPLUG_CPU
+void __devinit cpu_uninit(void)
+{
+       int cpu = raw_smp_processor_id();
+       cpu_clear(cpu, cpu_initialized);
+
+       /* lazy TLB state */
+       per_cpu(cpu_tlbstate, cpu).state = 0;
+       per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
+}
+#endif
index af115004aec58ae383bbfdb628a5d29e0410dff8..ce66dcc26d9072cf14bea63c83ae1db8ddde1934 100644 (file)
@@ -156,6 +156,11 @@ void irq_ctx_init(int cpu)
                cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
 }
 
+void irq_ctx_exit(int cpu)
+{
+       hardirq_ctx[cpu] = NULL;
+}
+
 extern asmlinkage void __do_softirq(void);
 
 asmlinkage void do_softirq(void)
index e06f2dc7123dffc8a7f5decb662543c4603a2831..5f8cfa6b794085b9fb224e229398369bc0f4c64a 100644 (file)
@@ -152,21 +152,19 @@ static void poll_idle (void)
 /* We don't actually take CPU down, just spin without interrupts. */
 static inline void play_dead(void)
 {
+       /* This must be done before dead CPU ack */
+       cpu_exit_clear();
+       wbinvd();
+       mb();
        /* Ack it */
        __get_cpu_var(cpu_state) = CPU_DEAD;
 
-       /* We shouldn't have to disable interrupts while dead, but
-        * some interrupts just don't seem to go away, and this makes
-        * it "work" for testing purposes. */
-       /* Death loop */
-       while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
-               cpu_relax();
-
+       /*
+        * With physical CPU hotplug, we should halt the cpu
+        */
        local_irq_disable();
-       __flush_tlb_all();
-       cpu_set(smp_processor_id(), cpu_online_map);
-       enable_APIC_timer();
-       local_irq_enable();
+       while (1)
+               __asm__ __volatile__("hlt":::"memory");
 }
 #else
 static inline void play_dead(void)
index fb0b200d1d851041c4c49e541afa44653ffff014..d66bf489a2e90782044a09e2562fb84d06d893a0 100644 (file)
@@ -90,6 +90,12 @@ cpumask_t cpu_callout_map;
 EXPORT_SYMBOL(cpu_callout_map);
 static cpumask_t smp_commenced_mask;
 
+/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
+ * is no way to resync one AP against BP. TBD: for prescott and above, we
+ * should use IA64's algorithm
+ */
+static int __devinitdata tsc_sync_disabled;
+
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_data);
@@ -427,7 +433,7 @@ static void __devinit smp_callin(void)
        /*
         *      Synchronize the TSC with the BP
         */
-       if (cpu_has_tsc && cpu_khz)
+       if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
                synchronize_tsc_ap();
 }
 
@@ -507,6 +513,7 @@ static void __devinit start_secondary(void *unused)
        lock_ipi_call_lock();
        cpu_set(smp_processor_id(), cpu_online_map);
        unlock_ipi_call_lock();
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 
        /* We can take interrupts now: we're officially "up". */
        local_irq_enable();
@@ -816,8 +823,43 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 #endif /* WAKE_SECONDARY_VIA_INIT */
 
 extern cpumask_t cpu_initialized;
+static inline int alloc_cpu_id(void)
+{
+       cpumask_t       tmp_map;
+       int cpu;
+       cpus_complement(tmp_map, cpu_present_map);
+       cpu = first_cpu(tmp_map);
+       if (cpu >= NR_CPUS)
+               return -ENODEV;
+       return cpu;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
+static inline struct task_struct * alloc_idle_task(int cpu)
+{
+       struct task_struct *idle;
+
+       if ((idle = cpu_idle_tasks[cpu]) != NULL) {
+               /* initialize thread_struct.  we really want to avoid destroy
+                * idle tread
+                */
+               idle->thread.esp = (unsigned long)(((struct pt_regs *)
+                       (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1);
+               init_idle(idle, cpu);
+               return idle;
+       }
+       idle = fork_idle(cpu);
+
+       if (!IS_ERR(idle))
+               cpu_idle_tasks[cpu] = idle;
+       return idle;
+}
+#else
+#define alloc_idle_task(cpu) fork_idle(cpu)
+#endif
 
-static int __devinit do_boot_cpu(int apicid)
+static int __devinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -826,16 +868,17 @@ static int __devinit do_boot_cpu(int apicid)
 {
        struct task_struct *idle;
        unsigned long boot_error;
-       int timeout, cpu;
+       int timeout;
        unsigned long start_eip;
        unsigned short nmi_high = 0, nmi_low = 0;
 
-       cpu = ++cpucount;
+       ++cpucount;
+
        /*
         * We can't use kernel_thread since we must avoid to
         * reschedule the child.
         */
-       idle = fork_idle(cpu);
+       idle = alloc_idle_task(cpu);
        if (IS_ERR(idle))
                panic("failed fork for CPU %d", cpu);
        idle->thread.eip = (unsigned long) start_secondary;
@@ -902,13 +945,16 @@ static int __devinit do_boot_cpu(int apicid)
                        inquire_remote_apic(apicid);
                }
        }
-       x86_cpu_to_apicid[cpu] = apicid;
+
        if (boot_error) {
                /* Try to put things back the way they were before ... */
                unmap_cpu_to_logical_apicid(cpu);
                cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
                cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
                cpucount--;
+       } else {
+               x86_cpu_to_apicid[cpu] = apicid;
+               cpu_set(cpu, cpu_present_map);
        }
 
        /* mark "stuck" area as not stuck */
@@ -917,6 +963,75 @@ static int __devinit do_boot_cpu(int apicid)
        return boot_error;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+void cpu_exit_clear(void)
+{
+       int cpu = raw_smp_processor_id();
+
+       idle_task_exit();
+
+       cpucount --;
+       cpu_uninit();
+       irq_ctx_exit(cpu);
+
+       cpu_clear(cpu, cpu_callout_map);
+       cpu_clear(cpu, cpu_callin_map);
+       cpu_clear(cpu, cpu_present_map);
+
+       cpu_clear(cpu, smp_commenced_mask);
+       unmap_cpu_to_logical_apicid(cpu);
+}
+
+struct warm_boot_cpu_info {
+       struct completion *complete;
+       int apicid;
+       int cpu;
+};
+
+static void __devinit do_warm_boot_cpu(void *p)
+{
+       struct warm_boot_cpu_info *info = p;
+       do_boot_cpu(info->apicid, info->cpu);
+       complete(info->complete);
+}
+
+int __devinit smp_prepare_cpu(int cpu)
+{
+       DECLARE_COMPLETION(done);
+       struct warm_boot_cpu_info info;
+       struct work_struct task;
+       int     apicid, ret;
+
+       lock_cpu_hotplug();
+       apicid = x86_cpu_to_apicid[cpu];
+       if (apicid == BAD_APICID) {
+               ret = -ENODEV;
+               goto exit;
+       }
+
+       info.complete = &done;
+       info.apicid = apicid;
+       info.cpu = cpu;
+       INIT_WORK(&task, do_warm_boot_cpu, &info);
+
+       tsc_sync_disabled = 1;
+
+       /* init low mem mapping */
+       memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+                       sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS);
+       flush_tlb_all();
+       schedule_work(&task);
+       wait_for_completion(&done);
+
+       tsc_sync_disabled = 0;
+       zap_low_mappings();
+       ret = 0;
+exit:
+       unlock_cpu_hotplug();
+       return ret;
+}
+#endif
+
 static void smp_tune_scheduling (void)
 {
        unsigned long cachesize;       /* kB   */
@@ -1069,7 +1184,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
                if (max_cpus <= cpucount+1)
                        continue;
 
-               if (do_boot_cpu(apicid))
+               if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
                        printk("CPU #%d not responding - cannot use it.\n",
                                                                apicid);
                else
@@ -1149,25 +1264,24 @@ void __devinit smp_prepare_boot_cpu(void)
 {
        cpu_set(smp_processor_id(), cpu_online_map);
        cpu_set(smp_processor_id(), cpu_callout_map);
+       cpu_set(smp_processor_id(), cpu_present_map);
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-
-/* must be called with the cpucontrol mutex held */
-static int __devinit cpu_enable(unsigned int cpu)
+static void
+remove_siblinginfo(int cpu)
 {
-       /* get the target out of its holding state */
-       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-       wmb();
-
-       /* wait for the processor to ack it. timeout? */
-       while (!cpu_online(cpu))
-               cpu_relax();
-
-       fixup_irqs(cpu_online_map);
-       /* counter the disable in fixup_irqs() */
-       local_irq_enable();
-       return 0;
+       int sibling;
+
+       for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+               cpu_clear(cpu, cpu_sibling_map[sibling]);
+       for_each_cpu_mask(sibling, cpu_core_map[cpu])
+               cpu_clear(cpu, cpu_core_map[sibling]);
+       cpus_clear(cpu_sibling_map[cpu]);
+       cpus_clear(cpu_core_map[cpu]);
+       phys_proc_id[cpu] = BAD_APICID;
+       cpu_core_id[cpu] = BAD_APICID;
 }
 
 int __cpu_disable(void)
@@ -1193,6 +1307,8 @@ int __cpu_disable(void)
        mdelay(1);
        local_irq_disable();
 
+       remove_siblinginfo(cpu);
+
        cpu_clear(cpu, map);
        fixup_irqs(map);
        /* It's now safe to remove this processor from the online map */
@@ -1207,8 +1323,10 @@ void __cpu_die(unsigned int cpu)
 
        for (i = 0; i < 10; i++) {
                /* They ack this in play_dead by setting CPU_DEAD */
-               if (per_cpu(cpu_state, cpu) == CPU_DEAD)
+               if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+                       printk ("CPU %d is now offline\n", cpu);
                        return;
+               }
                current->state = TASK_UNINTERRUPTIBLE;
                schedule_timeout(HZ/10);
        }
@@ -1236,15 +1354,8 @@ int __devinit __cpu_up(unsigned int cpu)
                return -EIO;
        }
 
-#ifdef CONFIG_HOTPLUG_CPU
-       /* Already up, and in cpu_quiescent now? */
-       if (cpu_isset(cpu, smp_commenced_mask)) {
-               cpu_enable(cpu);
-               return 0;
-       }
-#endif
-
        local_irq_enable();
+       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
        /* Unleash the CPU! */
        cpu_set(cpu, smp_commenced_mask);
        while (!cpu_isset(cpu, cpu_online_map))
@@ -1258,10 +1369,12 @@ void __init smp_cpus_done(unsigned int max_cpus)
        setup_ioapic_dest();
 #endif
        zap_low_mappings();
+#ifndef CONFIG_HOTPLUG_CPU
        /*
         * Disable executability of the SMP trampoline:
         */
        set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+#endif
 }
 
 void __init smp_intr_init(void)
index 6ef3069b57107f2e12c5d2c7f238b77f5dacd97c..bdd7e9f55c81fac3e19d76ddb376972211d48779 100644 (file)
@@ -16,6 +16,10 @@ struct sysdev_class cpu_sysdev_class = {
 EXPORT_SYMBOL(cpu_sysdev_class);
 
 #ifdef CONFIG_HOTPLUG_CPU
+#ifndef __HAVE_ARCH_SMP_PREPARE_CPU
+#define smp_prepare_cpu(cpu) (0)
+#endif
+
 static ssize_t show_online(struct sys_device *dev, char *buf)
 {
        struct cpu *cpu = container_of(dev, struct cpu, sysdev);
@@ -36,7 +40,9 @@ static ssize_t store_online(struct sys_device *dev, const char *buf,
                        kobject_hotplug(&dev->kobj, KOBJ_OFFLINE);
                break;
        case '1':
-               ret = cpu_up(cpu->sysdev.id);
+               ret = smp_prepare_cpu(cpu->sysdev.id);
+               if (ret == 0)
+                       ret = cpu_up(cpu->sysdev.id);
                break;
        default:
                ret = -EINVAL;
index e2d8bf23ad701e92081848b8262af987c537953f..270f1986b19f781ff7788a126d2d1486b5e5bfbc 100644 (file)
@@ -29,9 +29,11 @@ extern void release_vm86_irqs(struct task_struct *);
 
 #ifdef CONFIG_4KSTACKS
   extern void irq_ctx_init(int cpu);
+  extern void irq_ctx_exit(int cpu);
 # define __ARCH_HAS_DO_SOFTIRQ
 #else
 # define irq_ctx_init(cpu) do { } while (0)
+# define irq_ctx_exit(cpu) do { } while (0)
 #endif
 
 #ifdef CONFIG_IRQBALANCE
index 2451ead0ca5c92d134f80cfb1f63fd161a47a758..c9996eda54083979a841d8ca058ad5aaaf44c528 100644 (file)
@@ -48,6 +48,14 @@ extern void unlock_ipi_call_lock(void);
 #define MAX_APICID 256
 extern u8 x86_cpu_to_apicid[];
 
+#ifdef CONFIG_HOTPLUG_CPU
+extern void cpu_exit_clear(void);
+extern void cpu_uninit(void);
+
+#define __HAVE_ARCH_SMP_PREPARE_CPU
+extern int smp_prepare_cpu(int cpu);
+#endif
+
 /*
  * This function is needed by all SMP systems. It must _always_ be valid
  * from the initial startup. We map APIC_BASE very early in page_setup(),