x86_64: move kernel
authorThomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:17:24 +0000 (11:17 +0200)
committerThomas Gleixner <tglx@linutronix.de>
Thu, 11 Oct 2007 09:17:24 +0000 (11:17 +0200)
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
136 files changed:
arch/x86/boot/compressed/head_64.S
arch/x86/kernel/Makefile
arch/x86/kernel/Makefile_32
arch/x86/kernel/Makefile_64 [new file with mode: 0644]
arch/x86/kernel/acpi/wakeup_64.S
arch/x86/kernel/aperture_64.c [new file with mode: 0644]
arch/x86/kernel/apic_64.c [new file with mode: 0644]
arch/x86/kernel/asm-offsets_64.c [new file with mode: 0644]
arch/x86/kernel/audit_64.c [new file with mode: 0644]
arch/x86/kernel/bugs_64.c [new file with mode: 0644]
arch/x86/kernel/crash_64.c [new file with mode: 0644]
arch/x86/kernel/crash_dump_64.c [new file with mode: 0644]
arch/x86/kernel/e820_64.c [new file with mode: 0644]
arch/x86/kernel/early-quirks_64.c [new file with mode: 0644]
arch/x86/kernel/early_printk.c
arch/x86/kernel/entry_64.S [new file with mode: 0644]
arch/x86/kernel/genapic_64.c [new file with mode: 0644]
arch/x86/kernel/genapic_flat_64.c [new file with mode: 0644]
arch/x86/kernel/head64.c [new file with mode: 0644]
arch/x86/kernel/head_64.S [new file with mode: 0644]
arch/x86/kernel/hpet_64.c [new file with mode: 0644]
arch/x86/kernel/i387_64.c [new file with mode: 0644]
arch/x86/kernel/i8259_64.c [new file with mode: 0644]
arch/x86/kernel/init_task_64.c [new file with mode: 0644]
arch/x86/kernel/io_apic_64.c [new file with mode: 0644]
arch/x86/kernel/ioport_64.c [new file with mode: 0644]
arch/x86/kernel/irq_64.c [new file with mode: 0644]
arch/x86/kernel/k8.c [new file with mode: 0644]
arch/x86/kernel/kprobes_64.c [new file with mode: 0644]
arch/x86/kernel/ldt_64.c [new file with mode: 0644]
arch/x86/kernel/machine_kexec_64.c [new file with mode: 0644]
arch/x86/kernel/mce_64.c [new file with mode: 0644]
arch/x86/kernel/mce_amd_64.c [new file with mode: 0644]
arch/x86/kernel/mce_intel_64.c [new file with mode: 0644]
arch/x86/kernel/module_64.c [new file with mode: 0644]
arch/x86/kernel/mpparse_64.c [new file with mode: 0644]
arch/x86/kernel/nmi_64.c [new file with mode: 0644]
arch/x86/kernel/pci-calgary_64.c [new file with mode: 0644]
arch/x86/kernel/pci-dma_64.c [new file with mode: 0644]
arch/x86/kernel/pci-gart_64.c [new file with mode: 0644]
arch/x86/kernel/pci-nommu_64.c [new file with mode: 0644]
arch/x86/kernel/pci-swiotlb_64.c [new file with mode: 0644]
arch/x86/kernel/pmtimer_64.c [new file with mode: 0644]
arch/x86/kernel/process_64.c [new file with mode: 0644]
arch/x86/kernel/ptrace_64.c [new file with mode: 0644]
arch/x86/kernel/reboot_64.c [new file with mode: 0644]
arch/x86/kernel/relocate_kernel_64.S [new file with mode: 0644]
arch/x86/kernel/setup64.c [new file with mode: 0644]
arch/x86/kernel/setup_64.c [new file with mode: 0644]
arch/x86/kernel/signal_64.c [new file with mode: 0644]
arch/x86/kernel/smp_64.c [new file with mode: 0644]
arch/x86/kernel/smpboot_64.c [new file with mode: 0644]
arch/x86/kernel/stacktrace.c [new file with mode: 0644]
arch/x86/kernel/suspend_64.c [new file with mode: 0644]
arch/x86/kernel/suspend_asm_64.S [new file with mode: 0644]
arch/x86/kernel/sys_x86_64.c [new file with mode: 0644]
arch/x86/kernel/syscall_64.c [new file with mode: 0644]
arch/x86/kernel/tce_64.c [new file with mode: 0644]
arch/x86/kernel/time_64.c [new file with mode: 0644]
arch/x86/kernel/trampoline_64.S [new file with mode: 0644]
arch/x86/kernel/traps_64.c [new file with mode: 0644]
arch/x86/kernel/tsc_64.c [new file with mode: 0644]
arch/x86/kernel/tsc_sync.c
arch/x86/kernel/verify_cpu_64.S [new file with mode: 0644]
arch/x86/kernel/vmlinux_64.lds.S [new file with mode: 0644]
arch/x86/kernel/vsmp_64.c [new file with mode: 0644]
arch/x86/kernel/vsyscall_64.c [new file with mode: 0644]
arch/x86/kernel/x8664_ksyms_64.c [new file with mode: 0644]
arch/x86_64/Makefile
arch/x86_64/kernel/Makefile [deleted file]
arch/x86_64/kernel/Makefile_64 [deleted file]
arch/x86_64/kernel/aperture_64.c [deleted file]
arch/x86_64/kernel/apic_64.c [deleted file]
arch/x86_64/kernel/asm-offsets.c [deleted file]
arch/x86_64/kernel/asm-offsets_64.c [deleted file]
arch/x86_64/kernel/audit_64.c [deleted file]
arch/x86_64/kernel/bugs_64.c [deleted file]
arch/x86_64/kernel/crash_64.c [deleted file]
arch/x86_64/kernel/crash_dump_64.c [deleted file]
arch/x86_64/kernel/e820_64.c [deleted file]
arch/x86_64/kernel/early-quirks_64.c [deleted file]
arch/x86_64/kernel/early_printk.c [deleted file]
arch/x86_64/kernel/entry_64.S [deleted file]
arch/x86_64/kernel/genapic_64.c [deleted file]
arch/x86_64/kernel/genapic_flat_64.c [deleted file]
arch/x86_64/kernel/head64.c [deleted file]
arch/x86_64/kernel/head_64.S [deleted file]
arch/x86_64/kernel/hpet_64.c [deleted file]
arch/x86_64/kernel/i387_64.c [deleted file]
arch/x86_64/kernel/i8259_64.c [deleted file]
arch/x86_64/kernel/init_task_64.c [deleted file]
arch/x86_64/kernel/io_apic_64.c [deleted file]
arch/x86_64/kernel/ioport_64.c [deleted file]
arch/x86_64/kernel/irq_64.c [deleted file]
arch/x86_64/kernel/k8.c [deleted file]
arch/x86_64/kernel/kprobes_64.c [deleted file]
arch/x86_64/kernel/ldt_64.c [deleted file]
arch/x86_64/kernel/machine_kexec_64.c [deleted file]
arch/x86_64/kernel/mce_64.c [deleted file]
arch/x86_64/kernel/mce_amd_64.c [deleted file]
arch/x86_64/kernel/mce_intel_64.c [deleted file]
arch/x86_64/kernel/module_64.c [deleted file]
arch/x86_64/kernel/mpparse_64.c [deleted file]
arch/x86_64/kernel/nmi_64.c [deleted file]
arch/x86_64/kernel/pci-calgary_64.c [deleted file]
arch/x86_64/kernel/pci-dma_64.c [deleted file]
arch/x86_64/kernel/pci-gart_64.c [deleted file]
arch/x86_64/kernel/pci-nommu_64.c [deleted file]
arch/x86_64/kernel/pci-swiotlb_64.c [deleted file]
arch/x86_64/kernel/pmtimer_64.c [deleted file]
arch/x86_64/kernel/process_64.c [deleted file]
arch/x86_64/kernel/ptrace_64.c [deleted file]
arch/x86_64/kernel/reboot_64.c [deleted file]
arch/x86_64/kernel/relocate_kernel_64.S [deleted file]
arch/x86_64/kernel/setup64.c [deleted file]
arch/x86_64/kernel/setup_64.c [deleted file]
arch/x86_64/kernel/signal_64.c [deleted file]
arch/x86_64/kernel/smp_64.c [deleted file]
arch/x86_64/kernel/smpboot_64.c [deleted file]
arch/x86_64/kernel/stacktrace.c [deleted file]
arch/x86_64/kernel/suspend_64.c [deleted file]
arch/x86_64/kernel/suspend_asm_64.S [deleted file]
arch/x86_64/kernel/sys_x86_64.c [deleted file]
arch/x86_64/kernel/syscall_64.c [deleted file]
arch/x86_64/kernel/tce_64.c [deleted file]
arch/x86_64/kernel/time_64.c [deleted file]
arch/x86_64/kernel/trampoline_64.S [deleted file]
arch/x86_64/kernel/traps_64.c [deleted file]
arch/x86_64/kernel/tsc_64.c [deleted file]
arch/x86_64/kernel/tsc_sync.c [deleted file]
arch/x86_64/kernel/verify_cpu_64.S [deleted file]
arch/x86_64/kernel/vmlinux.lds.S [deleted file]
arch/x86_64/kernel/vmlinux_64.lds.S [deleted file]
arch/x86_64/kernel/vsmp_64.c [deleted file]
arch/x86_64/kernel/vsyscall_64.c [deleted file]
arch/x86_64/kernel/x8664_ksyms_64.c [deleted file]

index cff3d1dc5dd453eea0b71ff40e804b235dc42d5b..49467640751fc0eb0a67c96ca7e9fa0b3d973fb0 100644 (file)
@@ -174,7 +174,7 @@ no_longmode:
        hlt
        jmp     1b
 
-#include "../../../x86_64/kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu_64.S"
 
        /* Be careful here startup_64 needs to be at a predictable
         * address so I can export it in an ELF header.  Bootloaders
index 577d08f4b8bbfcd898271a4e014a6a26c134e7a2..45855c97923ee17a9a58b2bd188664d257cda2f1 100644 (file)
@@ -1,5 +1,5 @@
 ifeq ($(CONFIG_X86_32),y)
 include ${srctree}/arch/x86/kernel/Makefile_32
 else
-include ${srctree}/arch/x86_64/kernel/Makefile_64
+include ${srctree}/arch/x86/kernel/Makefile_64
 endif
index 5096f486d389592378e1b6c2d2742c0e19c82393..cb25523026a6447d61f2312d110c9d063a07f337 100644 (file)
@@ -83,6 +83,4 @@ $(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
                        $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
        $(call if_changed,syscall)
 
-k8-y                      += ../../x86_64/kernel/k8.o
-stacktrace-y             += ../../x86_64/kernel/stacktrace.o
 
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
new file mode 100644 (file)
index 0000000..6e6b590
--- /dev/null
@@ -0,0 +1,54 @@
+#
+# Makefile for the linux kernel.
+#
+
+extra-y        := head_64.o head64.o init_task_64.o vmlinux.lds
+EXTRA_AFLAGS   := -traditional
+obj-y  := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
+               ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
+               x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
+               setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
+               pci-dma_64.o pci-nommu_64.o alternative.o hpet_64.o tsc_64.o bugs_64.o \
+               perfctr-watchdog.o
+
+obj-$(CONFIG_STACKTRACE)       += stacktrace.o
+obj-$(CONFIG_X86_MCE)          += mce_64.o therm_throt.o
+obj-$(CONFIG_X86_MCE_INTEL)    += mce_intel_64.o
+obj-$(CONFIG_X86_MCE_AMD)      += mce_amd_64.o
+obj-$(CONFIG_MTRR)             += ../../x86/kernel/cpu/mtrr/
+obj-$(CONFIG_ACPI)             += ../../x86/kernel/acpi/
+obj-$(CONFIG_X86_MSR)          += msr.o
+obj-$(CONFIG_MICROCODE)                += microcode.o
+obj-$(CONFIG_X86_CPUID)                += cpuid.o
+obj-$(CONFIG_SMP)              += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
+obj-y                          += apic_64.o  nmi_64.o
+obj-y                          += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
+obj-$(CONFIG_KEXEC)            += machine_kexec_64.o relocate_kernel_64.o crash_64.o
+obj-$(CONFIG_CRASH_DUMP)       += crash_dump_64.o
+obj-$(CONFIG_PM)               += suspend_64.o
+obj-$(CONFIG_HIBERNATION)      += suspend_asm_64.o
+obj-$(CONFIG_CPU_FREQ)         += ../../x86/kernel/cpu/cpufreq/
+obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
+obj-$(CONFIG_IOMMU)            += pci-gart_64.o aperture_64.o
+obj-$(CONFIG_CALGARY_IOMMU)    += pci-calgary_64.o tce_64.o
+obj-$(CONFIG_SWIOTLB)          += pci-swiotlb_64.o
+obj-$(CONFIG_KPROBES)          += kprobes_64.o
+obj-$(CONFIG_X86_PM_TIMER)     += pmtimer_64.o
+obj-$(CONFIG_X86_VSMP)         += vsmp_64.o
+obj-$(CONFIG_K8_NB)            += k8.o
+obj-$(CONFIG_AUDIT)            += audit_64.o
+
+obj-$(CONFIG_MODULES)          += module_64.o
+obj-$(CONFIG_PCI)              += early-quirks_64.o
+
+obj-y                          += topology.o
+obj-y                          += intel_cacheinfo.o
+obj-y                          += addon_cpuid_features.o
+obj-y                          += pcspeaker.o
+
+CFLAGS_vsyscall_64.o           := $(PROFILING) -g0
+
+therm_throt-y                   += ../../x86/kernel/cpu/mcheck/therm_throt.o
+intel_cacheinfo-y              += ../../x86/kernel/cpu/intel_cacheinfo.o
+addon_cpuid_features-y         += ../../x86/kernel/cpu/addon_cpuid_features.o
+perfctr-watchdog-y             += ../../x86/kernel/cpu/perfctr-watchdog.o
index 5e3b3f5496c5ff8c9fc6244ca6370cf79ef6e9f3..8b4357e1efe0b97dc37b24e5d4cc66321ecfc0cb 100644 (file)
@@ -269,7 +269,7 @@ no_longmode:
        movb    $0xbc,%al       ;  outb %al,$0x80
        jmp no_longmode
 
-#include "../../../x86_64/kernel/verify_cpu_64.S"
+#include "../verify_cpu_64.S"
        
 /* This code uses an extended set of video mode numbers. These include:
  * Aliases for standard modes
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
new file mode 100644 (file)
index 0000000..8f681ca
--- /dev/null
@@ -0,0 +1,298 @@
+/*
+ * Firmware replacement code.
+ *
+ * Work around broken BIOSes that don't set an aperture or only set the
+ * aperture in the AGP bridge.
+ * If all else fails, map the aperture over some low memory.  This is cheaper
+ * than doing bounce buffering. The memory is lost. This is done at early boot
+ * because only the bootmem allocator can allocate 32+MB.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/ioport.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/k8.h>
+
+int iommu_aperture;
+int iommu_aperture_disabled __initdata = 0;
+int iommu_aperture_allowed __initdata = 0;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata = 0; 
+
+int fix_aperture __initdata = 1;
+
+static struct resource gart_resource = {
+       .name   = "GART",
+       .flags  = IORESOURCE_MEM,
+};
+
+/*
+ * Record the aperture [aper_base, aper_base + aper_size - 1] in the static
+ * "GART" resource and insert that resource below iomem_resource.
+ */
+static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
+{
+       /* Inclusive end address, computed in 32 bits like the arguments. */
+       u32 aper_last = aper_base + aper_size - 1;
+
+       gart_resource.start = aper_base;
+       gart_resource.end = aper_last;
+       insert_resource(&iomem_resource, &gart_resource);
+}
+
+/* This code runs before the PCI subsystem is initialized, so just
+   access the northbridge directly. */
+
+/*
+ * Carve the fallback aperture out of RAM with the bootmem allocator.
+ *
+ * The size is 32MB << fallback_aper_order, and the block is aligned to its
+ * own size.  Returns the physical base address of the aperture (which is
+ * also registered as the "GART" resource), or 0 if no block ending below
+ * 4GB could be allocated.
+ */
+static u32 __init allocate_aperture(void)
+{
+       u32 aper_size;
+       void *p;
+
+       /*
+        * aper_size is 32 bits wide: order 6 (2GB) is the largest size it
+        * can hold, order 7 (4GB) would wrap to 0.  A negative order would
+        * make the shift below undefined.
+        */
+       if (fallback_aper_order > 6)
+               fallback_aper_order = 6;
+       if (fallback_aper_order < 0)
+               fallback_aper_order = 0;
+       /* Unsigned arithmetic: 32MB << 6 does not fit in a signed int. */
+       aper_size = (32U * 1024 * 1024) << fallback_aper_order;
+
+       /*
+        * Aperture has to be naturally aligned. This means a 2GB aperture won't
+        * have much chance of finding a place in the lower 4GB of memory.
+        * Unfortunately we cannot move it up because that would make the
+        * IOMMU useless.
+        */
+       p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
+       if (!p || __pa(p)+aper_size > 0xffffffff) {
+               printk("Cannot allocate aperture memory hole (%p,%uK)\n",
+                      p, aper_size>>10);
+               /* Allocation succeeded but landed too high: give it back. */
+               if (p)
+                       free_bootmem(__pa(p), aper_size);
+               return 0;
+       }
+       printk("Mapping aperture over %d KB of RAM @ %lx\n",
+              aper_size >> 10, __pa(p));
+       insert_aperture_resource((u32)__pa(p), aper_size);
+       return (u32)__pa(p);
+}
+
+/*
+ * Sanity-check an aperture described by its base address and size.
+ *
+ * Returns 1 when the base is non-zero, the size is at least 64MB, the end
+ * does not exceed 4GB and e820_any_mapped() finds no E820_RAM in the range.
+ * Returns 0 otherwise; every rejection except a zero base is logged.
+ */
+static int __init aperture_valid(u64 aper_base, u32 aper_size)
+{
+       /* Exclusive end of the aperture, in 64 bits so it cannot wrap. */
+       u64 aper_end = aper_base + aper_size;
+
+       if (aper_base == 0)
+               return 0;
+
+       if (aper_size < 64*1024*1024) {
+               printk("Aperture too small (%d MB)\n", aper_size>>20);
+               return 0;
+       }
+
+       if (aper_end > 0x100000000UL) {
+               printk("Aperture beyond 4GB. Ignoring.\n");
+               return 0;
+       }
+
+       if (e820_any_mapped(aper_base, aper_end, E820_RAM)) {
+               printk("Aperture pointing to e820 RAM. Ignoring.\n");
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Find a PCI capability by walking the device's capability list with
+ * direct config-space reads.
+ *
+ * Returns the config-space offset of capability @cap on device
+ * @num:@slot.@func, or 0 if the device has no capability list or the
+ * capability is not found.
+ */
+static __u32 __init find_cap(int num, int slot, int func, int cap)
+{
+       u8 pos;
+       int hops = 0;
+
+       if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & PCI_STATUS_CAP_LIST))
+               return 0;
+
+       pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+       /* At most 48 entries are followed; a pointer below 0x40 ends the walk. */
+       while (hops < 48 && pos >= 0x40) {
+               u8 id;
+
+               pos &= ~3;
+               id = read_pci_config_byte(num, slot, func, pos + PCI_CAP_LIST_ID);
+               if (id == 0xff)
+                       break;
+               if (id == cap)
+                       return pos;
+               pos = read_pci_config_byte(num, slot, func, pos + PCI_CAP_LIST_NEXT);
+               hops++;
+       }
+       return 0;
+}
+
+/*
+ * Read the aperture from a standard AGPv3 bridge header.
+ *
+ * @cap is the config-space offset of the bridge's AGP capability.  On
+ * return *order holds the aperture size expressed as 32MB << *order.
+ * Returns the low 32 bits of the aperture base, or 0 if APSIZE could not
+ * be read or the aperture is rejected by aperture_valid().
+ */
+static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
+{
+       u32 apsize;
+       u32 apsizereg;
+       int nbits;
+       u32 aper_low, aper_hi;
+       u64 aper;
+
+       printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
+       apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
+       /* A 16-bit config read can be all-ones only as 0xffff. */
+       if (apsizereg == 0xffff) {
+               printk("APSIZE in AGP bridge unreadable\n");
+               return 0;
+       }
+
+       apsize = apsizereg & 0xfff;
+       /* Some BIOS use weird encodings not in the AGPv3 table. */
+       if (apsize & 0xff)
+               apsize |= 0xf00;
+       /* Each set bit in APSIZE halves the aperture size. */
+       nbits = hweight16(apsize);
+       *order = 7 - nbits;
+       if ((int)*order < 0) /* < 32MB */
+               *order = 0;
+
+       aper_low = read_pci_config(num,slot,func, 0x10);
+       aper_hi = read_pci_config(num,slot,func,0x14);
+       /* Drop the low 22 bits of the low dword before combining. */
+       aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+       printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
+              aper, 32 << *order, apsizereg);
+
+       if (!aperture_valid(aper, (32*1024*1024) << *order))
+           return 0;
+       return (u32)aper;
+}
+
+/* Look for an AGP bridge. Windows only expects the aperture in the
+   AGP bridge and some BIOS forget to initialize the Northbridge too.
+   Work around this here. 
+
+   Do a PCI bus scan by hand because we're running before the PCI
+   subsystem. 
+
+   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+   generically. It's probably overkill to always scan all slots because
+   the AGP bridges should always be on their own bus in the HT hierarchy, 
+   but do it here for future safety.
+
+   Returns the result of read_agp() for the first host/other bridge that
+   has an AGP capability (setting *valid_agp to 1), or 0 if none is found. */
+static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+{
+       int num, slot, func;
+
+       /* Poor man's PCI discovery */
+       for (num = 0; num < 256; num++) { 
+               for (slot = 0; slot < 32; slot++) { 
+                       for (func = 0; func < 8; func++) { 
+                               u32 class, cap;
+                               u8 type;
+                               class = read_pci_config(num,slot,func,
+                                                       PCI_CLASS_REVISION);
+                               /* All-ones: stop scanning this slot */
+                               if (class == 0xffffffff)
+                                       break; 
+                               
+                               switch (class >> 16) { 
+                               case PCI_CLASS_BRIDGE_HOST:
+                               case PCI_CLASS_BRIDGE_OTHER: /* needed? */
+                                       /* AGP bridge? */
+                                       cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
+                                       /* This break only leaves the switch */
+                                       if (!cap)
+                                               break;
+                                       *valid_agp = 1; 
+                                       return read_agp(num,slot,func,cap,order);
+                               } 
+                               
+                               /* No multi-function device? */
+                               type = read_pci_config_byte(num,slot,func,
+                                                              PCI_HEADER_TYPE);
+                               if (!(type & 0x80))
+                                       break;
+                       } 
+               } 
+       }
+       printk("No AGP bridge found\n"); 
+       return 0;
+}
+
+/*
+ * Check the GART aperture programmed into each K8 northbridge (bus 0,
+ * devices 24-31, function 3) and repair it when it is invalid or differs
+ * between northbridges.
+ *
+ * If every northbridge reports the same valid aperture, that aperture is
+ * only registered as a resource.  Otherwise the aperture is taken from an
+ * AGP bridge or, failing that, allocated from RAM, and the result is
+ * written to every northbridge.  Panics if the RAM allocation fails.
+ */
+void __init iommu_hole_init(void)
+{
+       int fix, num;
+       u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
+       u64 aper_base, last_aper_base = 0;
+       int valid_agp = 0;
+
+       if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
+               return;
+
+       printk(KERN_INFO  "Checking aperture...\n");
+
+       fix = 0;
+       for (num = 24; num < 32; num++) {
+               if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+                       continue;
+
+               iommu_detected = 1;
+               iommu_aperture = 1;
+
+               aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
+               /* Unsigned shift: 32MB << 6 does not fit in a signed int. */
+               aper_size = (32U * 1024 * 1024) << aper_order;
+               aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
+               aper_base <<= 25;
+
+               printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
+                      aper_base, aper_size>>20);
+
+               if (!aperture_valid(aper_base, aper_size)) {
+                       fix = 1;
+                       break;
+               }
+
+               /* Every northbridge must agree with the previous one. */
+               if ((last_aper_order && aper_order != last_aper_order) ||
+                   (last_aper_base && aper_base != last_aper_base)) {
+                       fix = 1;
+                       break;
+               }
+               last_aper_order = aper_order;
+               last_aper_base = aper_base;
+       }
+
+       /* BIOS setup is usable: just publish it as a resource. */
+       if (!fix && !fallback_aper_force) {
+               if (last_aper_base) {
+                       unsigned long n = (32UL * 1024 * 1024) << last_aper_order;
+                       insert_aperture_resource((u32)last_aper_base, n);
+               }
+               return;
+       }
+
+       if (!fallback_aper_force)
+               aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
+
+       if (aper_alloc) {
+               /* Got the aperture from the AGP bridge */
+       } else if (swiotlb && !valid_agp) {
+               /* Do nothing */
+       } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
+                  force_iommu ||
+                  valid_agp ||
+                  fallback_aper_force) {
+               printk("Your BIOS doesn't leave a aperture memory hole\n");
+               printk("Please enable the IOMMU option in the BIOS setup\n");
+               printk("This costs you %d MB of RAM\n",
+                      32 << fallback_aper_order);
+
+               aper_alloc = allocate_aperture();
+               if (!aper_alloc) {
+                       /* Could disable AGP and IOMMU here, but it's probably
+                          not worth it. But the later users cannot deal with
+                          bad apertures and turning on the aperture over memory
+                          causes very strange problems, so it's better to
+                          panic early. */
+                       panic("Not enough memory for aperture");
+               }
+               /* Read the order only now: allocate_aperture() may have
+                  clamped fallback_aper_order, and the value programmed
+                  below must match the size that was really allocated. */
+               aper_order = fallback_aper_order;
+       } else {
+               return;
+       }
+
+       /* Fix up the north bridges */
+       for (num = 24; num < 32; num++) {
+               if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+                       continue;
+
+               /* Don't enable translation yet. That is done later.
+                  Assume this BIOS didn't initialise the GART so
+                  just overwrite all previous bits */
+               write_pci_config(0, num, 3, 0x90, aper_order<<1);
+               write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
+       }
+}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
new file mode 100644 (file)
index 0000000..925758d
--- /dev/null
@@ -0,0 +1,1253 @@
+/*
+ *     Local APIC handling, local APIC timers
+ *
+ *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *     Fixes
+ *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
+ *                                     thanks to Eric Gilmore
+ *                                     and Rolf G. Tews
+ *                                     for testing these extensively.
+ *     Maciej W. Rozycki       :       Various updates and fixes.
+ *     Mikael Pettersson       :       Power Management for UP-APIC.
+ *     Pavel Machek and
+ *     Mikael Pettersson       :       PM converted to driver model.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/ioport.h>
+
+#include <asm/atomic.h>
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/mach_apic.h>
+#include <asm/nmi.h>
+#include <asm/idle.h>
+#include <asm/proto.h>
+#include <asm/timex.h>
+#include <asm/hpet.h>
+#include <asm/apic.h>
+
+int apic_mapped;
+int apic_verbosity;
+int apic_runs_main_timer;
+int apic_calibrate_pmtmr __initdata;
+
+int disable_apic_timer __initdata;
+
+/* Local APIC timer works in C2? */
+int local_apic_timer_c2_ok;
+EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
+
+static struct resource *ioapic_resources;
+static struct resource lapic_resource = {
+       .name = "Local APIC",
+       .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+/*
+ * cpu_mask that denotes the CPUs that need timer interrupts coming in as
+ * IPIs in place of local APIC timers
+ */
+static cpumask_t timer_interrupt_broadcast_ipi_mask;
+
+/* Using APIC to generate smp_local_timer_interrupt? */
+int using_apic_timer __read_mostly = 0;
+
+static void apic_pm_activate(void);
+
+/* Spin, calling cpu_relax(), until APIC_ICR no longer has APIC_ICR_BUSY set. */
+void apic_wait_icr_idle(void)
+{
+       for (;;) {
+               if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY))
+                       return;
+               cpu_relax();
+       }
+}
+
+/*
+ * Bounded wait for the ICR busy bit: polls APIC_ICR with udelay(100)
+ * between reads and gives up after 1001 polls.
+ *
+ * Returns 0 once the busy bit is clear, or the still-set APIC_ICR_BUSY
+ * bit if the wait ran out.
+ */
+unsigned int safe_apic_wait_icr_idle(void)
+{
+       unsigned int busy;
+       int polls = 0;
+
+       for (;;) {
+               busy = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+               if (!busy)
+                       break;
+               udelay(100);
+               if (polls++ >= 1000)
+                       break;
+       }
+
+       return busy;
+}
+
+/* Write APIC_DM_NMI, with no other bits set, to APIC_LVT0. @dummy is not used. */
+void enable_NMI_through_LVT0 (void * dummy)
+{
+       /* unmask and set to NMI */
+       apic_write(APIC_LVT0, APIC_DM_NMI);
+}
+
+/* Return the GET_APIC_MAXLVT() field of the APIC_LVR register. */
+int get_maxlvt(void)
+{
+       unsigned int lvr = apic_read(APIC_LVR);
+
+       return GET_APIC_MAXLVT(lvr);
+}
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ *
+ * Here: log the vector and, unless the APIC is disabled, acknowledge it.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+       printk("unexpected IRQ trap at vector %02x\n", irq);
+
+       /* But don't ack when the APIC is disabled. -AK */
+       if (disable_apic)
+               return;
+
+       /*
+        * Currently unexpected vectors happen only on SMP and APIC.
+        * We _must_ ack these because every local APIC has only N
+        * irq slots per priority level, and a 'hanging, unacked' IRQ
+        * holds up an irq slot - in excessive cases (when multiple
+        * unexpected vectors occur) that might lock up the APIC
+        * completely.
+        */
+       ack_APIC_irq();
+}
+
+/*
+ * Mask every local vector table entry of the local APIC and then reset
+ * the entries to a bare "masked" value.  LVTERR and LVTPC are only touched
+ * when get_maxlvt() reports at least 3 and 4 respectively.
+ */
+void clear_local_APIC(void)
+{
+       int maxlvt;
+       unsigned int v;
+
+       maxlvt = get_maxlvt();
+
+       /*
+        * Masking an LVT entry can trigger a local APIC error
+        * if the vector is zero. Mask LVTERR first to prevent this.
+        */
+       if (maxlvt >= 3) {
+               v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
+               apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+       }
+       /*
+        * Careful: we have to set masks only first to deassert
+        * any level-triggered sources.
+        */
+       v = apic_read(APIC_LVTT);
+       apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
+       v = apic_read(APIC_LVT0);
+       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+       v = apic_read(APIC_LVT1);
+       apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
+       if (maxlvt >= 4) {
+               v = apic_read(APIC_LVTPC);
+               apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
+       }
+
+       /*
+        * Clean APIC state for other OSs:
+        */
+       apic_write(APIC_LVTT, APIC_LVT_MASKED);
+       apic_write(APIC_LVT0, APIC_LVT_MASKED);
+       apic_write(APIC_LVT1, APIC_LVT_MASKED);
+       if (maxlvt >= 3)
+               apic_write(APIC_LVTERR, APIC_LVT_MASKED);
+       if (maxlvt >= 4)
+               apic_write(APIC_LVTPC, APIC_LVT_MASKED);
+       /* Write 0 to the error status register, then read it back */
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
+}
+
+/*
+ * Put the local APIC back into Virtual Wire compatibility mode.
+ *
+ * @virt_wire_setup: when zero, LVT0 is reprogrammed for ExtINT delivery;
+ * when non-zero, LVT0 is simply masked.  LVT1 is always set up for NMI.
+ */
+void disconnect_bsp_APIC(int virt_wire_setup)
+{
+       /* Go back to Virtual Wire compatibility mode */
+       unsigned long value;
+
+       /* For the spurious interrupt use vector F, and enable it */
+       value = apic_read(APIC_SPIV);
+       value &= ~APIC_VECTOR_MASK;
+       value |= APIC_SPIV_APIC_ENABLED;
+       value |= 0xf;
+       apic_write(APIC_SPIV, value);
+
+       if (!virt_wire_setup) {
+               /* For LVT0 make it edge triggered, active high, external and enabled */
+               value = apic_read(APIC_LVT0);
+               value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+               /* NOTE(review): these two bits were just cleared by the mask above - confirm intent */
+               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+               apic_write(APIC_LVT0, value);
+       } else {
+               /* Disable LVT0 */
+               apic_write(APIC_LVT0, APIC_LVT_MASKED);
+       }
+
+       /* For LVT1 make it edge triggered, active high, nmi and enabled */
+       value = apic_read(APIC_LVT1);
+       value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+       /* NOTE(review): same set-after-clear pattern as for LVT0 above */
+       value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+       value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+       apic_write(APIC_LVT1, value);
+}
+
+/*
+ * Mask all local vector table entries via clear_local_APIC(), then clear
+ * APIC_SPIV_APIC_ENABLED in the APIC_SPIV register.
+ */
+void disable_local_APIC(void)
+{
+       unsigned int spiv;
+
+       clear_local_APIC();
+
+       /*
+        * Disable APIC (implies clearing of registers
+        * for 82489DX!).
+        */
+       spiv = apic_read(APIC_SPIV) & ~APIC_SPIV_APIC_ENABLED;
+       apic_write(APIC_SPIV, spiv);
+}
+
+/*
+ * This is to verify that we're looking at a real local APIC.
+ * Check these against your board if the CPUs aren't getting
+ * started for no apparent reason.
+ *
+ * Returns 1 if all checks pass, 0 as soon as one of them fails.
+ */
+int __init verify_local_APIC(void)
+{
+       unsigned int reg0, reg1;
+
+       /*
+        * The version register is read-only in a real APIC.
+        */
+       reg0 = apic_read(APIC_LVR);
+       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
+       /* Try to flip the masked bits; a real APIC ignores the write */
+       apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
+       reg1 = apic_read(APIC_LVR);
+       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
+
+       /*
+        * The two version reads above should print the same
+        * numbers.  If the second one is different, then we
+        * poke at a non-APIC.
+        */
+       if (reg1 != reg0)
+               return 0;
+
+       /*
+        * Check if the version looks reasonable.
+        */
+       reg1 = GET_APIC_VERSION(reg0);
+       if (reg1 == 0x00 || reg1 == 0xff)
+               return 0;
+       reg1 = get_maxlvt();
+       if (reg1 < 0x02 || reg1 == 0xff)
+               return 0;
+
+       /*
+        * The ID register is read/write in a real APIC.
+        */
+       reg0 = apic_read(APIC_ID);
+       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+       apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
+       reg1 = apic_read(APIC_ID);
+       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
+       /* Restore the original ID before judging the readback */
+       apic_write(APIC_ID, reg0);
+       if (reg1 != (reg0 ^ APIC_ID_MASK))
+               return 0;
+
+       /*
+        * The next two are just to see if we have sane values.
+        * They're only really relevant if we're in Virtual Wire
+        * compatibility mode, but most boxes are these days.
+        */
+       reg0 = apic_read(APIC_LVT0);
+       apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
+       reg1 = apic_read(APIC_LVT1);
+       apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
+
+       return 1;
+}
+
+/*
+ * Write a level-triggered INIT with the all-including-self destination
+ * shorthand to APIC_ICR.  Does nothing when the APIC version is 0x14 or
+ * higher.
+ */
+void __init sync_Arb_IDs(void)
+{
+       /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
+       if (GET_APIC_VERSION(apic_read(APIC_LVR)) >= 0x14)      /* P4 or higher */
+               return;
+
+       /* Wait for idle. */
+       apic_wait_icr_idle();
+
+       apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
+       apic_write(APIC_ICR,
+                  APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
+}
+
+/*
+ * An initial setup of the virtual wire mode.
+ *
+ * Returns early without touching the APIC when an SMP configuration was
+ * found or the CPU has no local APIC.  Otherwise it clears the local APIC,
+ * enables it through SPIV with the spurious vector installed, and programs
+ * LVT0 as ExtINT and LVT1 as NMI.
+ */
+void __init init_bsp_APIC(void)
+{
+       unsigned int value;
+
+       /*
+        * Don't do the setup now if we have a SMP BIOS as the
+        * through-I/O-APIC virtual wire mode might be active.
+        */
+       if (smp_found_config || !cpu_has_apic)
+               return;
+
+       /* NOTE(review): this value is overwritten below before being used. */
+       value = apic_read(APIC_LVR);
+
+       /*
+        * Do not trust the local APIC being empty at bootup.
+        */
+       clear_local_APIC();
+
+       /*
+        * Enable APIC.
+        */
+       value = apic_read(APIC_SPIV);
+       value &= ~APIC_VECTOR_MASK;
+       value |= APIC_SPIV_APIC_ENABLED;
+       value |= APIC_SPIV_FOCUS_DISABLED;
+       value |= SPURIOUS_APIC_VECTOR;
+       apic_write(APIC_SPIV, value);
+
+       /*
+        * Set up the virtual wire mode.
+        */
+       apic_write(APIC_LVT0, APIC_DM_EXTINT);
+       value = APIC_DM_NMI;
+       apic_write(APIC_LVT1, value);
+}
+
+/*
+ * Program the calling CPU's local APIC: logical destination and task
+ * priority, cleanup of stale in-service bits, SPIV enable with the
+ * spurious vector, LVT0/LVT1 and the error LVT.  Finishes by setting up
+ * the NMI watchdog and marking the APIC as active for power management.
+ *
+ * Calls BUG() when apic_id_registered() reports failure.
+ */
+void __cpuinit setup_local_APIC (void)
+{
+       unsigned int value, maxlvt;
+       int i, j;
+
+       /* NOTE(review): this value is overwritten below before being used. */
+       value = apic_read(APIC_LVR);
+
+       BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
+
+       /*
+        * Double-check whether this APIC is really registered.
+        * This is meaningless in clustered apic mode, so we skip it.
+        */
+       if (!apic_id_registered())
+               BUG();
+
+       /*
+        * Intel recommends to set DFR, LDR and TPR before enabling
+        * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
+        * document number 292116).  So here it goes...
+        */
+       init_apic_ldr();
+
+       /*
+        * Set Task Priority to 'accept all'. We never change this
+        * later on.
+        */
+       value = apic_read(APIC_TASKPRI);
+       value &= ~APIC_TPRI_MASK;
+       apic_write(APIC_TASKPRI, value);
+
+       /*
+        * After a crash, we no longer service the interrupts and a pending
+        * interrupt from previous kernel might still have ISR bit set.
+        *
+        * Most probably by now CPU has serviced that pending interrupt and
+        * it might not have done the ack_APIC_irq() because it thought,
+        * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
+        * does not clear the ISR bit and cpu thinks it has already serviced
+        * the interrupt. Hence a vector might get locked. It was noticed
+        * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
+        */
+       for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+               value = apic_read(APIC_ISR + i*0x10);
+               for (j = 31; j >= 0; j--) {
+                       /* Unsigned shift: bit 31 must not hit the sign bit. */
+                       if (value & (1U<<j))
+                               ack_APIC_irq();
+               }
+       }
+
+       /*
+        * Now that we are all set up, enable the APIC
+        */
+       value = apic_read(APIC_SPIV);
+       value &= ~APIC_VECTOR_MASK;
+       /*
+        * Enable APIC
+        */
+       value |= APIC_SPIV_APIC_ENABLED;
+
+       /* We always use processor focus */
+
+       /*
+        * Set spurious IRQ vector
+        */
+       value |= SPURIOUS_APIC_VECTOR;
+       apic_write(APIC_SPIV, value);
+
+       /*
+        * Set up LVT0, LVT1:
+        *
+        * set up through-local-APIC on the BP's LINT0. This is not
+        * strictly necessary in pure symmetric-IO mode, but sometimes
+        * we delegate interrupts to the 8259A.
+        */
+       /*
+        * TODO: set up through-local-APIC from through-I/O-APIC? --macro
+        */
+       value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
+       if (!smp_processor_id() && !value) {
+               value = APIC_DM_EXTINT;
+               apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
+       } else {
+               value = APIC_DM_EXTINT | APIC_LVT_MASKED;
+               apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
+       }
+       apic_write(APIC_LVT0, value);
+
+       /*
+        * only the BP should see the LINT1 NMI signal, obviously.
+        */
+       if (!smp_processor_id())
+               value = APIC_DM_NMI;
+       else
+               value = APIC_DM_NMI | APIC_LVT_MASKED;
+       apic_write(APIC_LVT1, value);
+
+       {
+               unsigned oldvalue;
+               maxlvt = get_maxlvt();
+               /* ESR as it was before the error vector got enabled. */
+               oldvalue = apic_read(APIC_ESR);
+               value = ERROR_APIC_VECTOR;      // enables sending errors
+               apic_write(APIC_LVTERR, value);
+               /*
+                * spec says clear errors after enabling vector.
+                */
+               if (maxlvt > 3)
+                       apic_write(APIC_ESR, 0);
+               value = apic_read(APIC_ESR);
+               /* Arguments are printed in order: old value, then new one. */
+               if (value != oldvalue)
+                       apic_printk(APIC_VERBOSE,
+                       "ESR value before enabling vector: %08x, after %08x\n",
+                       oldvalue, value);
+       }
+
+       nmi_watchdog_default();
+       setup_apic_nmi_watchdog(NULL);
+       apic_pm_activate();
+}
+
+#ifdef CONFIG_PM
+
+/*
+ * Snapshot of the local APIC registers, filled in by lapic_suspend() and
+ * written back to the hardware by lapic_resume().
+ */
+static struct {
+       /* 'active' is true if the local APIC was enabled by us and
+          not the BIOS; this signifies that we are also responsible
+          for disabling it before entering apm/acpi suspend */
+       int active;
+       /* r/w apic fields */
+       unsigned int apic_id;
+       unsigned int apic_taskpri;
+       unsigned int apic_ldr;
+       unsigned int apic_dfr;
+       unsigned int apic_spiv;
+       unsigned int apic_lvtt;
+       unsigned int apic_lvtpc;
+       unsigned int apic_lvt0;
+       unsigned int apic_lvt1;
+       unsigned int apic_lvterr;
+       unsigned int apic_tmict;
+       unsigned int apic_tdcr;
+       unsigned int apic_thmr;
+} apic_pm_state;
+
+/*
+ * sysdev suspend hook: save the local APIC registers into apic_pm_state
+ * and then disable the local APIC with interrupts off.
+ *
+ * Does nothing unless apic_pm_state.active is set.  Always returns 0;
+ * 'dev' and 'state' are not used.
+ */
+static int lapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+       unsigned long flags;
+       int maxlvt;
+
+       if (!apic_pm_state.active)
+               return 0;
+
+       maxlvt = get_maxlvt();
+
+       apic_pm_state.apic_id = apic_read(APIC_ID);
+       apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+       apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+       apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+       apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+       apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+       /* LVTPC is only saved when get_maxlvt() reports at least 4. */
+       if (maxlvt >= 4)
+               apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+       apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+       apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+       apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+       apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+       apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+#ifdef CONFIG_X86_MCE_INTEL
+       /* The thermal LVT is only saved when get_maxlvt() reports at least 5. */
+       if (maxlvt >= 5)
+               apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
+       local_irq_save(flags);
+       disable_local_APIC();
+       local_irq_restore(flags);
+       return 0;
+}
+
+/*
+ * sysdev resume hook: re-enable the local APIC through MSR_IA32_APICBASE
+ * at mp_lapic_addr and write the register values held in apic_pm_state
+ * back to the hardware, all with interrupts off.
+ *
+ * Does nothing unless apic_pm_state.active is set.  Always returns 0;
+ * 'dev' is not used.
+ */
+static int lapic_resume(struct sys_device *dev)
+{
+       unsigned int l, h;
+       unsigned long flags;
+       int maxlvt;
+
+       if (!apic_pm_state.active)
+               return 0;
+
+       maxlvt = get_maxlvt();
+
+       local_irq_save(flags);
+       /* Replace the base address bits and set the enable bit in the MSR. */
+       rdmsr(MSR_IA32_APICBASE, l, h);
+       l &= ~MSR_IA32_APICBASE_BASE;
+       l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+       wrmsr(MSR_IA32_APICBASE, l, h);
+       /* Keep the error LVT masked while the other registers are restored. */
+       apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+       apic_write(APIC_ID, apic_pm_state.apic_id);
+       apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+       apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+       apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+       apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+       apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+       apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+#ifdef CONFIG_X86_MCE_INTEL
+       if (maxlvt >= 5)
+               apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+       if (maxlvt >= 4)
+               apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+       apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+       apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+       apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+       /* Write and read back ESR before and after restoring the error LVT. */
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
+       apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
+       local_irq_restore(flags);
+       return 0;
+}
+
+/* sysdev class "lapic": routes suspend/resume to the lapic_* hooks above. */
+static struct sysdev_class lapic_sysclass = {
+       set_kset_name("lapic"),
+       .resume         = lapic_resume,
+       .suspend        = lapic_suspend,
+};
+
+/* The single sysdev instance (id 0) registered under lapic_sysclass. */
+static struct sys_device device_lapic = {
+       .id             = 0,
+       .cls            = &lapic_sysclass,
+};
+
+/*
+ * Mark the local APIC as active so that lapic_suspend()/lapic_resume()
+ * actually save and restore its state; called from setup_local_APIC().
+ */
+static void __cpuinit apic_pm_activate(void)
+{
+       apic_pm_state.active = 1;
+}
+
+/*
+ * Register the "lapic" sysdev class and its one device at device_initcall
+ * time.  Returns 0 when the CPU has no local APIC or on success, otherwise
+ * the error from whichever registration failed first.
+ */
+static int __init init_lapic_sysfs(void)
+{
+       int rc;
+
+       if (!cpu_has_apic)
+               return 0;
+
+       /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+       rc = sysdev_class_register(&lapic_sysclass);
+       if (rc)
+               return rc;
+
+       /* The device is only registered once its class is in place. */
+       return sysdev_register(&device_lapic);
+}
+device_initcall(init_lapic_sysfs);
+
+#else  /* CONFIG_PM */
+
+/* Without CONFIG_PM there is no suspend/resume state to activate. */
+static void apic_pm_activate(void) { }
+
+#endif /* CONFIG_PM */
+
+/*
+ * Handler for the "apic" early parameter.
+ *
+ * With no value it clears skip_ioapic_setup and sets ioapic_force.
+ * "debug" and "verbose" select the matching apic_verbosity level.
+ * Any other value is reported and rejected with -EINVAL.
+ */
+static int __init apic_set_verbosity(char *str)
+{
+       if (!str) {
+               skip_ioapic_setup = 0;
+               ioapic_force = 1;
+               return 0;
+       }
+
+       if (!strcmp("debug", str)) {
+               apic_verbosity = APIC_DEBUG;
+               return 0;
+       }
+
+       if (!strcmp("verbose", str)) {
+               apic_verbosity = APIC_VERBOSE;
+               return 0;
+       }
+
+       printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+                       " use apic=verbose or apic=debug\n", str);
+       return -EINVAL;
+}
+early_param("apic", apic_set_verbosity);
+
+/*
+ * Detect and enable local APICs on non-SMP boards.
+ * Original code written by Keir Fraser.
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely
+ * not correctly set up (usually the APIC timer won't work etc.)
+ */
+
+/*
+ * Returns 0 after pointing mp_lapic_addr at the default physical base and
+ * resetting boot_cpu_id when the CPU advertises a local APIC, -1 (with a
+ * log message) when it does not.
+ */
+static int __init detect_init_APIC (void)
+{
+       if (cpu_has_apic) {
+               mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+               boot_cpu_id = 0;
+               return 0;
+       }
+
+       printk(KERN_INFO "No local APIC present\n");
+       return -1;
+}
+
+#ifdef CONFIG_X86_IO_APIC
+/*
+ * Allocate one bootmem chunk holding nr_ioapics struct resource entries
+ * followed by their name strings, fill in name and flags for each entry,
+ * and publish the array through ioapic_resources.
+ *
+ * Returns the array.  Returns NULL when nr_ioapics <= 0 (ioapic_resources
+ * is left untouched in that case) or when the allocation yields NULL.
+ */
+static struct resource * __init ioapic_setup_resources(void)
+{
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+       unsigned long n;
+       struct resource *res;
+       char *mem;
+       int i;
+
+       if (nr_ioapics <= 0)
+               return NULL;
+
+       n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+       n *= nr_ioapics;
+
+       mem = alloc_bootmem(n);
+       res = (void *)mem;
+
+       if (mem != NULL) {
+               memset(mem, 0, n);
+               /* The name strings live right behind the resource array. */
+               mem += sizeof(struct resource) * nr_ioapics;
+
+               for (i = 0; i < nr_ioapics; i++) {
+                       res[i].name = mem;
+                       res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+                       /*
+                        * Bound the write to this entry's slot so that a
+                        * large index can never spill into the next name.
+                        */
+                       snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE,
+                                "IOAPIC %u", i);
+                       mem += IOAPIC_RESOURCE_NAME_SIZE;
+               }
+       }
+
+       ioapic_resources = res;
+
+       return res;
+}
+
+/*
+ * Insert every entry of the ioapic_resources array into iomem_resource.
+ *
+ * Returns 0 on success and also when there are no IO-APICs at all;
+ * returns -1 when IO-APICs exist but no resource array is available.
+ */
+static int __init ioapic_insert_resources(void)
+{
+       int i;
+       struct resource *r = ioapic_resources;
+
+       if (!r) {
+               /*
+                * ioapic_setup_resources() returns before assigning the
+                * array when nr_ioapics <= 0, so a missing array is only
+                * an error if there are IO-APICs to describe.
+                */
+               if (nr_ioapics <= 0)
+                       return 0;
+               printk(KERN_ERR
+                       "IO APIC resources could not be allocated.\n");
+               return -1;
+       }
+
+       for (i = 0; i < nr_ioapics; i++) {
+               insert_resource(&iomem_resource, r);
+               r++;
+       }
+
+       return 0;
+}
+
+/* Insert the IO APIC resources after PCI initialization has occurred to handle
+ * IO APICs that are mapped in on a BAR in PCI space. */
+late_initcall(ioapic_insert_resources);
+#endif
+
+/*
+ * Map the local APIC and every IO-APIC into their fixmap slots, record
+ * their physical address ranges in the resource structures, and read the
+ * boot CPU's APIC ID into boot_cpu_id.
+ */
+void __init init_apic_mappings(void)
+{
+       unsigned long apic_phys;
+
+       /*
+        * If no local APIC can be found then set up a fake all
+        * zeroes page to simulate the local APIC and another
+        * one for the IO-APIC.
+        */
+       if (!smp_found_config && detect_init_APIC()) {
+               apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
+               apic_phys = __pa(apic_phys);
+       } else
+               apic_phys = mp_lapic_addr;
+
+       set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+       apic_mapped = 1;
+       apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
+
+       /* Put local APIC into the resource map. */
+       lapic_resource.start = apic_phys;
+       lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+       insert_resource(&iomem_resource, &lapic_resource);
+
+       /*
+        * Fetch the APIC ID of the BSP in case we have a
+        * default configuration (or the MP table is broken).
+        */
+       boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
+
+       {
+               unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+               int i;
+               struct resource *ioapic_res;
+
+               ioapic_res = ioapic_setup_resources();
+               for (i = 0; i < nr_ioapics; i++) {
+                       /*
+                        * Use the address from the MP table when one was
+                        * found, otherwise back the slot with a bootmem page.
+                        */
+                       if (smp_found_config) {
+                               ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+                       } else {
+                               ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
+                               ioapic_phys = __pa(ioapic_phys);
+                       }
+                       set_fixmap_nocache(idx, ioapic_phys);
+                       apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
+                                       __fix_to_virt(idx), ioapic_phys);
+                       idx++;
+
+                       /*
+                        * Only start/end are filled in here; the resources
+                        * are inserted later by ioapic_insert_resources().
+                        */
+                       if (ioapic_res != NULL) {
+                               ioapic_res->start = ioapic_phys;
+                               ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+                               ioapic_res++;
+                       }
+               }
+       }
+}
+
+/*
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
+ */
+
+#define APIC_DIVISOR 16
+
+/*
+ * Program LVTT for periodic mode with LOCAL_TIMER_VECTOR (masked when this
+ * CPU is in timer_interrupt_broadcast_ipi_mask), select divide-by-16 in
+ * TDCR and load the initial count with clocks / APIC_DIVISOR.
+ */
+static void __setup_APIC_LVTT(unsigned int clocks)
+{
+       unsigned int lvtt_value, tmp_value;
+       int cpu = smp_processor_id();
+
+       lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
+
+       if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
+               lvtt_value |= APIC_LVT_MASKED;
+
+       apic_write(APIC_LVTT, lvtt_value);
+
+       /*
+        * Divide PICLK by 16
+        */
+       tmp_value = apic_read(APIC_TDCR);
+       apic_write(APIC_TDCR, (tmp_value
+                               & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+                               | APIC_TDR_DIV_16);
+
+       apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
+}
+
+/*
+ * Start the local APIC timer with 'clocks' on the calling CPU.  Runs with
+ * interrupts disabled and first waits on the HPET comparator or the PIT
+ * counter before programming the timer.
+ */
+static void setup_APIC_timer(unsigned int clocks)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       /* wait for irq slice */
+       if (hpet_address && hpet_use_timer) {
+               /* Spin until HPET_T0_CMP differs from the value first read. */
+               u32 trigger = hpet_readl(HPET_T0_CMP);
+               while (hpet_readl(HPET_T0_CMP) == trigger)
+                       /* do nothing */ ;
+       } else {
+               /*
+                * Write 0x00 to port 0x43 (presumably the PIT latch command)
+                * and read a 16-bit count from port 0x40, low byte first.
+                * Repeat until a reading is at least 300 above the previous one.
+                */
+               int c1, c2;
+               outb_p(0x00, 0x43);
+               c2 = inb_p(0x40);
+               c2 |= inb_p(0x40) << 8;
+               do {
+                       c1 = c2;
+                       outb_p(0x00, 0x43);
+                       c2 = inb_p(0x40);
+                       c2 |= inb_p(0x40) << 8;
+               } while (c2 - c1 < 300);
+       }
+       __setup_APIC_LVTT(clocks);
+       /* Turn off PIT interrupt if we use APIC timer as main timer.
+          Only works with the PM timer right now
+          TBD fix it for HPET too. */
+       if ((pmtmr_ioport != 0) &&
+               smp_processor_id() == boot_cpu_id &&
+               apic_runs_main_timer == 1 &&
+               !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) {
+               stop_timer_interrupt();
+               /* Values above 1 make smp_local_timer_interrupt() run the main timer. */
+               apic_runs_main_timer++;
+       }
+       local_irq_restore(flags);
+}
+
+/*
+ * In this function we calibrate APIC bus clocks to the external
+ * timer. Unfortunately we cannot use jiffies and the timer irq
+ * to calibrate, since some later bootup code depends on getting
+ * the first irq? Ugh.
+ *
+ * We want to do the calibration only once since we
+ * want to have local timer irqs synchronous. CPUs connected
+ * by the same APIC bus have the very same bus frequency.
+ * And we want to have irqs off anyways, no accidental
+ * APIC irq that way.
+ */
+
+#define TICK_COUNT 100000000
+
+/*
+ * Measure how fast the local APIC timer counts, using either the ACPI PM
+ * timer (5 ms window) or the TSC as the reference.
+ *
+ * Returns the measured counts per second scaled by APIC_DIVISOR / HZ,
+ * which is the value later handed to setup_APIC_timer().
+ */
+static int __init calibrate_APIC_clock(void)
+{
+       unsigned apic, apic_start;
+       unsigned long tsc, tsc_start;
+       int result;
+       /*
+        * Put whatever arbitrary (but long enough) timeout
+        * value into the APIC clock, we just want to get the
+        * counter running for calibration.
+        */
+       __setup_APIC_LVTT(4000000000);
+
+       apic_start = apic_read(APIC_TMCCT);
+#ifdef CONFIG_X86_PM_TIMER
+       if (apic_calibrate_pmtmr && pmtmr_ioport) {
+               pmtimer_wait(5000);  /* 5ms wait */
+               apic = apic_read(APIC_TMCCT);
+               result = (apic_start - apic) * 1000L / 5;
+       } else
+#endif
+       {
+               rdtscll(tsc_start);
+
+               /* Stop once either counter has advanced by TICK_COUNT. */
+               do {
+                       apic = apic_read(APIC_TMCCT);
+                       rdtscll(tsc);
+               } while ((tsc - tsc_start) < TICK_COUNT &&
+                               (apic_start - apic) < TICK_COUNT);
+
+               result = (apic_start - apic) * 1000L * tsc_khz /
+                                       (tsc - tsc_start);
+       }
+       printk("result %d\n", result);
+
+
+       printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
+               result / 1000 / 1000, result / 1000 % 1000);
+
+       /*
+        * Multiply in long: result * APIC_DIVISOR can exceed INT_MAX for
+        * a fast timer even though the quotient fits in an int again.
+        */
+       return (long)result * APIC_DIVISOR / HZ;
+}
+
+static unsigned int calibration_result;
+
+/*
+ * Calibrate the local APIC timer and start it with the calibrated value,
+ * storing that value in calibration_result for the other CPUs.  Skipped
+ * entirely when disable_apic_timer is set.
+ *
+ * NOTE(review): interrupts are unconditionally enabled on return.
+ */
+void __init setup_boot_APIC_clock (void)
+{
+       if (disable_apic_timer) {
+               printk(KERN_INFO "Disabling APIC timer\n");
+               return;
+       }
+
+       printk(KERN_INFO "Using local APIC timer interrupts.\n");
+       using_apic_timer = 1;
+
+       local_irq_disable();
+
+       calibration_result = calibrate_APIC_clock();
+       /*
+        * Now set up the timer for real.
+        */
+       setup_APIC_timer(calibration_result);
+
+       local_irq_enable();
+}
+
+/*
+ * Start the local APIC timer on the calling CPU with the value stored in
+ * calibration_result by setup_boot_APIC_clock().
+ */
+void __cpuinit setup_secondary_APIC_clock(void)
+{
+       local_irq_disable(); /* FIXME: Do we need this? --RR */
+       setup_APIC_timer(calibration_result);
+       local_irq_enable();
+}
+
+/*
+ * Mask the local APIC timer LVT entry on the calling CPU.  No-op unless
+ * the APIC timer is in use.
+ */
+void disable_APIC_timer(void)
+{
+       unsigned long lvtt;
+
+       if (!using_apic_timer)
+               return;
+
+       lvtt = apic_read(APIC_LVTT);
+       /*
+        * Writing an illegal vector (0-15) to an LVT entry in Fixed
+        * delivery mode may raise an illegal vector error regardless of
+        * the mask bit or of any interrupt actually arriving.
+        *
+        * The boot sequence may get here while LVTT still holds vector
+        * '0', so always install a valid vector together with the mask.
+        */
+       lvtt |= APIC_LVT_MASKED;
+       lvtt |= LOCAL_TIMER_VECTOR;
+       apic_write(APIC_LVTT, lvtt);
+}
+
+/*
+ * Unmask the local APIC timer LVT entry on the calling CPU, unless the
+ * APIC timer is unused or this CPU gets its ticks by broadcast IPI.
+ */
+void enable_APIC_timer(void)
+{
+       int this_cpu = smp_processor_id();
+       unsigned long lvtt;
+
+       if (!using_apic_timer)
+               return;
+       if (cpu_isset(this_cpu, timer_interrupt_broadcast_ipi_mask))
+               return;
+
+       lvtt = apic_read(APIC_LVTT);
+       apic_write(APIC_LVTT, lvtt & ~APIC_LVT_MASKED);
+}
+
+/*
+ * Move the calling CPU from its own APIC timer to broadcast timer IPIs.
+ * 'cpumask' points to a cpumask_t naming the CPUs that should switch;
+ * CPUs outside it, or already switched, are left alone.
+ */
+void switch_APIC_timer_to_ipi(void *cpumask)
+{
+       cpumask_t requested = *(cpumask_t *)cpumask;
+       int this_cpu = smp_processor_id();
+
+       /* Not named in the caller's mask: nothing to do here. */
+       if (!cpu_isset(this_cpu, requested))
+               return;
+       /* Already in the broadcast set. */
+       if (cpu_isset(this_cpu, timer_interrupt_broadcast_ipi_mask))
+               return;
+
+       disable_APIC_timer();
+       cpu_set(this_cpu, timer_interrupt_broadcast_ipi_mask);
+}
+EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
+
+/*
+ * Deliver a timer tick to every online CPU in
+ * timer_interrupt_broadcast_ipi_mask: the calling CPU handles its own
+ * tick directly, all others receive a LOCAL_TIMER_VECTOR IPI.
+ */
+void smp_send_timer_broadcast_ipi(void)
+{
+       int cpu = smp_processor_id();
+       cpumask_t mask;
+
+       cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask);
+
+       /* Run the local tick in place instead of sending an IPI to ourselves. */
+       if (cpu_isset(cpu, mask)) {
+               cpu_clear(cpu, mask);
+               add_pda(apic_timer_irqs, 1);
+               smp_local_timer_interrupt();
+       }
+
+       if (!cpus_empty(mask)) {
+               send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+       }
+}
+
+/*
+ * Move the calling CPU from broadcast timer IPIs back to its own APIC
+ * timer.  'cpumask' points to a cpumask_t naming the CPUs that should
+ * switch; CPUs outside it, or not in the broadcast set, are left alone.
+ */
+void switch_ipi_to_APIC_timer(void *cpumask)
+{
+       cpumask_t requested = *(cpumask_t *)cpumask;
+       int this_cpu = smp_processor_id();
+
+       if (!cpu_isset(this_cpu, requested))
+               return;
+       if (!cpu_isset(this_cpu, timer_interrupt_broadcast_ipi_mask))
+               return;
+
+       /* Leave the broadcast set first: enable_APIC_timer() checks it. */
+       cpu_clear(this_cpu, timer_interrupt_broadcast_ipi_mask);
+       enable_APIC_timer();
+}
+EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
+
+/* Changing the profiling multiplier is not supported: always -EINVAL. */
+int setup_profiling_timer(unsigned int multiplier)
+{
+       return -EINVAL;
+}
+
+/*
+ * Write one extended LVT entry.  The register is located 'lvt_off'
+ * 16-byte steps above K8_APIC_EXT_LVT_BASE; the entry packs 'mask' at
+ * bit 16, 'msg_type' at bit 8 and 'vector' in the low byte.
+ */
+void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
+                            unsigned char msg_type, unsigned char mask)
+{
+       unsigned long offset;
+       unsigned int entry;
+
+       offset = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
+       entry = vector | (msg_type << 8) | (mask << 16);
+
+       apic_write(offset, entry);
+}
+
+#undef APIC_DIVISOR
+
+/*
+ * Local timer interrupt handler. It does both profiling and
+ * process statistics/rescheduling.
+ *
+ * We do profiling in every local tick, statistics/rescheduling
+ * happen only every 'profiling multiplier' ticks. The default
+ * multiplier is 1 and it can be changed by writing the new multiplier
+ * value into /proc/profile.
+ */
+
+void smp_local_timer_interrupt(void)
+{
+       profile_tick(CPU_PROFILING);
+#ifdef CONFIG_SMP
+       update_process_times(user_mode(get_irq_regs()));
+#endif
+       /*
+        * setup_APIC_timer() raises apic_runs_main_timer above 1 once it
+        * has stopped the PIT interrupt on the boot CPU; from then on the
+        * boot CPU also drives the main timer from here.
+        */
+       if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id)
+               main_timer_handler();
+       /*
+        * We take the 'long' return path, and there every subsystem
+        * grabs the appropriate locks (kernel lock/ irq lock).
+        *
+        * We might want to decouple profiling from the 'long path',
+        * and do the profiling totally in assembly.
+        *
+        * Currently this isn't too much of an issue (performance wise),
+        * we can take more than 100K local irqs per second on a 100 MHz P5.
+        */
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ *   interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+void smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+       /* Publish 'regs' for get_irq_regs() users; restored before returning. */
+       struct pt_regs *old_regs = set_irq_regs(regs);
+
+       /*
+        * the NMI deadlock-detector uses this.
+        */
+       add_pda(apic_timer_irqs, 1);
+
+       /*
+        * NOTE! We'd better ACK the irq immediately,
+        * because timer handling can be slow.
+        */
+       ack_APIC_irq();
+       /*
+        * update_process_times() expects us to have done irq_enter().
+        * Besides, if we don't timer interrupts ignore the global
+        * interrupt lock, which is the WrongThing (tm) to do.
+        */
+       exit_idle();
+       irq_enter();
+       smp_local_timer_interrupt();
+       irq_exit();
+       set_irq_regs(old_regs);
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ *
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis. Use available data to take a good guess.
+ * If in doubt, go HPET.
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+       DECLARE_BITMAP(seen, NUM_APIC_CLUSTERS);
+       int cpu, bit;
+       int span = 0;   /* clusters counted so far, enclosed gaps included */
+       int gap = 0;    /* empty clusters since the last populated one */
+
+       bitmap_zero(seen, NUM_APIC_CLUSTERS);
+
+       /* Mark the cluster of every CPU that has a valid BIOS APIC ID. */
+       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+               unsigned apicid = bios_cpu_apicid[cpu];
+
+               if (apicid == BAD_APICID)
+                       continue;
+               __set_bit(APIC_CLUSTERID(apicid), seen);
+       }
+
+       /*
+        * A partially populated chassis may own APIC clusters that hold no
+        * CPU at all, and only present CPUs have a bios_cpu_apicid entry,
+        * so such clusters show up as holes in the bitmap.  Clusters are
+        * handed out sequentially, hence a hole is counted only once a
+        * populated cluster follows it; trailing holes are ignored.
+        */
+       for (bit = 0; bit < NUM_APIC_CLUSTERS; bit++) {
+               if (!test_bit(bit, seen)) {
+                       gap++;
+                       continue;
+               }
+               span += gap + 1;
+               gap = 0;
+       }
+
+       /*
+        * More than two clusters is taken to mean multi-chassis.  This may
+        * need revisiting for multi-core + hyperthreaded CPUs, but AFAIK
+        * it holds for them as well.
+        */
+       return span > 2;
+}
+
+/*
+ * This interrupt should _never_ happen with our APIC/SMP architecture
+ */
+asmlinkage void smp_spurious_interrupt(void)
+{
+       unsigned int v;
+       exit_idle();
+       irq_enter();
+       /*
+        * Check if this really is a spurious interrupt and ACK it
+        * if it is a vectored one.  Just in case...
+        * Spurious interrupts should not be ACKed.
+        */
+       /*
+        * (vector & ~0x1f) >> 1 equals (vector / 32) * 0x10: the offset of
+        * the 32-bit ISR word that holds this vector's bit; the bit index
+        * within that word is vector & 0x1f.
+        */
+       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
+       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
+               ack_APIC_irq();
+
+       irq_exit();
+}
+
+/*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+
+/*
+ * APIC error interrupt handler: reads ESR, writes 0 to it and reads it
+ * again, acknowledges the interrupt, bumps irq_err_count and logs both
+ * ESR readings.
+ */
+asmlinkage void smp_error_interrupt(void)
+{
+       unsigned int v, v1;
+
+       exit_idle();
+       irq_enter();
+       /* First tickle the hardware, only then report what went on. -- REW */
+       v = apic_read(APIC_ESR);
+       apic_write(APIC_ESR, 0);
+       v1 = apic_read(APIC_ESR);
+       ack_APIC_irq();
+       atomic_inc(&irq_err_count);
+
+       /* Here is what the APIC error bits mean:
+          0: Send CS error
+          1: Receive CS error
+          2: Send accept error
+          3: Receive accept error
+          4: Reserved
+          5: Send illegal vector
+          6: Received illegal vector
+          7: Illegal register address
+       */
+       /* v is the ESR read before the write of 0, v1 the one read after. */
+       printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
+               smp_processor_id(), v , v1);
+       irq_exit();
+}
+
+int disable_apic;
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ *
+ * Returns 0 on success, -1 when the APIC has been disabled or the CPU
+ * reports no local APIC (disable_apic is then set as well).
+ */
+int __init APIC_init_uniprocessor (void)
+{
+       if (disable_apic) {
+               printk(KERN_INFO "Apic disabled\n");
+               return -1;
+       }
+       if (!cpu_has_apic) {
+               disable_apic = 1;
+               printk(KERN_INFO "Apic disabled by BIOS\n");
+               return -1;
+       }
+
+       /* NOTE(review): the result of verify_local_APIC() is not checked. */
+       verify_local_APIC();
+
+       phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
+       apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
+
+       setup_local_APIC();
+
+       /* Without a usable IO-APIC configuration, pretend there is none. */
+       if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+               setup_IO_APIC();
+       else
+               nr_ioapics = 0;
+       setup_boot_APIC_clock();
+       check_nmi_watchdog();
+       return 0;
+}
+
+/*
+ * "disableapic" early parameter: set disable_apic and clear the APIC
+ * feature bit in boot_cpu_data.  'str' is ignored; always returns 0.
+ */
+static __init int setup_disableapic(char *str)
+{
+       disable_apic = 1;
+       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+       return 0;
+}
+early_param("disableapic", setup_disableapic);
+
+/*
+ * "nolapic" early parameter: same as disableapic, for compatibility.
+ * Simply forwards to setup_disableapic().
+ */
+static __init int setup_nolapic(char *str)
+{
+       return setup_disableapic(str);
+}
+early_param("nolapic", setup_nolapic);
+
+/*
+ * "lapic_timer_c2_ok" early parameter: set local_apic_timer_c2_ok.
+ * 'arg' is ignored; always returns 0.
+ */
+static int __init parse_lapic_timer_c2_ok(char *arg)
+{
+       local_apic_timer_c2_ok = 1;
+       return 0;
+}
+early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
+
+/*
+ * "noapictimer" boot option handler.  Only accepts the option when the
+ * text following it is empty or starts with a space; anything else is
+ * declined with 0.  On acceptance sets disable_apic_timer and returns 1.
+ */
+static __init int setup_noapictimer(char *str)
+{
+       if (str[0] == ' ' || str[0] == 0) {
+               disable_apic_timer = 1;
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * "apicmaintimer" boot option: request the APIC timer as main timer
+ * (apic_runs_main_timer = 1) and set nohpet.  'str' is ignored; always
+ * returns 1.
+ */
+static __init int setup_apicmaintimer(char *str)
+{
+       apic_runs_main_timer = 1;
+       nohpet = 1;
+       return 1;
+}
+__setup("apicmaintimer", setup_apicmaintimer);
+
+/*
+ * "noapicmaintimer" boot option: set apic_runs_main_timer to -1.
+ * 'str' is ignored; always returns 1.
+ */
+static __init int setup_noapicmaintimer(char *str)
+{
+       apic_runs_main_timer = -1;
+       return 1;
+}
+__setup("noapicmaintimer", setup_noapicmaintimer);
+
+/*
+ * "apicpmtimer" boot option: calibrate the APIC timer against the PM
+ * timer (apic_calibrate_pmtmr), call notsc_setup(), and then apply the
+ * same settings as "apicmaintimer".  's' is ignored.
+ */
+static __init int setup_apicpmtimer(char *s)
+{
+       apic_calibrate_pmtmr = 1;
+       notsc_setup(NULL);
+       return setup_apicmaintimer(NULL);
+}
+__setup("apicpmtimer", setup_apicpmtimer);
+
+__setup("noapictimer", setup_noapictimer);
+
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
new file mode 100644 (file)
index 0000000..778953b
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+
+#include <linux/crypto.h>
+#include <linux/sched.h> 
+#include <linux/stddef.h>
+#include <linux/errno.h> 
+#include <linux/hardirq.h>
+#include <linux/suspend.h>
+#include <asm/pda.h>
+#include <asm/processor.h>
+#include <asm/segment.h>
+#include <asm/thread_info.h>
+#include <asm/ia32.h>
+
+/*
+ * DEFINE() emits a "->sym <value> val" marker line into the generated
+ * assembly, with the constant passed as an immediate operand; BLANK()
+ * emits a bare "->" marker.  These are the lines the post-processing
+ * step mentioned at the top of this file picks up.
+ */
+#define DEFINE(sym, val) \
+        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+
+#define BLANK() asm volatile("\n->" : : )
+
+/*
+ * Re-include <asm/unistd.h> with __SYSCALL() redefined to a designated
+ * initializer, so that every __SYSCALL(nr, sym) line sets syscalls[nr].
+ * The array size then is the highest number used plus one, which main()
+ * turns into __NR_syscall_max.
+ * NOTE(review): presumably unistd.h invokes __SYSCALL once per system
+ * call and _ASM_X86_64_UNISTD_H_ is its include guard - confirm there.
+ */
+#define __NO_STUBS 1
+#undef __SYSCALL
+#undef _ASM_X86_64_UNISTD_H_
+#define __SYSCALL(nr, sym) [nr] = 1,
+static char syscalls[] = {
+#include <asm/unistd.h>
+};
+
+/*
+ * Never meant to run: each DEFINE()/BLANK() below only plants a marker in
+ * the compiler's assembly output.  ENTRY() is redefined per structure so
+ * that every group gets its own symbol prefix.
+ */
+int main(void)
+{
+       /* struct task_struct field offsets, prefixed tsk_ */
+#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
+       ENTRY(state);
+       ENTRY(flags); 
+       ENTRY(thread); 
+       ENTRY(pid);
+       BLANK();
+#undef ENTRY
+       /* struct thread_info field offsets, prefixed threadinfo_ */
+#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
+       ENTRY(flags);
+       ENTRY(addr_limit);
+       ENTRY(preempt_count);
+       ENTRY(status);
+       BLANK();
+#undef ENTRY
+       /* struct x8664_pda field offsets, prefixed pda_ */
+#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
+       ENTRY(kernelstack); 
+       ENTRY(oldrsp); 
+       ENTRY(pcurrent); 
+       ENTRY(irqcount);
+       ENTRY(cpunumber);
+       ENTRY(irqstackptr);
+       ENTRY(data_offset);
+       BLANK();
+#undef ENTRY
+#ifdef CONFIG_IA32_EMULATION
+       /* struct sigcontext_ia32 field offsets, prefixed IA32_SIGCONTEXT_ */
+#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
+       ENTRY(eax);
+       ENTRY(ebx);
+       ENTRY(ecx);
+       ENTRY(edx);
+       ENTRY(esi);
+       ENTRY(edi);
+       ENTRY(ebp);
+       ENTRY(esp);
+       ENTRY(eip);
+       BLANK();
+#undef ENTRY
+       DEFINE(IA32_RT_SIGFRAME_sigcontext,
+              offsetof (struct rt_sigframe32, uc.uc_mcontext));
+       BLANK();
+#endif
+       /* struct pbe (from <linux/suspend.h>) field offsets */
+       DEFINE(pbe_address, offsetof(struct pbe, address));
+       DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
+       DEFINE(pbe_next, offsetof(struct pbe, next));
+       BLANK();
+       DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
+       BLANK();
+       DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
+       BLANK();
+       /* Index of the last element of the syscalls[] table defined above. */
+       DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+       return 0;
+}
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
new file mode 100644 (file)
index 0000000..06d3e5a
--- /dev/null
@@ -0,0 +1,81 @@
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/audit.h>
+#include <asm/unistd.h>
+
+/*
+ * Syscall-number tables, one per audit class, handed to
+ * audit_register_class() by audit_classes_init().  The entries come from the
+ * shared asm-generic lists; every table ends with ~0U (presumably the
+ * end-of-list marker audit_register_class() looks for -- confirm).
+ */
+static unsigned dir_class[] = {
+#include <asm-generic/audit_dir_write.h>
+~0U
+};
+
+static unsigned read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+static unsigned write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
+static unsigned chattr_class[] = {
+#include <asm-generic/audit_change_attr.h>
+~0U
+};
+
+static unsigned signal_class[] = {
+#include <asm-generic/audit_signal.h>
+~0U
+};
+
+/*
+ * Classify an audit architecture value: returns 1 for AUDIT_ARCH_I386 when
+ * the kernel is built with CONFIG_IA32_EMULATION, and 0 in every other case.
+ */
+int audit_classify_arch(int arch)
+{
+#ifdef CONFIG_IA32_EMULATION
+       return arch == AUDIT_ARCH_I386;
+#else
+       return 0;
+#endif
+}
+
+/*
+ * Classify a system call number for the audit code.
+ *
+ * With CONFIG_IA32_EMULATION, an @abi of AUDIT_ARCH_I386 is delegated to
+ * ia32_classify_syscall().  Otherwise the result is 2 for open, 3 for
+ * openat, 5 for execve and 0 for everything else.
+ */
+int audit_classify_syscall(int abi, unsigned syscall)
+{
+#ifdef CONFIG_IA32_EMULATION
+       extern int ia32_classify_syscall(unsigned);
+
+       if (abi == AUDIT_ARCH_I386)
+               return ia32_classify_syscall(syscall);
+#endif
+       if (syscall == __NR_open)
+               return 2;
+       if (syscall == __NR_openat)
+               return 3;
+       if (syscall == __NR_execve)
+               return 5;
+
+       return 0;
+}
+
+/*
+ * Register the syscall class tables with the audit code.  With
+ * CONFIG_IA32_EMULATION the ia32_* tables (defined elsewhere) are registered
+ * under the *_32 class ids first, followed by the native tables from this
+ * file.  Always returns 0; the return values of audit_register_class() are
+ * not checked.
+ */
+static int __init audit_classes_init(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+       extern __u32 ia32_dir_class[];
+       extern __u32 ia32_write_class[];
+       extern __u32 ia32_read_class[];
+       extern __u32 ia32_chattr_class[];
+       extern __u32 ia32_signal_class[];
+       audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
+       audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
+       audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
+       audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
+       audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
+#endif
+       audit_register_class(AUDIT_CLASS_WRITE, write_class);
+       audit_register_class(AUDIT_CLASS_READ, read_class);
+       audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
+       audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
+       audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
+       return 0;
+}
+
+__initcall(audit_classes_init);
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c
new file mode 100644 (file)
index 0000000..4e5e9d3
--- /dev/null
@@ -0,0 +1,24 @@
+/*
+ *  arch/x86_64/kernel/bugs.c
+ *
+ *  Copyright (C) 1994  Linus Torvalds
+ *  Copyright (C) 2000  SuSE
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <asm/alternative.h>
+#include <asm/bugs.h>
+#include <asm/processor.h>
+#include <asm/mtrr.h>
+
+/*
+ * Identify the boot CPU, run mtrr_bp_init() and then
+ * alternative_instructions().  On non-SMP builds the CPU description is
+ * printed from here as well.
+ */
+void __init check_bugs(void)
+{
+       identify_cpu(&boot_cpu_data);
+       mtrr_bp_init();
+#if !defined(CONFIG_SMP)
+       /* Uniprocessor build: print the "CPU: ..." line here. */
+       printk("CPU: ");
+       print_cpu_info(&boot_cpu_data);
+#endif
+       alternative_instructions();
+}
diff --git a/arch/x86/kernel/crash_64.c b/arch/x86/kernel/crash_64.c
new file mode 100644 (file)
index 0000000..13432a1
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kdebug.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+#include <asm/mach_apic.h>
+
+/* Records which CPU is the crashing one. */
+static int crashing_cpu;
+
+#ifdef CONFIG_SMP
+static atomic_t waiting_for_crash_ipi;
+
+/*
+ * Die-notifier callback used to stop the non-crashing CPUs.
+ *
+ * Events other than DIE_NMI_IPI are passed on with NOTIFY_OK.  On the
+ * crashing CPU the event is swallowed with NOTIFY_STOP.  On every other CPU
+ * the registers are saved with crash_save_cpu(), the local APIC is disabled,
+ * waiting_for_crash_ipi is decremented and the CPU halts in an endless loop,
+ * so the function does not return there.
+ */
+static int crash_nmi_callback(struct notifier_block *self,
+                               unsigned long val, void *data)
+{
+       struct pt_regs *regs;
+       int cpu;
+
+       if (val != DIE_NMI_IPI)
+               return NOTIFY_OK;
+
+       regs = ((struct die_args *)data)->regs;
+       cpu = raw_smp_processor_id();
+
+       /*
+        * Don't do anything if this handler is invoked on crashing cpu.
+        * Otherwise, system will completely hang. Crashing cpu can get
+        * an NMI if system was initially booted with nmi_watchdog parameter.
+        */
+       if (cpu == crashing_cpu)
+               return NOTIFY_STOP;
+       local_irq_disable();
+
+       crash_save_cpu(regs, cpu);
+       disable_local_APIC();
+       /* Tell nmi_shootdown_cpus() that this CPU has been dealt with. */
+       atomic_dec(&waiting_for_crash_ipi);
+       /* Assume hlt works */
+       for(;;)
+               halt();
+
+       /* Not reached: the loop above never exits. */
+       return 1;
+}
+
+/* Send an NMI_VECTOR IPI to every CPU except the calling one. */
+static void smp_send_nmi_allbutself(void)
+{
+       send_IPI_allbutself(NMI_VECTOR);
+}
+
+/*
+ * This code is a best effort heuristic to get the
+ * other cpus to stop executing. So races with
+ * cpu hotplug shouldn't matter.
+ */
+
+/* Die notifier that routes NMI IPIs to crash_nmi_callback(). */
+static struct notifier_block crash_nmi_nb = {
+       .notifier_call = crash_nmi_callback,
+};
+
+/*
+ * Stop all other online CPUs: register crash_nmi_nb, send an NMI to every
+ * other CPU and wait up to one second for waiting_for_crash_ipi to drop to
+ * zero.  The local APIC is disabled afterwards.  If the notifier cannot be
+ * registered the function returns without sending anything.
+ */
+static void nmi_shootdown_cpus(void)
+{
+       unsigned long msecs;
+
+       /* One acknowledgement is expected from each CPU other than this one. */
+       atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+       if (register_die_notifier(&crash_nmi_nb))
+               return;         /* return what? */
+
+       /*
+        * Ensure the new callback function is set before sending
+        * out the NMI
+        */
+       wmb();
+
+       smp_send_nmi_allbutself();
+
+       msecs = 1000; /* Wait at most a second for the other cpus to stop */
+       while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+               mdelay(1);
+               msecs--;
+       }
+       /* Leave the nmi callback set */
+       disable_local_APIC();
+}
+#else
+/* Non-SMP build: nothing to do. */
+static void nmi_shootdown_cpus(void)
+{
+       /* There are no cpus to shootdown */
+}
+#endif
+
+/*
+ * Crash-time shutdown: disable interrupts, record the crashing CPU, stop
+ * the other CPUs, disable the local APIC (when present) and the IO-APIC,
+ * and finally save this CPU's registers with crash_save_cpu().
+ *
+ * @regs: register state passed on to crash_save_cpu() for this CPU
+ */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+       /*
+        * This function is only called after the system
+        * has panicked or is otherwise in a critical state.
+        * The minimum amount of code to allow a kexec'd kernel
+        * to run successfully needs to happen here.
+        *
+        * In practice this means shooting down the other cpus in
+        * an SMP system.
+        */
+       /* The kernel is broken so disable interrupts */
+       local_irq_disable();
+
+       /* Make a note of crashing cpu. Will be used in NMI callback.*/
+       crashing_cpu = smp_processor_id();
+       nmi_shootdown_cpus();
+
+       if(cpu_has_apic)
+                disable_local_APIC();
+
+       disable_IO_APIC();
+
+       /* Save the crashing CPU's own state last. */
+       crash_save_cpu(regs, smp_processor_id());
+}
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
new file mode 100644 (file)
index 0000000..942deac
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ *     kernel/crash_dump.c - Memory preserving reboot related code.
+ *
+ *     Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *     Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/errno.h>
+#include <linux/crash_dump.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *     space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *     otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel, so it is mapped temporarily with ioremap() and
+ * unmapped again before returning.
+ *
+ * Returns @csize on success (0 when @csize is 0), -ENOMEM when the page
+ * cannot be mapped, or -EFAULT when the copy to user space fails.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+                               size_t csize, unsigned long offset, int userbuf)
+{
+       void  *vaddr;
+       ssize_t copied = csize;
+
+       if (!csize)
+               return 0;
+
+       vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+       /* ioremap() can fail; never dereference a NULL mapping. */
+       if (!vaddr)
+               return -ENOMEM;
+
+       if (userbuf) {
+               if (copy_to_user(buf, (vaddr + offset), csize))
+                       copied = -EFAULT;
+       } else {
+               memcpy(buf, (vaddr + offset), csize);
+       }
+
+       iounmap(vaddr);
+       return copied;
+}
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
new file mode 100644 (file)
index 0000000..0f4d5e2
--- /dev/null
@@ -0,0 +1,725 @@
+/* 
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ *
+ *  Getting sanitize_e820_map() in sync with i386 version by applying change:
+ *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
+ *     Alex Achenbach <xela@slit.de>, December 2002.
+ *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/pfn.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/bootsetup.h>
+#include <asm/sections.h>
+
+struct e820map e820;
+
+/* 
+ * PFN of last memory page.
+ */
+unsigned long end_pfn; 
+EXPORT_SYMBOL(end_pfn);
+
+/* 
+ * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
+ * The direct mapping extends to end_pfn_map, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */ 
+unsigned long end_pfn_map; 
+
+/* 
+ * Last pfn which the user wants to use.
+ */
+static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
+
+extern struct resource code_resource, data_resource;
+
+/*
+ * Check the candidate range [*addrp, *addrp + size) against a fixed list of
+ * areas that early boot must not touch.  On the first conflict, *addrp is
+ * moved just past the offending area and 1 is returned so that the caller
+ * can try again; 0 means no conflict was found.
+ */
+static inline int bad_addr(unsigned long *addrp, unsigned long size)
+{
+       unsigned long lo = *addrp;
+       unsigned long hi = lo + size;
+
+       /* various gunk below that needed for SMP startup */
+       if (lo < 0x8000) {
+               *addrp = PAGE_ALIGN(0x8000);
+               return 1;
+       }
+
+       /* direct mapping tables of the kernel */
+       if (hi >= (table_start << PAGE_SHIFT) &&
+           lo < (table_end << PAGE_SHIFT)) {
+               *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
+               return 1;
+       }
+
+#ifdef CONFIG_BLK_DEV_INITRD
+       /* initrd, when a boot loader supplied one */
+       if (LOADER_TYPE && INITRD_START && hi >= INITRD_START &&
+           lo < INITRD_START + INITRD_SIZE) {
+               *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
+               return 1;
+       }
+#endif
+
+       /* kernel image, _text up to _end */
+       if (hi >= __pa_symbol(&_text) && lo < __pa_symbol(&_end)) {
+               *addrp = PAGE_ALIGN(__pa_symbol(&_end));
+               return 1;
+       }
+
+       /* EBDA */
+       if (hi >= ebda_addr && lo < ebda_addr + ebda_size) {
+               *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
+               return 1;
+       }
+
+#ifdef CONFIG_NUMA
+       /* NUMA memory to node map (the new address is not page aligned) */
+       if (hi >= nodemap_addr && lo < nodemap_addr + nodemap_size) {
+               *addrp = nodemap_addr + nodemap_size;
+               return 1;
+       }
+#endif
+       /* XXX ramdisk image here? */
+       return 0;
+}
+
+/*
+ * Returns 1 if at least one e820 entry of the given type overlaps the
+ * range <start,end>, and 0 otherwise.  A type of 0 matches entries of any
+ * type.
+ */
+int
+e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
+{
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               struct e820entry *ei = &e820.map[idx];
+
+               if (type && ei->type != type)
+                       continue;
+               /* overlap: the entry starts before end and ends after start */
+               if (ei->addr < end && ei->addr + ei->size > start)
+                       return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+/*
+ * Returns 1 if the entire range <start,end> is covered by e820 entries of
+ * the given type (0 matches any type), and 0 otherwise.
+ *
+ * Note: this function only works correctly if the e820 table is sorted and
+ * non-overlapping, which is the case.
+ */
+int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
+{
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               struct e820entry *ei = &e820.map[idx];
+               int overlaps = ei->addr < end && ei->addr + ei->size > start;
+
+               /* skip entries of the wrong type or outside the range */
+               if ((type && ei->type != type) || !overlaps)
+                       continue;
+
+               /*
+                * The entry covers the current start of the range, so
+                * everything up to the entry's end is accounted for.
+                */
+               if (ei->addr <= start)
+                       start = ei->addr + ei->size;
+
+               /* nothing left to cover: full coverage */
+               if (start >= end)
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * Find a free area of @size bytes inside E820_RAM entries, no lower than
+ * @start and ending no higher than @end.  Candidates that collide with the
+ * areas known to bad_addr() are moved past them.
+ *
+ * Returns the address found, or -1UL when no entry can hold the area.
+ */
+unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
+{
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               struct e820entry *ei = &e820.map[idx];
+               unsigned long candidate, candidate_end;
+
+               if (ei->type != E820_RAM)
+                       continue;
+
+               /* start at the entry, but never below the requested start */
+               candidate = ei->addr;
+               if (candidate < start)
+                       candidate = start;
+               if (candidate > ei->addr + ei->size)
+                       continue;
+
+               /* let bad_addr() push the candidate past forbidden areas */
+               while (bad_addr(&candidate, size) &&
+                      candidate + size <= ei->addr + ei->size)
+                       ;
+
+               /* the fit is checked from the page-aligned candidate */
+               candidate_end = PAGE_ALIGN(candidate) + size;
+               if (candidate_end <= ei->addr + ei->size &&
+                   candidate_end <= end)
+                       return candidate;
+       }
+       return -1UL;
+}
+
+/*
+ * Find the highest page frame number we have available.
+ *
+ * Starts from find_max_pfn_with_active_regions(), grows end_pfn_map to at
+ * least that value (capped at MAXMEM), and returns the value limited by
+ * end_user_pfn and end_pfn_map.  The file-scope end_pfn is not written
+ * here; the result is only returned.
+ */
+unsigned long __init e820_end_of_ram(void)
+{
+       unsigned long last_pfn = find_max_pfn_with_active_regions();
+
+       if (last_pfn > end_pfn_map)
+               end_pfn_map = last_pfn;
+       if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
+               end_pfn_map = MAXMEM>>PAGE_SHIFT;
+
+       if (last_pfn > end_user_pfn)
+               last_pfn = end_user_pfn;
+       if (last_pfn > end_pfn_map)
+               last_pfn = end_pfn_map;
+
+       printk("end_pfn_map = %lu\n", end_pfn_map);
+       return last_pfn;
+}
+
+/*
+ * Create one busy memory resource per e820 entry and request it from
+ * iomem_resource.  For RAM entries the kernel code and data resources (and
+ * the crash kernel resource with CONFIG_KEXEC) are additionally requested
+ * as children of the entry.
+ */
+void __init e820_reserve_resources(void)
+{
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               struct e820entry *ei = &e820.map[idx];
+               struct resource *res = alloc_bootmem_low(sizeof(struct resource));
+
+               if (ei->type == E820_RAM)
+                       res->name = "System RAM";
+               else if (ei->type == E820_ACPI)
+                       res->name = "ACPI Tables";
+               else if (ei->type == E820_NVS)
+                       res->name = "ACPI Non-volatile Storage";
+               else
+                       res->name = "reserved";
+
+               res->start = ei->addr;
+               res->end = res->start + ei->size - 1;
+               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               request_resource(&iomem_resource, res);
+
+               if (ei->type != E820_RAM)
+                       continue;
+
+               /*
+                * It is not known which RAM entry holds the kernel, so the
+                * requests are made against every one of them and the
+                * resource manager sorts out which ones fit.
+                */
+               request_resource(res, &code_resource);
+               request_resource(res, &data_resource);
+#ifdef CONFIG_KEXEC
+               request_resource(res, &crashk_res);
+#endif
+       }
+}
+
+/*
+ * Register as nosave, for software suspend and suspend to RAM, every range
+ * of physical addresses that is not e820 RAM: the holes between entries
+ * and the non-RAM entries themselves.
+ *
+ * The e820 map must be sorted and free of overlapping entries, and the
+ * first entry is assumed to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+       unsigned long prev_end; /* page-aligned end of the previous entry */
+       int idx;
+
+       prev_end = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
+       for (idx = 1; idx < e820.nr_map; idx++) {
+               struct e820entry *ei = &e820.map[idx];
+
+               /* hole between the previous entry and this one */
+               if (prev_end < ei->addr)
+                       register_nosave_region(PFN_DOWN(prev_end),
+                                               PFN_UP(ei->addr));
+
+               prev_end = round_down(ei->addr + ei->size, PAGE_SIZE);
+
+               /* the entry itself, when it is not RAM */
+               if (ei->type != E820_RAM)
+                       register_nosave_region(PFN_UP(ei->addr),
+                                               PFN_DOWN(prev_end));
+
+               /* stop once the end of RAM has been reached */
+               if (prev_end >= (end_pfn << PAGE_SHIFT))
+                       break;
+       }
+}
+
+/*
+ * Finds an active region in the address range from start_pfn to end_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ *
+ * Returns 1 when @ei is E820_RAM and, after rounding to whole pages and
+ * clipping to [start_pfn, end_pfn) and to end_user_pfn, a non-empty pfn
+ * range remains; returns 0 otherwise.  *ei_startpfn and *ei_endpfn are
+ * written on every path, but only describe a usable range when 1 is
+ * returned.
+ *
+ * Side effect: end_pfn_map is raised to the end of any non-RAM entry of at
+ * least one page that lies beyond it.
+ *
+ * Note: the end_pfn parameter hides the file-scope end_pfn variable inside
+ * this function.
+ */
+static int __init e820_find_active_region(const struct e820entry *ei,
+                                         unsigned long start_pfn,
+                                         unsigned long end_pfn,
+                                         unsigned long *ei_startpfn,
+                                         unsigned long *ei_endpfn)
+{
+       /* Only whole pages count: round the start up and the end down. */
+       *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+       *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
+
+       /* Skip map entries smaller than a page */
+       if (*ei_startpfn >= *ei_endpfn)
+               return 0;
+
+       /* Check if end_pfn_map should be updated */
+       if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
+               end_pfn_map = *ei_endpfn;
+
+       /* Skip if map is outside the node */
+       if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+                                   *ei_startpfn >= end_pfn)
+               return 0;
+
+       /* Check for overlaps */
+       if (*ei_startpfn < start_pfn)
+               *ei_startpfn = start_pfn;
+       if (*ei_endpfn > end_pfn)
+               *ei_endpfn = end_pfn;
+
+       /* Obey end_user_pfn to save on memmap */
+       if (*ei_startpfn >= end_user_pfn)
+               return 0;
+       if (*ei_endpfn > end_user_pfn)
+               *ei_endpfn = end_user_pfn;
+
+       return 1;
+}
+
+/*
+ * Walk the e820 map and hand every active region that falls inside
+ * [start_pfn, end_pfn) to add_active_range() for node @nid.
+ */
+void __init
+e820_register_active_regions(int nid, unsigned long start_pfn,
+                                                       unsigned long end_pfn)
+{
+       unsigned long first_pfn, last_pfn;
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               if (!e820_find_active_region(&e820.map[idx],
+                                            start_pfn, end_pfn,
+                                            &first_pfn, &last_pfn))
+                       continue;
+               add_active_range(nid, first_pfn, last_pfn);
+       }
+}
+
+/*
+ * Append a memory region to the kernel e820 map.  When the map already
+ * holds E820MAX entries an error is logged and the region is dropped.
+ */
+void __init add_memory_region(unsigned long start, unsigned long size, int type)
+{
+       struct e820entry *slot;
+
+       if (e820.nr_map == E820MAX) {
+               printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+               return;
+       }
+
+       slot = &e820.map[e820.nr_map];
+       slot->addr = start;
+       slot->size = size;
+       slot->type = type;
+       e820.nr_map++;
+}
+
+/*
+ * Find the hole size (in bytes) in the memory range, i.e. the size of the
+ * range minus the whole pages of active RAM found inside it.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+       unsigned long first_pfn, last_pfn;
+       unsigned long ram_pages = 0;
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               if (!e820_find_active_region(&e820.map[idx],
+                                            start >> PAGE_SHIFT,
+                                            end >> PAGE_SHIFT,
+                                            &first_pfn, &last_pfn))
+                       continue;
+               ram_pages += last_pfn - first_pfn;
+       }
+
+       return end - start - (ram_pages << PAGE_SHIFT);
+}
+
+/*
+ * Print every entry of the e820 map, one line each: the @who tag, the
+ * start and end address, and the entry type ("usable", "reserved",
+ * "ACPI data", "ACPI NVS", or the raw type number).
+ */
+void __init e820_print_map(char *who)
+{
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               struct e820entry *ei = &e820.map[idx];
+
+               printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+                       (unsigned long long) ei->addr,
+                       (unsigned long long) (ei->addr + ei->size));
+
+               switch (ei->type) {
+               case E820_RAM:
+                       printk("(usable)\n");
+                       break;
+               case E820_RESERVED:
+                       printk("(reserved)\n");
+                       break;
+               case E820_ACPI:
+                       printk("(ACPI data)\n");
+                       break;
+               case E820_NVS:
+                       printk("(ACPI NVS)\n");
+                       break;
+               default:
+                       printk("type %u\n", ei->type);
+                       break;
+               }
+       }
+}
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries.  The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ * @biosmap: array of entries; rewritten in place with the sanitized map
+ * @pnr_map: in: number of entries in @biosmap (read through a char
+ *           pointer); out: number of entries after sanitizing
+ *
+ * Returns 0 when the map was rewritten.  Returns -1, leaving @biosmap and
+ * *@pnr_map untouched, when there are fewer than two entries or when an
+ * entry's addr + size wraps around.
+ *
+ * Where entries overlap, the numerically largest type wins.  At most
+ * E820MAX entries are produced.
+ */
+static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+{
+       struct change_member {
+               struct e820entry *pbios; /* pointer to original bios entry */
+               unsigned long long addr; /* address for this change point */
+       };
+       /* Working storage is static __initdata rather than on the stack. */
+       static struct change_member change_point_list[2*E820MAX] __initdata;
+       static struct change_member *change_point[2*E820MAX] __initdata;
+       static struct e820entry *overlap_list[E820MAX] __initdata;
+       static struct e820entry new_bios[E820MAX] __initdata;
+       struct change_member *change_tmp;
+       unsigned long current_type, last_type;
+       unsigned long long last_addr;
+       int chgidx, still_changing;
+       int overlap_entries;
+       int new_bios_entry;
+       int old_nr, new_nr, chg_nr;
+       int i;
+
+       /*
+               Visually we're performing the following (1,2,3,4 = memory types)...
+
+               Sample memory map (w/overlaps):
+                  ____22__________________
+                  ______________________4_
+                  ____1111________________
+                  _44_____________________
+                  11111111________________
+                  ____________________33__
+                  ___________44___________
+                  __________33333_________
+                  ______________22________
+                  ___________________2222_
+                  _________111111111______
+                  _____________________11_
+                  _________________4______
+
+               Sanitized equivalent (no overlap):
+                  1_______________________
+                  _44_____________________
+                  ___1____________________
+                  ____22__________________
+                  ______11________________
+                  _________1______________
+                  __________3_____________
+                  ___________44___________
+                  _____________33_________
+                  _______________2________
+                  ________________1_______
+                  _________________4______
+                  ___________________2____
+                  ____________________33__
+                  ______________________4_
+       */
+
+       /* if there's only one memory region, don't bother */
+       if (*pnr_map < 2)
+               return -1;
+
+       old_nr = *pnr_map;
+
+       /* bail out if we find any unreasonable addresses in bios map */
+       for (i=0; i<old_nr; i++)
+               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+                       return -1;
+
+       /* create pointers for initial change-point information (for sorting) */
+       for (i=0; i < 2*old_nr; i++)
+               change_point[i] = &change_point_list[i];
+
+       /* record all known change-points (starting and ending addresses),
+          omitting those that are for empty memory regions */
+       chgidx = 0;
+       for (i=0; i < old_nr; i++)      {
+               if (biosmap[i].size != 0) {
+                       change_point[chgidx]->addr = biosmap[i].addr;
+                       change_point[chgidx++]->pbios = &biosmap[i];
+                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+                       change_point[chgidx++]->pbios = &biosmap[i];
+               }
+       }
+       chg_nr = chgidx;
+
+       /* sort change-point list by memory addresses (low -> high) */
+       /* (repeated adjacent-swap passes until a pass makes no swap) */
+       still_changing = 1;
+       while (still_changing)  {
+               still_changing = 0;
+               for (i=1; i < chg_nr; i++)  {
+                       /* if <current_addr> > <last_addr>, swap */
+                       /* or, if current=<start_addr> & last=<end_addr>, swap */
+                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
+                               ((change_point[i]->addr == change_point[i-1]->addr) &&
+                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
+                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+                          )
+                       {
+                               change_tmp = change_point[i];
+                               change_point[i] = change_point[i-1];
+                               change_point[i-1] = change_tmp;
+                               still_changing=1;
+                       }
+               }
+       }
+
+       /* create a new bios memory map, removing overlaps */
+       overlap_entries=0;       /* number of entries in the overlap table */
+       new_bios_entry=0;        /* index for creating new bios map entries */
+       last_type = 0;           /* start with undefined memory type */
+       last_addr = 0;           /* start with 0 as last starting address */
+       /* loop through change-points, determining effect on the new bios map */
+       for (chgidx=0; chgidx < chg_nr; chgidx++)
+       {
+               /* keep track of all overlapping bios entries */
+               /* (a change point whose addr equals its entry's addr is a start) */
+               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+               {
+                       /* add map entry to overlap list (> 1 entry implies an overlap) */
+                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+               }
+               else
+               {
+                       /* remove entry from list (order independent, so swap with last) */
+                       for (i=0; i<overlap_entries; i++)
+                       {
+                               if (overlap_list[i] == change_point[chgidx]->pbios)
+                                       overlap_list[i] = overlap_list[overlap_entries-1];
+                       }
+                       overlap_entries--;
+               }
+               /* if there are overlapping entries, decide which "type" to use */
+               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+               current_type = 0;
+               for (i=0; i<overlap_entries; i++)
+                       if (overlap_list[i]->type > current_type)
+                               current_type = overlap_list[i]->type;
+               /* continue building up new bios map based on this information */
+               if (current_type != last_type)  {
+                       /* close the entry that was open up to this change point */
+                       if (last_type != 0)      {
+                               new_bios[new_bios_entry].size =
+                                       change_point[chgidx]->addr - last_addr;
+                               /* move forward only if the new size was non-zero */
+                               if (new_bios[new_bios_entry].size != 0)
+                                       if (++new_bios_entry >= E820MAX)
+                                               break;  /* no more space left for new bios entries */
+                       }
+                       /* open a new entry starting at this change point */
+                       if (current_type != 0)  {
+                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+                               new_bios[new_bios_entry].type = current_type;
+                               last_addr=change_point[chgidx]->addr;
+                       }
+                       last_type = current_type;
+               }
+       }
+       new_nr = new_bios_entry;   /* retain count for new bios entries */
+
+       /* copy new bios mapping into original location */
+       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+       *pnr_map = new_nr;
+
+       return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into the kernel's e820 map, sanity-checking it on
+ * the way.
+ *
+ * Returns 0 on success.  Returns -1 when the map has fewer than two
+ * entries, or as soon as an entry is found whose start + size wraps
+ * around; entries copied before that point stay in the kernel map.
+ */
+static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+{
+       int idx;
+
+       /* Only one memory region (or negative)? Ignore it */
+       if (nr_map < 2)
+               return -1;
+
+       for (idx = 0; idx < nr_map; idx++) {
+               struct e820entry *ent = &biosmap[idx];
+               unsigned long start = ent->addr;
+               unsigned long size = ent->size;
+               unsigned long type = ent->type;
+
+               /* Overflow in 64 bits? Ignore the memory map. */
+               if (start > start + size)
+                       return -1;
+
+               add_memory_region(start, size, type);
+       }
+       return 0;
+}
+
+/*
+ * Report a fatal early-boot error: print @msg with early_printk() and then
+ * panic() with it.
+ *
+ * @msg is passed as a "%s" argument rather than as the format string, so a
+ * message that happens to contain '%' is printed literally instead of being
+ * interpreted as conversion specifiers.
+ */
+void early_panic(char *msg)
+{
+       early_printk("%s", msg);
+       panic("%s", msg);
+}
+
+/*
+ * Build the kernel's e820 map from the BIOS-supplied one: sanitize E820_MAP
+ * in place, copy it into e820 and print the result.  Calls early_panic()
+ * when copy_e820_map() rejects the map.
+ */
+void __init setup_memory_region(void)
+{
+       /*
+        * Try to copy the BIOS-supplied E820-map.
+        *
+        * NOTE(review): no fallback map is faked here; if the copy fails
+        * the boot is aborted via early_panic().  The return value of
+        * sanitize_e820_map() is not checked.
+        */
+       sanitize_e820_map(E820_MAP, &E820_MAP_NR);
+       if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
+               early_panic("Cannot find a valid memory map");
+       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+       e820_print_map("BIOS-e820");
+}
+
+/*
+ * "mem=" boot option: parse the size with memparse() and store it, converted
+ * to a page frame number, in end_user_pfn.  Returns -EINVAL when the option
+ * has no value, 0 otherwise.
+ */
+static int __init parse_memopt(char *p)
+{
+       if (p == NULL)
+               return -EINVAL;
+
+       end_user_pfn = memparse(p, &p) >> PAGE_SHIFT;
+       return 0;
+}
+early_param("mem", parse_memopt);
+
+static int userdef __initdata;
+
+/*
+ * Parse one "memmap=" boot option.  Sizes and addresses are read with
+ * memparse().
+ *
+ *   memmap=exactmap   drop the current e820 map (nr_map and end_pfn_map are
+ *                     reset) and mark the map as user-defined
+ *   memmap=nn@ss      add nn bytes at ss as E820_RAM
+ *   memmap=nn#ss      add nn bytes at ss as E820_ACPI
+ *   memmap=nn$ss      add nn bytes at ss as E820_RESERVED
+ *   memmap=nn         set end_user_pfn to nn >> PAGE_SHIFT
+ *
+ * Returns 0 on success, or -EINVAL when the option has no value, when no
+ * size could be parsed, or when characters are left over after parsing.
+ */
+static int __init parse_memmap_opt(char *p)
+{
+       char *oldp;
+       unsigned long long start_at, mem_size;
+
+       /* A bare "memmap" without "=value" arrives here as NULL. */
+       if (!p)
+               return -EINVAL;
+
+       if (!strcmp(p, "exactmap")) {
+#ifdef CONFIG_CRASH_DUMP
+               /* If we are doing a crash dump, we
+                * still need to know the real mem
+                * size before original memory map is
+                * reset.
+                */
+               e820_register_active_regions(0, 0, -1UL);
+               saved_max_pfn = e820_end_of_ram();
+               remove_all_active_ranges();
+#endif
+               end_pfn_map = 0;
+               e820.nr_map = 0;
+               userdef = 1;
+               return 0;
+       }
+
+       oldp = p;
+       mem_size = memparse(p, &p);
+       /* memparse() did not consume anything: there is no size to use */
+       if (p == oldp)
+               return -EINVAL;
+       if (*p == '@') {
+               start_at = memparse(p+1, &p);
+               add_memory_region(start_at, mem_size, E820_RAM);
+       } else if (*p == '#') {
+               start_at = memparse(p+1, &p);
+               add_memory_region(start_at, mem_size, E820_ACPI);
+       } else if (*p == '$') {
+               start_at = memparse(p+1, &p);
+               add_memory_region(start_at, mem_size, E820_RESERVED);
+       } else {
+               end_user_pfn = (mem_size >> PAGE_SHIFT);
+       }
+       /* anything left unparsed makes the whole option invalid */
+       return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+
+/*
+ * Print the e820 map again, tagged "user", when the userdef flag was set
+ * during command-line parsing.
+ */
+void __init finish_e820_parsing(void)
+{
+       if (!userdef)
+               return;
+
+       printk(KERN_INFO "user-defined physical RAM map:\n");
+       e820_print_map("user");
+}
+
+unsigned long pci_mem_start = 0xaeedbabe;
+EXPORT_SYMBOL(pci_mem_start);
+
+/*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space.  We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS left enough space.
+ *
+ * The result is stored in pci_mem_start.  When no gap larger than 4MB
+ * (the initial gapsize) is found, a warning is printed and the gap is
+ * placed 1MB above the end of RAM (end_pfn).
+ */
+__init void e820_setup_gap(void)
+{
+       unsigned long gapstart, gapsize, round;
+       unsigned long last;
+       int i;
+       int found = 0;
+
+       /* "last" is the lowest entry start seen so far; begin at 4GB. */
+       last = 0x100000000ull;
+       gapstart = 0x10000000;
+       gapsize = 0x400000;
+       /* Walk the map from the last entry down to the first. */
+       i = e820.nr_map;
+       while (--i >= 0) {
+               unsigned long long start = e820.map[i].addr;
+               unsigned long long end = start + e820.map[i].size;
+
+               /*
+                * Since "last" is at most 4GB, we know we'll
+                * fit in 32 bits if this condition is true
+                */
+               if (last > end) {
+                       unsigned long gap = last - end;
+
+                       if (gap > gapsize) {
+                               gapsize = gap;
+                               gapstart = end;
+                               found = 1;
+                       }
+               }
+               if (start < last)
+                       last = start;
+       }
+
+       if (!found) {
+               gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
+               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
+                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
+       }
+
+       /*
+        * See how much we want to round up: start off with
+        * rounding to the next 1MB area.
+        * The granularity doubles for as long as a sixteenth of the gap
+        * size is still larger than it.
+        */
+       round = 0x100000;
+       while ((gapsize >> 4) > round)
+               round += round;
+       /* Fun with two's complement */
+       /* (advances to the next multiple of "round" above gapstart) */
+       pci_mem_start = (gapstart + round) & -round;
+
+       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+               pci_mem_start, gapstart, gapsize);
+}
diff --git a/arch/x86/kernel/early-quirks_64.c b/arch/x86/kernel/early-quirks_64.c
new file mode 100644 (file)
index 0000000..13aa4fd
--- /dev/null
@@ -0,0 +1,127 @@
+/* Various workarounds for chipset bugs.
+   This code runs very early and can't use the regular PCI subsystem
+   The entries are keyed to PCI bridges which usually identify chipsets
+   uniquely.
+   This is only for whole classes of chipsets with specific problems which
+   need early invasive action (e.g. before the timers are initialized).
+   Most PCI device specific workarounds can be done later and should be
+   in standard PCI quirks
+   Mainboard specific bugs should be handled by DMI entries.
+   CPU specific bugs in setup.c */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/pci_ids.h>
+#include <asm/pci-direct.h>
+#include <asm/proto.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+
+static void __init via_bugs(void)
+{
+#ifdef CONFIG_IOMMU
+       /* iommu_aperture_allowed set (see the hint printed below): do nothing */
+       if (iommu_aperture_allowed)
+               return;
+
+       /* Only act when end_pfn exceeds MAX_DMA32_PFN or the IOMMU is forced */
+       if (end_pfn <= MAX_DMA32_PFN && !force_iommu)
+               return;
+
+       printk(KERN_INFO
+  "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
+       iommu_aperture_disabled = 1;
+#endif
+}
+
+#ifdef CONFIG_ACPI
+
+/*
+ * Handler passed to acpi_table_parse() for the HPET table signature in
+ * nvidia_bugs().  It ignores the table header and always returns 0.
+ */
+static int __init nvidia_hpet_check(struct acpi_table_header *header)
+{
+       return 0;
+}
+#endif
+
+static void __init nvidia_bugs(void)
+{
+#ifdef CONFIG_ACPI
+       /*
+        * Timer overrides reported on Nvidia chipsets are wrong unless
+        * HPET is enabled -- except that this rule does not hold on many
+        * Asus boards.  There is no known way to detect that case
+        * automatically, so acpi_use_timer_override (command line) turns
+        * this workaround off.  Otherwise the override is skipped when
+        * acpi_table_parse() for the HPET signature returns non-zero.
+        */
+       if (!acpi_use_timer_override &&
+           acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) {
+               acpi_skip_timer_override = 1;
+               printk(KERN_INFO
+                      "Nvidia board detected. Ignoring ACPI timer override.\n");
+               printk(KERN_INFO
+                      "If you got timer trouble try acpi_use_timer_override\n");
+       }
+#endif
+       /* RED-PEN skip them on mptables too? */
+
+}
+
+static void __init ati_bugs(void)
+{
+       /* Only the value 1 is rewritten; any other setting is left alone */
+       if (timer_over_8254 != 1)
+               return;
+
+       timer_over_8254 = 0;
+       printk(KERN_INFO
+       "ATI board detected. Disabling timer routing over 8254.\n");
+}
+
+/* One early quirk: a PCI vendor ID and the handler to call when it matches. */
+struct chipset {
+       u16 vendor;
+       void (*f)(void);
+};
+
+/*
+ * Quirk table walked by early_quirks().  The trailing empty entry (NULL
+ * handler) terminates the walk.
+ */
+static struct chipset early_qrk[] __initdata = {
+       { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
+       { PCI_VENDOR_ID_VIA, via_bugs },
+       { PCI_VENDOR_ID_ATI, ati_bugs },
+       {}
+};
+
+void __init early_quirks(void)
+{
+       int bus, dev, fn;
+
+       if (!early_pci_allowed())
+               return;
+
+       /*
+        * Poor man's PCI discovery: probe buses 0-31, slots 0-31 and
+        * functions 0-7 with direct config reads, and run the handler of
+        * the first PCI-PCI bridge whose vendor is in early_qrk[].
+        */
+       for (bus = 0; bus < 32; bus++) {
+               for (dev = 0; dev < 32; dev++) {
+                       for (fn = 0; fn < 8; fn++) {
+                               struct chipset *quirk;
+                               u32 class_rev;
+                               u32 vendor_id;
+                               u8 hdr_type;
+
+                               class_rev = read_pci_config(bus, dev, fn,
+                                                           PCI_CLASS_REVISION);
+                               /* All ones: stop scanning this slot */
+                               if (class_rev == 0xffffffff)
+                                       break;
+
+                               /* Only PCI-PCI bridges are examined */
+                               if ((class_rev >> 16) != PCI_CLASS_BRIDGE_PCI)
+                                       continue;
+
+                               vendor_id = read_pci_config(bus, dev, fn,
+                                                           PCI_VENDOR_ID) & 0xffff;
+
+                               for (quirk = early_qrk; quirk->f; quirk++) {
+                                       if (quirk->vendor != vendor_id)
+                                               continue;
+                                       /* First match wins; nothing else is scanned */
+                                       quirk->f();
+                                       return;
+                               }
+
+                               /* Bit 7 of the header type clear: skip the remaining functions */
+                               hdr_type = read_pci_config_byte(bus, dev, fn,
+                                                               PCI_HEADER_TYPE);
+                               if (!(hdr_type & 0x80))
+                                       break;
+                       }
+               }
+       }
+}
index 92f812ba275cf7026415ea2da043012475a320e6..fd9aff3f389011d1a1e3c2cf7a4674325624005d 100644 (file)
@@ -1,2 +1,259 @@
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/screen_info.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/fcntl.h>
+#include <xen/hvc-console.h>
 
-#include "../../x86_64/kernel/early_printk.c"
+/* Simple VGA output */
+
+#ifdef __i386__
+#include <asm/setup.h>
+#else
+#include <asm/bootsetup.h>
+#endif
+#define VGABASE                (__ISA_IO_base + 0xb8000)
+
+static int max_ypos = 25, max_xpos = 80;
+static int current_ypos = 25, current_xpos = 0;
+
+static void early_vga_write(struct console *con, const char *str, unsigned n)
+{
+       char ch;
+       int col, src, dst;
+
+       /* Stops at a NUL byte or once n characters have been consumed */
+       while ((ch = *str++) != '\0' && n-- > 0) {
+               if (current_ypos >= max_ypos) {
+                       /* scroll 1 line up: copy every row onto the row above it */
+                       for (src = 1, dst = 0; src < max_ypos; src++, dst++)
+                               for (col = 0; col < max_xpos; col++)
+                                       writew(readw(VGABASE + 2*(max_xpos*src + col)),
+                                              VGABASE + 2*(max_xpos*dst + col));
+                       /* fill the row after the last copied one with 0x720 cells */
+                       for (col = 0; col < max_xpos; col++)
+                               writew(0x720, VGABASE + 2*(max_xpos*dst + col));
+                       current_ypos = max_ypos-1;
+               }
+
+               switch (ch) {
+               case '\n':
+                       current_xpos = 0;
+                       current_ypos++;
+                       break;
+               case '\r':
+                       /* carriage returns are dropped */
+                       break;
+               default:
+                       /* character in the low byte, 0x7 in the high byte */
+                       writew(((0x7 << 8) | (unsigned short) ch),
+                              VGABASE + 2*(max_xpos*current_ypos +
+                                               current_xpos++));
+                       /* wrap to the next row at the right edge */
+                       if (current_xpos >= max_xpos) {
+                               current_xpos = 0;
+                               current_ypos++;
+                       }
+                       break;
+               }
+       }
+}
+
+/*
+ * Console whose output goes through early_vga_write().  It is also the
+ * initial value of early_console.
+ */
+static struct console early_vga_console = {
+       .name =         "earlyvga",
+       .write =        early_vga_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
+
+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
+
+static int early_serial_base = 0x3f8;  /* ttyS0 */
+
+#define XMTRDY          0x20
+
+#define DLAB           0x80
+
+#define TXR             0       /*  Transmit register (WRITE) */
+#define RXR             0       /*  Receive register  (READ)  */
+#define IER             1       /*  Interrupt Enable          */
+#define IIR             2       /*  Interrupt ID              */
+#define FCR             2       /*  FIFO control              */
+#define LCR             3       /*  Line control              */
+#define MCR             4       /*  Modem control             */
+#define LSR             5       /*  Line Status               */
+#define MSR             6       /*  Modem Status              */
+#define DLL             0       /*  Divisor Latch Low         */
+#define DLH             1       /*  Divisor latch High        */
+
+static int early_serial_putc(unsigned char ch)
+{
+       unsigned spins = 0xffff;
+
+       /* Poll LSR until XMTRDY is set or the spin budget runs out */
+       for (;;) {
+               if ((inb(early_serial_base + LSR) & XMTRDY) != 0)
+                       break;
+               if (--spins == 0)
+                       break;
+               cpu_relax();
+       }
+
+       /* The byte is written even after a timeout; only the result differs */
+       outb(ch, early_serial_base + TXR);
+       if (spins == 0)
+               return -1;
+       return 0;
+}
+
+static void early_serial_write(struct console *con, const char *s, unsigned n)
+{
+       /* Emit at most n characters, stopping early at a NUL byte */
+       for (; *s && n-- > 0; s++) {
+               /* A newline is sent as "\r\n" */
+               if (*s == '\n')
+                       early_serial_putc('\r');
+               early_serial_putc(*s);
+       }
+}
+
+#define DEFAULT_BAUD 9600
+
+/*
+ * Parse the option text that follows the console keyword and program the
+ * UART at early_serial_base.
+ *
+ * @s: remaining option string; an optional leading ',' is skipped.  The
+ *     first field is either an I/O base starting with "0x" or a port
+ *     index (optionally prefixed "ttyS"); the field after the next ','
+ *     is the baud rate.
+ */
+static __init void early_serial_init(char *s)
+{
+       unsigned char c;
+       unsigned divisor;
+       unsigned baud = DEFAULT_BAUD;
+       char *e;
+
+       if (*s == ',')
+               ++s;
+
+       if (*s) {
+               unsigned port;
+               if (!strncmp(s,"0x",2)) {
+                       /* explicit I/O port base given in hex */
+                       early_serial_base = simple_strtoul(s, &e, 16);
+               } else {
+                       static int bases[] = { 0x3f8, 0x2f8 };
+
+                       if (!strncmp(s,"ttyS",4))
+                               s += 4;
+                       port = simple_strtoul(s, &e, 10);
+                       /* anything other than 0 or 1, or no digits at all, selects entry 0 */
+                       if (port > 1 || s == e)
+                               port = 0;
+                       early_serial_base = bases[port];
+               }
+               /* advance past the next ',' (if any) to reach the baud field */
+               s += strcspn(s, ",");
+               if (*s == ',')
+                       s++;
+       }
+
+       outb(0x3, early_serial_base + LCR);     /* 8n1 */
+       outb(0, early_serial_base + IER);       /* no interrupt */
+       outb(0, early_serial_base + FCR);       /* no fifo */
+       outb(0x3, early_serial_base + MCR);     /* DTR + RTS */
+
+       if (*s) {
+               baud = simple_strtoul(s, &e, 0);
+               /* zero or unparsable baud falls back to DEFAULT_BAUD */
+               if (baud == 0 || s == e)
+                       baud = DEFAULT_BAUD;
+       }
+
+       /* set DLAB in LCR, write the divisor to DLL/DLH, then clear DLAB again */
+       divisor = 115200 / baud;
+       c = inb(early_serial_base + LCR);
+       outb(c | DLAB, early_serial_base + LCR);
+       outb(divisor & 0xff, early_serial_base + DLL);
+       outb((divisor >> 8) & 0xff, early_serial_base + DLH);
+       outb(c & ~DLAB, early_serial_base + LCR);
+}
+
+/* Console whose output goes through early_serial_write(). */
+static struct console early_serial_console = {
+       .name =         "earlyser",
+       .write =        early_serial_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
+
+/* Console interface to a host file on AMD's SimNow! */
+
+static int simnow_fd;
+
+/*
+ * Constants used by simnow(): MAGIC1 is loaded into the register tied to
+ * the result, MAGIC2 is added to the command number, and XOPEN/XWRITE
+ * are the two command numbers used in this file.
+ */
+enum {
+       MAGIC1 = 0xBACCD00A,
+       MAGIC2 = 0xCA110000,
+       XOPEN = 5,
+       XWRITE = 4,
+};
+
+/*
+ * Issue one request through the cpuid instruction.  Inputs: MAGIC1 in
+ * the register that also receives the result ("a"), cmd + MAGIC2 in "D",
+ * and the three arguments a, b, c in "b", "c" and "d".  Returns whatever
+ * the instruction leaves in the "a" register.
+ */
+static noinline long simnow(long cmd, long a, long b, long c)
+{
+       long ret;
+       asm volatile("cpuid" :
+                    "=a" (ret) :
+                    "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
+       return ret;
+}
+
+static void __init simnow_init(char *str)
+{
+       char *path;
+
+       /* "=<name>" selects the file name; otherwise "klog" is used */
+       if (*str == '=')
+               path = str + 1;
+       else
+               path = "klog";
+
+       /* error ignored */
+       simnow_fd = simnow(XOPEN, (unsigned long)path, O_WRONLY|O_APPEND|O_CREAT, 0644);
+}
+
+/*
+ * Console write hook: passes the buffer and its length to an XWRITE
+ * request on simnow_fd.  The result of the request is not checked.
+ */
+static void simnow_write(struct console *con, const char *s, unsigned n)
+{
+       simnow(XWRITE, simnow_fd, (unsigned long)s, n);
+}
+
+/* Console whose output goes through simnow_write(). */
+static struct console simnow_console = {
+       .name =         "simnow",
+       .write =        simnow_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
+
+/* Direct interface for emergencies */
+struct console *early_console = &early_vga_console;
+static int early_console_initialized = 0;
+
+void early_printk(const char *fmt, ...)
+{
+       char line[512];
+       va_list args;
+       int len;
+
+       /* Format into a fixed on-stack buffer; longer output is truncated */
+       va_start(args, fmt);
+       len = vscnprintf(line, sizeof(line), fmt, args);
+       va_end(args);
+
+       /* Hand the text straight to the current early console's write hook */
+       early_console->write(early_console, line, len);
+}
+
+static int __initdata keep_early;
+
+static int __init setup_early_printk(char *buf)
+{
+       if (!buf)
+               return 0;
+
+       if (early_console_initialized)
+               return 0;
+       early_console_initialized = 1;
+
+       if (strstr(buf, "keep"))
+               keep_early = 1;
+
+       if (!strncmp(buf, "serial", 6)) {
+               early_serial_init(buf + 6);
+               early_console = &early_serial_console;
+       } else if (!strncmp(buf, "ttyS", 4)) {
+               early_serial_init(buf);
+               early_console = &early_serial_console;
+       } else if (!strncmp(buf, "vga", 3)
+                  && SCREEN_INFO.orig_video_isVGA == 1) {
+               max_xpos = SCREEN_INFO.orig_video_cols;
+               max_ypos = SCREEN_INFO.orig_video_lines;
+               current_ypos = SCREEN_INFO.orig_y;
+               early_console = &early_vga_console;
+       } else if (!strncmp(buf, "simnow", 6)) {
+               simnow_init(buf + 6);
+               early_console = &simnow_console;
+               keep_early = 1;
+#ifdef CONFIG_HVC_XEN
+       } else if (!strncmp(buf, "xen", 3)) {
+               early_console = &xenboot_console;
+#endif
+       }
+
+       if (keep_early)
+               early_console->flags &= ~CON_BOOT;
+       else
+               early_console->flags |= CON_BOOT;
+       register_console(early_console);
+       return 0;
+}
+early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
new file mode 100644 (file)
index 0000000..1d232e5
--- /dev/null
@@ -0,0 +1,1172 @@
+/*
+ *  linux/arch/x86_64/entry.S
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
+ *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after an interrupt and after each system call.
+ * 
+ * Normal syscalls and interrupts don't save a full stack frame, this is 
+ * only done for syscall tracing, signals or fork/exec et.al.
+ * 
+ * A note on terminology:       
+ * - top of stack: Architecture defined interrupt frame from SS to RIP 
+ * at the top of the kernel process stack.     
+ * - partial stack frame: partially saved registers up to R11.
+ * - full stack frame: Like partial stack frame, but with all registers saved.
+ *
+ * Some macro usage:
+ * - CFI macros are used to generate dwarf2 unwind information for better
+ * backtraces. They don't change any code.
+ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
+ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
+ * There are unfortunately lots of special cases where some registers are
+ * not touched. The macro is a big mess that should be cleaned up.
+ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
+ * Gives a full stack frame.
+ * - ENTRY/END Define functions in the symbol table.
+ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
+ * frame that is otherwise undefined after a SYSCALL
+ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
+ * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm-offsets.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/page.h>
+#include <asm/irqflags.h>
+
+       .code64
+
+#ifndef CONFIG_PREEMPT
+#define retint_kernel retint_restore_args
+#endif 
+
+
+/*
+ * Used before an iretq: when bit 9 of the saved EFLAGS word in the frame
+ * (located at EFLAGS-\offset from %rsp) is set, invoke TRACE_IRQS_ON.
+ * Expands to nothing unless CONFIG_TRACE_IRQFLAGS is defined.
+ */
+.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+#ifdef CONFIG_TRACE_IRQFLAGS
+       bt   $9,EFLAGS-\offset(%rsp)    /* interrupts off? */
+       jnc  1f
+       TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+/*
+ * C code is not supposed to know about undefined top of stack. Every time 
+ * a C function with a pt_regs argument is called from the SYSCALL based 
+ * fast path FIXUP_TOP_OF_STACK is needed.
+ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
+ * manipulation.
+ */            
+               
+       /* %rsp:at FRAMEEND */ 
+       /*
+        * Fill in the top-of-stack slots of the frame at %rsp: RSP from
+        * pda_oldrsp, SS and CS with the user selectors, RCX with -1 and
+        * EFLAGS from the saved R11 slot.  \tmp is clobbered as scratch.
+        */
+       .macro FIXUP_TOP_OF_STACK tmp
+       movq    %gs:pda_oldrsp,\tmp
+       movq    \tmp,RSP(%rsp)
+       movq    $__USER_DS,SS(%rsp)
+       movq    $__USER_CS,CS(%rsp)
+       movq    $-1,RCX(%rsp)
+       movq    R11(%rsp),\tmp  /* get eflags */
+       movq    \tmp,EFLAGS(%rsp)
+       .endm
+
+       /*
+        * Copy the frame's RSP slot back to pda_oldrsp and its EFLAGS slot
+        * back to the R11 slot.  Slots are addressed relative to %rsp minus
+        * \offset; \tmp is clobbered as scratch.
+        */
+       .macro RESTORE_TOP_OF_STACK tmp,offset=0
+       movq   RSP-\offset(%rsp),\tmp
+       movq   \tmp,%gs:pda_oldrsp
+       movq   EFLAGS-\offset(%rsp),\tmp
+       movq   \tmp,R11-\offset(%rsp)
+       .endm
+
+       /*
+        * Push a six-word frame: ss = 0, rsp = 0, eflags = 1<<9,
+        * cs = __KERNEL_CS, rip = \child_rip, orig rax = 0.  Clobbers %rax
+        * (zeroed); UNFAKE_STACK_FRAME pops the frame again.
+        */
+       .macro FAKE_STACK_FRAME child_rip
+       /* push in order ss, rsp, eflags, cs, rip */
+       xorl %eax, %eax
+       pushq %rax /* ss */
+       CFI_ADJUST_CFA_OFFSET   8
+       /*CFI_REL_OFFSET        ss,0*/
+       pushq %rax /* rsp */
+       CFI_ADJUST_CFA_OFFSET   8
+       CFI_REL_OFFSET  rsp,0
+       pushq $(1<<9) /* eflags - interrupts on */
+       CFI_ADJUST_CFA_OFFSET   8
+       /*CFI_REL_OFFSET        rflags,0*/
+       pushq $__KERNEL_CS /* cs */
+       CFI_ADJUST_CFA_OFFSET   8
+       /*CFI_REL_OFFSET        cs,0*/
+       pushq \child_rip /* rip */
+       CFI_ADJUST_CFA_OFFSET   8
+       CFI_REL_OFFSET  rip,0
+       pushq   %rax /* orig rax */
+       CFI_ADJUST_CFA_OFFSET   8
+       .endm
+
+       /* Drop the six 8-byte words pushed by FAKE_STACK_FRAME. */
+       .macro UNFAKE_STACK_FRAME
+       addq $8*6, %rsp
+       CFI_ADJUST_CFA_OFFSET   -(6*8)
+       .endm
+
+       /*
+        * Unwind annotations for a full register frame at %rsp.  With
+        * \start non-zero (the default) a new "simple" signal-frame CFI
+        * procedure is opened with the CFA at rsp+SS+8; with \start zero
+        * only the CFA offset is reset.  Each listed register is then
+        * recorded at its frame offset.  Emits no instructions.
+        */
+       .macro  CFI_DEFAULT_STACK start=1
+       .if \start
+       CFI_STARTPROC   simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA     rsp,SS+8
+       .else
+       CFI_DEF_CFA_OFFSET SS+8
+       .endif
+       CFI_REL_OFFSET  r15,R15
+       CFI_REL_OFFSET  r14,R14
+       CFI_REL_OFFSET  r13,R13
+       CFI_REL_OFFSET  r12,R12
+       CFI_REL_OFFSET  rbp,RBP
+       CFI_REL_OFFSET  rbx,RBX
+       CFI_REL_OFFSET  r11,R11
+       CFI_REL_OFFSET  r10,R10
+       CFI_REL_OFFSET  r9,R9
+       CFI_REL_OFFSET  r8,R8
+       CFI_REL_OFFSET  rax,RAX
+       CFI_REL_OFFSET  rcx,RCX
+       CFI_REL_OFFSET  rdx,RDX
+       CFI_REL_OFFSET  rsi,RSI
+       CFI_REL_OFFSET  rdi,RDI
+       CFI_REL_OFFSET  rip,RIP
+       /*CFI_REL_OFFSET        cs,CS*/
+       /*CFI_REL_OFFSET        rflags,EFLAGS*/
+       CFI_REL_OFFSET  rsp,RSP
+       /*CFI_REL_OFFSET        ss,SS*/
+       .endm
+/*
+ * A newly forked process directly context switches into this.
+ */    
+/* rdi:        prev */ 
+ENTRY(ret_from_fork)
+       CFI_DEFAULT_STACK
+       /*
+        * Reload the flags register from kernel_eflags.  In 64-bit code
+        * the push and the popf each move %rsp by 8 bytes, so the CFA
+        * adjustments must be 8 (not 4) for the unwind info to stay
+        * correct between the two instructions.
+        */
+       push kernel_eflags(%rip)
+       CFI_ADJUST_CFA_OFFSET 8
+       popf                            # reset kernel eflags
+       CFI_ADJUST_CFA_OFFSET -8
+       call schedule_tail
+       GET_THREAD_INFO(%rcx)
+       /* syscall trace/audit flags set: report the exit via rff_trace first */
+       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+       jnz rff_trace
+rff_action:    
+       RESTORE_REST
+       /* saved CS has no low bits set: return through the IRET path */
+       testl $3,CS-ARGOFFSET(%rsp)     # from kernel_thread?
+       je   int_ret_from_sys_call
+       /* _TIF_IA32 tasks also return through the IRET path */
+       testl $_TIF_IA32,threadinfo_flags(%rcx)
+       jnz  int_ret_from_sys_call
+       /* otherwise resync pda_oldrsp / R11 from the frame and take the SYSRET path */
+       RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
+       jmp ret_from_sys_call
+rff_trace:
+       movq %rsp,%rdi
+       call syscall_trace_leave
+       GET_THREAD_INFO(%rcx)   
+       jmp rff_action
+       CFI_ENDPROC
+END(ret_from_fork)
+
+/*
+ * System call entry. Up to 6 arguments in registers are supported.
+ *
+ * SYSCALL does not save anything on the stack and does not change the
+ * stack pointer.
+ */
+               
+/*
+ * Register setup:     
+ * rax  system call number
+ * rdi  arg0
+ * rcx  return address for syscall/sysret, C arg3 
+ * rsi  arg1
+ * rdx  arg2   
+ * r10  arg3   (--> moved to rcx for C)
+ * r8   arg4
+ * r9   arg5
+ * r11  eflags for syscall/sysret, temporary for C
+ * r12-r15,rbp,rbx saved by C code, not touched.               
+ * 
+ * Interrupts are off on entry.
+ * Only called from user space.
+ *
+ * XXX if we had a free scratch register we could save the RSP into the stack frame
+ *      and report it properly in ps. Unfortunately we haven't.
+ *
+ * When user can change the frames always force IRET. That is because
+ * it deals with uncanonical addresses better. SYSRET has trouble
+ * with them due to bugs in both AMD and Intel CPUs.
+ */                                    
+
+/*
+ * Flow summary: execute swapgs, save the incoming %rsp in pda_oldrsp and
+ * load %rsp from pda_kernelstack, save the argument registers, then call
+ * through sys_call_table indexed by %rax.  Numbers above
+ * __NR_syscall_max store -ENOSYS as the result (badsys); tasks with
+ * trace/audit/seccomp flags set take the tracesys path.  The labels
+ * below implement the SYSRET return (ret_from_sys_call) and the IRET
+ * return (int_ret_from_sys_call), each with a loop that handles pending
+ * work flags before leaving.
+ */
+ENTRY(system_call)
+       CFI_STARTPROC   simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA     rsp,PDA_STACKOFFSET
+       CFI_REGISTER    rip,rcx
+       /*CFI_REGISTER  rflags,r11*/
+       swapgs
+       movq    %rsp,%gs:pda_oldrsp 
+       movq    %gs:pda_kernelstack,%rsp
+       /*
+        * No need to follow this irqs off/on section - it's straight
+        * and short:
+        */
+       sti                                     
+       SAVE_ARGS 8,1
+       movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
+       movq  %rcx,RIP-ARGOFFSET(%rsp)
+       CFI_REL_OFFSET rip,RIP-ARGOFFSET
+       GET_THREAD_INFO(%rcx)
+       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+       jnz tracesys
+       cmpq $__NR_syscall_max,%rax
+       ja badsys
+       movq %r10,%rcx
+       call *sys_call_table(,%rax,8)  # XXX:    rip relative
+       movq %rax,RAX-ARGOFFSET(%rsp)
+/*
+ * Syscall return path ending with SYSRET (fast path)
+ * Has incomplete stack frame and undefined top of stack. 
+ */            
+ret_from_sys_call:
+       movl $_TIF_ALLWORK_MASK,%edi
+       /* edi: flagmask */
+sysret_check:          
+       GET_THREAD_INFO(%rcx)
+       cli
+       TRACE_IRQS_OFF
+       movl threadinfo_flags(%rcx),%edx
+       andl %edi,%edx
+       jnz  sysret_careful 
+       CFI_REMEMBER_STATE
+       /*
+        * sysretq will re-enable interrupts:
+        */
+       TRACE_IRQS_ON
+       movq RIP-ARGOFFSET(%rsp),%rcx
+       CFI_REGISTER    rip,rcx
+       RESTORE_ARGS 0,-ARG_SKIP,1
+       /*CFI_REGISTER  rflags,r11*/
+       movq    %gs:pda_oldrsp,%rsp
+       swapgs
+       sysretq
+
+       CFI_RESTORE_STATE
+       /* Handle reschedules */
+       /* edx: work, edi: workmask */  
+sysret_careful:
+       bt $TIF_NEED_RESCHED,%edx
+       jnc sysret_signal
+       TRACE_IRQS_ON
+       sti
+       /* keep the work mask in %rdi across the call */
+       pushq %rdi
+       CFI_ADJUST_CFA_OFFSET 8
+       call schedule
+       popq  %rdi
+       CFI_ADJUST_CFA_OFFSET -8
+       jmp sysret_check
+
+       /* Handle a signal */ 
+sysret_signal:
+       TRACE_IRQS_ON
+       sti
+       testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+       jz    1f
+
+       /* Really a signal */
+       /* edx: work flags (arg3) */
+       leaq do_notify_resume(%rip),%rax
+       leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
+       xorl %esi,%esi # oldset -> arg2
+       call ptregscall_common
+1:     movl $_TIF_NEED_RESCHED,%edi
+       /* Use IRET because user could have changed frame. This
+          works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
+       cli
+       TRACE_IRQS_OFF
+       jmp int_with_check
+       
+badsys:
+       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+       jmp ret_from_sys_call
+
+       /* Do syscall tracing */
+tracesys:                       
+       SAVE_REST
+       /* preset the result slot to -ENOSYS before calling the tracer */
+       movq $-ENOSYS,RAX(%rsp)
+       FIXUP_TOP_OF_STACK %rdi
+       movq %rsp,%rdi
+       call syscall_trace_enter
+       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+       RESTORE_REST
+       /* re-check the syscall number after the reload; out of range yields -ENOSYS */
+       cmpq $__NR_syscall_max,%rax
+       movq $-ENOSYS,%rcx
+       cmova %rcx,%rax
+       ja  1f
+       movq %r10,%rcx  /* fixup for C */
+       call *sys_call_table(,%rax,8)
+1:     movq %rax,RAX-ARGOFFSET(%rsp)
+       /* Use IRET because user could have changed frame */
+               
+/* 
+ * Syscall return path ending with IRET.
+ * Has correct top of stack, but partial stack frame.
+ */
+       .globl int_ret_from_sys_call
+int_ret_from_sys_call:
+       cli
+       TRACE_IRQS_OFF
+       /* saved CS has no low bits set: skip the work checks */
+       testl $3,CS-ARGOFFSET(%rsp)
+       je retint_restore_args
+       movl $_TIF_ALLWORK_MASK,%edi
+       /* edi: mask to check */
+int_with_check:
+       GET_THREAD_INFO(%rcx)
+       movl threadinfo_flags(%rcx),%edx
+       andl %edi,%edx
+       jnz   int_careful
+       andl    $~TS_COMPAT,threadinfo_status(%rcx)
+       jmp   retint_swapgs
+
+       /* Either reschedule or signal or syscall exit tracking needed. */
+       /* First do a reschedule test. */
+       /* edx: work, edi: workmask */
+int_careful:
+       bt $TIF_NEED_RESCHED,%edx
+       jnc  int_very_careful
+       TRACE_IRQS_ON
+       sti
+       pushq %rdi
+       CFI_ADJUST_CFA_OFFSET 8
+       call schedule
+       popq %rdi
+       CFI_ADJUST_CFA_OFFSET -8
+       cli
+       TRACE_IRQS_OFF
+       jmp int_with_check
+
+       /* handle signals and tracing -- both require a full stack frame */
+int_very_careful:
+       TRACE_IRQS_ON
+       sti
+       SAVE_REST
+       /* Check for syscall exit trace */      
+       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+       jz int_signal
+       pushq %rdi
+       CFI_ADJUST_CFA_OFFSET 8
+       leaq 8(%rsp),%rdi       # &ptregs -> arg1       
+       call syscall_trace_leave
+       popq %rdi
+       CFI_ADJUST_CFA_OFFSET -8
+       /* drop the trace bits from the mask so they are not checked again */
+       andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
+       jmp int_restore_rest
+       
+int_signal:
+       testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+       jz 1f
+       movq %rsp,%rdi          # &ptregs -> arg1
+       xorl %esi,%esi          # oldset -> arg2
+       call do_notify_resume
+1:     movl $_TIF_NEED_RESCHED,%edi    
+int_restore_rest:
+       RESTORE_REST
+       cli
+       TRACE_IRQS_OFF
+       jmp int_with_check
+       CFI_ENDPROC
+END(system_call)
+               
+/* 
+ * Certain special system calls that need to save a complete full stack frame.
+ */                                                            
+       
+       /*
+        * Define a global stub \label that loads the address of \func into
+        * %rax and a pointer to the register frame into \arg, then jumps
+        * to ptregscall_common, which performs the actual call.
+        */
+       .macro PTREGSCALL label,func,arg
+       .globl \label
+\label:
+       leaq    \func(%rip),%rax
+       leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
+       jmp     ptregscall_common
+END(\label)
+       .endm
+
+       CFI_STARTPROC
+
+       PTREGSCALL stub_clone, sys_clone, %r8
+       PTREGSCALL stub_fork, sys_fork, %rdi
+       PTREGSCALL stub_vfork, sys_vfork, %rdi
+       PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
+       PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
+       PTREGSCALL stub_iopl, sys_iopl, %rsi
+
+/*
+ * Common tail of the PTREGSCALL stubs.  On entry %rax holds the function
+ * to call.  The return address is popped and parked in %r15 across the
+ * call, the remaining registers are saved (SAVE_REST) and the top of
+ * stack is fixed up before the call; afterwards the top-of-stack state is
+ * synced back, the registers are restored and the return address is
+ * pushed again for the final ret.
+ */
+ENTRY(ptregscall_common)
+       popq %r11
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_REGISTER rip, r11
+       SAVE_REST
+       movq %r11, %r15
+       CFI_REGISTER rip, r15
+       FIXUP_TOP_OF_STACK %r11
+       call *%rax
+       RESTORE_TOP_OF_STACK %r11
+       movq %r15, %r11
+       CFI_REGISTER rip, r11
+       RESTORE_REST
+       pushq %r11
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rip, 0
+       ret
+       CFI_ENDPROC
+END(ptregscall_common)
+       
+/*
+ * execve stub: pops its own return address (it never returns to the
+ * caller), builds a full frame with a fixed-up top of stack, calls
+ * sys_execve, stores the result in the frame's RAX slot and leaves
+ * through int_ret_from_sys_call.
+ */
+ENTRY(stub_execve)
+       CFI_STARTPROC
+       popq %r11
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_REGISTER rip, r11
+       SAVE_REST
+       FIXUP_TOP_OF_STACK %r11
+       call sys_execve
+       RESTORE_TOP_OF_STACK %r11
+       movq %rax,RAX(%rsp)
+       RESTORE_REST
+       jmp int_ret_from_sys_call
+       CFI_ENDPROC
+END(stub_execve)
+       
+/*
+ * sigreturn is special because it needs to restore all registers on return.
+ * This cannot be done with SYSRET, so use the IRET return path instead.
+ */                
+/*
+ * Discards its own return address (addq $8), builds a full frame, passes
+ * the frame pointer in %rdi to sys_rt_sigreturn, stores the result in the
+ * RAX slot and exits via int_ret_from_sys_call.
+ */
+ENTRY(stub_rt_sigreturn)
+       CFI_STARTPROC
+       addq $8, %rsp
+       CFI_ADJUST_CFA_OFFSET   -8
+       SAVE_REST
+       movq %rsp,%rdi
+       FIXUP_TOP_OF_STACK %r11
+       call sys_rt_sigreturn
+       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+       RESTORE_REST
+       jmp int_ret_from_sys_call
+       CFI_ENDPROC
+END(stub_rt_sigreturn)
+
+/*
+ * initial frame state for interrupts and exceptions
+ */
+       /*
+        * Open a "simple" signal-frame CFI procedure for an entry point
+        * whose %rsp points at frame slot \ref: the CFA is rsp+SS+8-\ref
+        * and rsp/rip are recorded at their slots relative to \ref.
+        * Emits no instructions.
+        */
+       .macro _frame ref
+       CFI_STARTPROC simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA rsp,SS+8-\ref
+       /*CFI_REL_OFFSET ss,SS-\ref*/
+       CFI_REL_OFFSET rsp,RSP-\ref
+       /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
+       /*CFI_REL_OFFSET cs,CS-\ref*/
+       CFI_REL_OFFSET rip,RIP-\ref
+       .endm
+
+/* initial frame state for interrupts (and exceptions without error code) */
+#define INTR_FRAME _frame RIP
+/* initial frame state for exceptions with error code (and interrupts with
+   vector already pushed) */
+#define XCPT_FRAME _frame ORIG_RAX
+
+/* 
+ * Interrupt entry/exit.
+ *
+ * Interrupt entry points save only callee clobbered registers in fast path.
+ *     
+ * Entry runs with interrupts off.     
+ */ 
+
+/* 0(%rsp): interrupt number */ 
+       /*
+        * Common interrupt prologue: save the argument registers, pass a
+        * pointer to the frame in %rdi, set up %rbp as a frame pointer,
+        * execute swapgs when the saved CS has either low bit set, bump
+        * pda_irqcount and conditionally load %rsp from pda_irqstackptr
+        * (cmoveq on the flags left by the incl), then call \func.
+        */
+       .macro interrupt func
+       cld
+       SAVE_ARGS
+       leaq -ARGOFFSET(%rsp),%rdi      # arg1 for handler
+       pushq %rbp
+       CFI_ADJUST_CFA_OFFSET   8
+       CFI_REL_OFFSET          rbp, 0
+       movq %rsp,%rbp
+       CFI_DEF_CFA_REGISTER    rbp
+       testl $3,CS(%rdi)
+       je 1f
+       swapgs  
+       /* irqcount is used to check if a CPU is already on an interrupt
+          stack or not. While this is essentially redundant with preempt_count
+          it is a little cheaper to use a separate counter in the PDA
+          (short of moving irq_enter into assembly, which would be too
+           much work) */
+1:     incl    %gs:pda_irqcount
+       cmoveq %gs:pda_irqstackptr,%rsp
+       push    %rbp                    # backlink for old unwinder
+       /*
+        * We entered an interrupt context - irqs are off:
+        */
+       TRACE_IRQS_OFF
+       call \func
+       .endm
+
+/*
+ * Runs do_IRQ through the interrupt macro and then the shared return
+ * path.  ret_from_intr decrements pda_irqcount and restores %rsp/%rbp
+ * with leaveq.  When the saved CS has no low bits set the exit goes via
+ * retint_kernel; otherwise the work flags are checked in a loop
+ * (reschedule in retint_careful, notification in retint_signal) before
+ * the frame is restored and iretq is executed.  A fault on the iretq is
+ * routed through __ex_table to bad_iret, which jumps to do_exit with 11
+ * in %rdi.
+ */
+ENTRY(common_interrupt)
+       XCPT_FRAME
+       interrupt do_IRQ
+       /* 0(%rsp): oldrsp-ARGOFFSET */
+ret_from_intr:
+       cli     
+       TRACE_IRQS_OFF
+       decl %gs:pda_irqcount
+       leaveq
+       CFI_DEF_CFA_REGISTER    rsp
+       CFI_ADJUST_CFA_OFFSET   -8
+exit_intr:
+       GET_THREAD_INFO(%rcx)
+       testl $3,CS-ARGOFFSET(%rsp)
+       je retint_kernel
+       
+       /* Interrupt came from user space */
+       /*
+        * Has a correct top of stack, but a partial stack frame
+        * %rcx: thread info. Interrupts off.
+        */             
+retint_with_reschedule:
+       movl $_TIF_WORK_MASK,%edi
+retint_check:
+       movl threadinfo_flags(%rcx),%edx
+       andl %edi,%edx
+       CFI_REMEMBER_STATE
+       jnz  retint_careful
+retint_swapgs:         
+       /*
+        * The iretq could re-enable interrupts:
+        */
+       cli
+       TRACE_IRQS_IRETQ
+       swapgs 
+       jmp restore_args
+
+retint_restore_args:                           
+       cli
+       /*
+        * The iretq could re-enable interrupts:
+        */
+       TRACE_IRQS_IRETQ
+restore_args:
+       RESTORE_ARGS 0,8,0                                              
+iret_label:    
+       iretq
+
+       /* exception table entry: a fault at iret_label continues at bad_iret */
+       .section __ex_table,"a"
+       .quad iret_label,bad_iret       
+       .previous
+       .section .fixup,"ax"
+       /* force a signal here? this matches i386 behaviour */
+       /* running with kernel gs */
+bad_iret:
+       movq $11,%rdi   /* SIGSEGV */
+       TRACE_IRQS_ON
+       sti
+       jmp do_exit                     
+       .previous       
+       
+       /* edi: workmask, edx: work */
+retint_careful:
+       CFI_RESTORE_STATE
+       bt    $TIF_NEED_RESCHED,%edx
+       jnc   retint_signal
+       TRACE_IRQS_ON
+       sti
+       pushq %rdi
+       CFI_ADJUST_CFA_OFFSET   8
+       call  schedule
+       popq %rdi               
+       CFI_ADJUST_CFA_OFFSET   -8
+       GET_THREAD_INFO(%rcx)
+       cli
+       TRACE_IRQS_OFF
+       jmp retint_check
+       
+retint_signal:
+       testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+       jz    retint_swapgs
+       TRACE_IRQS_ON
+       sti
+       SAVE_REST
+       movq $-1,ORIG_RAX(%rsp)                         
+       xorl %esi,%esi          # oldset
+       movq %rsp,%rdi          # &pt_regs
+       call do_notify_resume
+       RESTORE_REST
+       cli
+       TRACE_IRQS_OFF
+       /* after the notification only a pending reschedule is re-checked */
+       movl $_TIF_NEED_RESCHED,%edi
+       GET_THREAD_INFO(%rcx)
+       jmp retint_check
+
+#ifdef CONFIG_PREEMPT
+       /* Returning to kernel space. Check if we need preemption */
+       /* rcx:  threadinfo. interrupts off. */
+ENTRY(retint_kernel)
+       cmpl $0,threadinfo_preempt_count(%rcx)
+       jnz  retint_restore_args
+       bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
+       jnc  retint_restore_args
+       bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
+       jnc  retint_restore_args
+       call preempt_schedule_irq
+       jmp exit_intr
+#endif 
+
+       CFI_ENDPROC
+END(common_interrupt)
+       
+/*
+ * APIC interrupts.
+ *
+ * apicinterrupt num,func -- body of an APIC interrupt entry stub.
+ *   num:  vector number; its one's complement is pushed on the stack
+ *   func: handler name handed on to the `interrupt` macro
+ * After the `interrupt \func` expansion the stub jumps to ret_from_intr.
+ * INTR_FRAME, `interrupt` and ret_from_intr are defined outside this macro.
+ */
+       .macro apicinterrupt num,func
+       INTR_FRAME
+       pushq $~(\num)                  /* NOTE(review): presumably fills the orig_rax slot - confirm */
+       CFI_ADJUST_CFA_OFFSET 8         /* account for the 8 bytes just pushed */
+       interrupt \func
+       jmp ret_from_intr
+       CFI_ENDPROC
+       .endm
+
+/*
+ * APIC interrupt entry points.  Each one is a single apicinterrupt
+ * expansion that pairs a vector constant with its handler.
+ */
+ENTRY(thermal_interrupt)
+       apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
+END(thermal_interrupt)
+
+ENTRY(threshold_interrupt)
+       apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
+END(threshold_interrupt)
+
+/* The entry points up to the matching #endif are built only with CONFIG_SMP. */
+#ifdef CONFIG_SMP      
+ENTRY(reschedule_interrupt)
+       apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
+END(reschedule_interrupt)
+
+       /*
+        * INVALIDATE_ENTRY num: define invalidate_interrupt<num> on vector
+        * INVALIDATE_TLB_VECTOR_START+num.  All instances share the
+        * smp_invalidate_interrupt handler.
+        */
+       .macro INVALIDATE_ENTRY num
+ENTRY(invalidate_interrupt\num)
+       apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt 
+END(invalidate_interrupt\num)
+       .endm
+
+       /* Eight entry points: invalidate_interrupt0 .. invalidate_interrupt7 */
+       INVALIDATE_ENTRY 0
+       INVALIDATE_ENTRY 1
+       INVALIDATE_ENTRY 2
+       INVALIDATE_ENTRY 3
+       INVALIDATE_ENTRY 4
+       INVALIDATE_ENTRY 5
+       INVALIDATE_ENTRY 6
+       INVALIDATE_ENTRY 7
+
+ENTRY(call_function_interrupt)
+       apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
+END(call_function_interrupt)
+ENTRY(irq_move_cleanup_interrupt)
+       apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
+END(irq_move_cleanup_interrupt)
+#endif
+
+ENTRY(apic_timer_interrupt)
+       apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
+END(apic_timer_interrupt)
+
+ENTRY(error_interrupt)
+       apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
+END(error_interrupt)
+
+ENTRY(spurious_interrupt)
+       apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
+END(spurious_interrupt)
+                               
+/*
+ * Exception entry points.
+ *
+ * zeroentry sym -- entry stub for exceptions that need a dummy error code.
+ * Pushes 0 as the error code, saves %rax on the stack, loads the address
+ * of handler \sym into %rax and continues in error_entry, which expects
+ * exactly that layout (error code on the stack, handler in %rax).
+ */
+       .macro zeroentry sym
+       INTR_FRAME
+       pushq $0        /* push error code/oldrax */ 
+       CFI_ADJUST_CFA_OFFSET 8
+       pushq %rax      /* push real oldrax to the rdi slot */ 
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rax,0
+       leaq  \sym(%rip),%rax   /* handler address for error_entry */
+       jmp error_entry
+       CFI_ENDPROC
+       .endm   
+
+       /*
+        * errorentry sym -- like zeroentry, but pushes no dummy error code:
+        * the error code is expected to be on the stack already (presumably
+        * pushed by the CPU - confirm).  Saves %rax, loads the address of
+        * handler \sym into %rax and continues in error_entry.
+        */
+       .macro errorentry sym
+       XCPT_FRAME
+       pushq %rax                      /* save rax; error_entry reads it back from this slot */
+       CFI_ADJUST_CFA_OFFSET 8
+       CFI_REL_OFFSET rax,0
+       leaq  \sym(%rip),%rax           /* handler address for error_entry */
+       jmp error_entry
+       CFI_ENDPROC
+       .endm
+
+       /* error code is on the stack already */
+       /* handle NMI like exceptions that can happen everywhere */
+       /*
+        * paranoidentry sym, ist=0, irqtrace=1
+        *   sym:      handler, called with rdi = &pt_regs and rsi = error code
+        *   ist:      when non-zero, the TSS IST slot (\ist - 1) is lowered by
+        *             EXCEPTION_STKSZ for the duration of the call
+        *   irqtrace: when non-zero, TRACE_IRQS_OFF is emitted after the call
+        * Leaves ebx as the "no swapgs" flag that paranoidexit consumes:
+        * 1 = swapgs was not executed here, 0 = it was.
+        */
+       .macro paranoidentry sym, ist=0, irqtrace=1
+       SAVE_ALL
+       cld
+       movl $1,%ebx                    /* default: no swapgs needed on exit */
+       movl  $MSR_GS_BASE,%ecx
+       rdmsr                           /* edx:eax = MSR_GS_BASE */
+       testl %edx,%edx
+       js    1f                        /* high half negative: skip swapgs (presumably already the kernel GS base - confirm) */
+       swapgs
+       xorl  %ebx,%ebx                 /* swapgs done here, so exit must undo it */
+1:
+       .if \ist
+       movq    %gs:pda_data_offset, %rbp       /* base used to address per_cpu__init_tss below */
+       .endif
+       movq %rsp,%rdi                  /* arg1: &pt_regs */
+       movq ORIG_RAX(%rsp),%rsi        /* arg2: error code */
+       movq $-1,ORIG_RAX(%rsp)
+       .if \ist
+       subq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+       .endif
+       call \sym
+       .if \ist
+       addq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)    /* undo the adjustment made before the call */
+       .endif
+       cli
+       .if \irqtrace
+       TRACE_IRQS_OFF
+       .endif
+       .endm
+
+       /*
+        * "Paranoid" exit path from exception stack.
+        * Paranoid because this is used by NMIs and cannot take
+        * any kernel state for granted.
+        * We don't do kernel preemption checks here, because only
+        * NMI should be common and it does not enable IRQs and
+        * cannot get reschedule ticks.
+        *
+        * "trace" is 0 for the NMI handler only, because irq-tracing
+        * is fundamentally NMI-unsafe. (we cannot change the soft and
+        * hard flags at once, atomically)
+        *
+        * The value of "trace" is appended to every label, so the default
+        * expansion defines paranoid_exit1 and "paranoidexit 0" defines
+        * paranoid_exit0.
+        */
+       .macro paranoidexit trace=1
+       /* ebx: no swapgs flag */
+paranoid_exit\trace:
+       testl %ebx,%ebx                         /* swapgs needed? */
+       jnz paranoid_restore\trace              /* flag set: restore without swapgs */
+       testl $3,CS(%rsp)                       /* low two bits of the saved CS set? */
+       jnz   paranoid_userspace\trace          /* yes: check for pending work first */
+paranoid_swapgs\trace:
+       .if \trace
+       TRACE_IRQS_IRETQ 0
+       .endif
+       swapgs
+paranoid_restore\trace:
+       RESTORE_ALL 8
+       iretq
+paranoid_userspace\trace:
+       GET_THREAD_INFO(%rcx)
+       movl threadinfo_flags(%rcx),%ebx
+       andl $_TIF_WORK_MASK,%ebx               /* ebx = pending work bits */
+       jz paranoid_swapgs\trace                /* nothing pending: exit */
+       movq %rsp,%rdi                  /* &pt_regs */
+       call sync_regs
+       movq %rax,%rsp                  /* switch stack for scheduling */
+       testl $_TIF_NEED_RESCHED,%ebx
+       jnz paranoid_schedule\trace
+       movl %ebx,%edx                  /* arg3: thread flags */
+       .if \trace
+       TRACE_IRQS_ON
+       .endif
+       sti
+       xorl %esi,%esi                  /* arg2: oldset */
+       movq %rsp,%rdi                  /* arg1: &pt_regs */
+       call do_notify_resume
+       cli
+       .if \trace
+       TRACE_IRQS_OFF
+       .endif
+       jmp paranoid_userspace\trace    /* re-read the flags and check again */
+paranoid_schedule\trace:
+       .if \trace
+       TRACE_IRQS_ON
+       .endif
+       sti
+       call schedule
+       cli
+       .if \trace
+       TRACE_IRQS_OFF
+       .endif
+       jmp paranoid_userspace\trace    /* re-read the flags and check again */
+       CFI_ENDPROC
+       .endm
+
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack
+ * and the exception handler in %rax.  
+ *
+ * The remaining registers are stored into the frame by hand, the handler
+ * is called with rdi = &pt_regs and rsi = error code, and error_exit then
+ * leaves through retint_kernel, retint_careful or iret_label.
+ */                                            
+KPROBE_ENTRY(error_entry)
+       _frame RDI
+       CFI_REL_OFFSET rax,0
+       /* rdi slot contains rax, oldrax contains error code */
+       cld     
+       subq  $14*8,%rsp                /* room for the 14 slots filled in below */
+       CFI_ADJUST_CFA_OFFSET   (14*8)
+       movq %rsi,13*8(%rsp)
+       CFI_REL_OFFSET  rsi,RSI
+       movq 14*8(%rsp),%rsi    /* load rax from rdi slot */
+       CFI_REGISTER    rax,rsi
+       movq %rdx,12*8(%rsp)
+       CFI_REL_OFFSET  rdx,RDX
+       movq %rcx,11*8(%rsp)
+       CFI_REL_OFFSET  rcx,RCX
+       movq %rsi,10*8(%rsp)    /* store rax */ 
+       CFI_REL_OFFSET  rax,RAX
+       movq %r8, 9*8(%rsp)
+       CFI_REL_OFFSET  r8,R8
+       movq %r9, 8*8(%rsp)
+       CFI_REL_OFFSET  r9,R9
+       movq %r10,7*8(%rsp)
+       CFI_REL_OFFSET  r10,R10
+       movq %r11,6*8(%rsp)
+       CFI_REL_OFFSET  r11,R11
+       movq %rbx,5*8(%rsp) 
+       CFI_REL_OFFSET  rbx,RBX
+       movq %rbp,4*8(%rsp) 
+       CFI_REL_OFFSET  rbp,RBP
+       movq %r12,3*8(%rsp) 
+       CFI_REL_OFFSET  r12,R12
+       movq %r13,2*8(%rsp) 
+       CFI_REL_OFFSET  r13,R13
+       movq %r14,1*8(%rsp) 
+       CFI_REL_OFFSET  r14,R14
+       movq %r15,(%rsp) 
+       CFI_REL_OFFSET  r15,R15
+       xorl %ebx,%ebx                  /* clear the no-swapgs flag */
+       testl $3,CS(%rsp)
+       je  error_kernelspace           /* low CS bits clear: see error_kernelspace */
+error_swapgs:  
+       swapgs
+error_sti:     
+       movq %rdi,RDI(%rsp)             /* rdi is stored last; its slot held rax until now */
+       CFI_REL_OFFSET  rdi,RDI
+       movq %rsp,%rdi                  /* arg1: &pt_regs */
+       movq ORIG_RAX(%rsp),%rsi        /* get error code */ 
+       movq $-1,ORIG_RAX(%rsp)
+       call *%rax                      /* handler address supplied by the entry stub */
+       /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */     
+error_exit:            
+       movl %ebx,%eax                  /* copy the flag to eax before RESTORE_REST */
+       RESTORE_REST
+       cli
+       TRACE_IRQS_OFF
+       GET_THREAD_INFO(%rcx)   
+       testl %eax,%eax
+       jne  retint_kernel              /* flag set: no swapgs on the way out */
+       movl  threadinfo_flags(%rcx),%edx
+       movl  $_TIF_WORK_MASK,%edi
+       andl  %edi,%edx                 /* edx = pending work, edi = work mask */
+       jnz  retint_careful
+       /*
+        * The iret might restore flags:
+        */
+       TRACE_IRQS_IRETQ
+       swapgs 
+       RESTORE_ARGS 0,8,0                                              
+       jmp iret_label
+       CFI_ENDPROC
+
+error_kernelspace:
+       incl %ebx                       /* ebx = 1: no swapgs on exit */
+       /* There are two places in the kernel that can potentially fault with
+          usergs. Handle them here. The exception handlers after
+          iret run with kernel gs again, so don't set the user space flag.
+          B stepping K8s sometimes report a truncated RIP for IRET
+          exceptions returning to compat mode. Check for these here too. */
+       leaq iret_label(%rip),%rbp
+       cmpq %rbp,RIP(%rsp) 
+       je   error_swapgs               /* faulted at iret_label */
+       movl %ebp,%ebp  /* zero extend */
+       cmpq %rbp,RIP(%rsp) 
+       je   error_swapgs               /* RIP equals the low 32 bits of iret_label */
+       cmpq $gs_change,RIP(%rsp)
+        je   error_swapgs              /* faulted at gs_change in load_gs_index */
+       jmp  error_sti                  /* ordinary kernel fault: no swapgs */
+KPROBE_END(error_entry)
+       
+       /* Reload gs selector with exception handling */
+       /* edi:  new selector */ 
+       /*
+        * The selector load at gs_change sits between two swapgs
+        * instructions and runs with interrupts disabled.  The __ex_table
+        * entry below maps gs_change to bad_gs, which loads a zero selector
+        * and resumes at label 2.
+        */
+ENTRY(load_gs_index)
+       CFI_STARTPROC
+       pushf                   /* save flags; popf below restores them */
+       CFI_ADJUST_CFA_OFFSET 8
+       cli
+        swapgs
+gs_change:     
+        movl %edi,%gs   
+2:     mfence          /* workaround */
+       swapgs
+        popf
+       CFI_ADJUST_CFA_OFFSET -8
+        ret
+       CFI_ENDPROC
+ENDPROC(load_gs_index)
+       
+        /* exception table entry: gs_change -> bad_gs */
+        .section __ex_table,"a"
+        .align 8
+        .quad gs_change,bad_gs
+        .previous
+        .section .fixup,"ax"
+       /* running with kernelgs */
+bad_gs: 
+       swapgs                  /* switch back to user gs */
+       xorl %eax,%eax
+        movl %eax,%gs          /* load a zero selector instead */
+        jmp  2b                /* continue after the original load */
+        .previous       
+       
+/*
+ * Create a kernel thread.
+ *
+ * C extern interface:
+ *     extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+ *
+ * asm input arguments:
+ *     rdi: fn, rsi: arg, rdx: flags
+ *
+ * Builds a fake stack frame whose return address is child_rip, calls
+ * do_fork and returns do_fork's result to the caller.
+ */
+ENTRY(kernel_thread)
+       CFI_STARTPROC
+       FAKE_STACK_FRAME $child_rip
+       SAVE_ALL
+
+       # rdi: flags, rsi: usp, rdx: will be &pt_regs
+       movq %rdx,%rdi
+       orq  kernel_thread_flags(%rip),%rdi     /* OR in kernel_thread_flags */
+       movq $-1, %rsi                          /* usp = -1 */
+       movq %rsp, %rdx                         /* &pt_regs: the frame built above */
+
+       xorl %r8d,%r8d
+       xorl %r9d,%r9d
+       
+       # clone now
+       call do_fork
+       movq %rax,RAX(%rsp)                     /* do_fork result into the saved rax slot */
+       xorl %edi,%edi
+
+       /*
+        * It isn't worth checking for a reschedule here, so internally
+        * to the x86_64 port you can rely on kernel_thread() not
+        * rescheduling the child before returning.  This avoids the need
+        * for hacks, for example to fork off the per-CPU idle tasks.
+         * [Hopefully no generic code relies on the reschedule -AK]    
+        */
+       RESTORE_ALL
+       UNFAKE_STACK_FRAME
+       ret
+       CFI_ENDPROC
+ENDPROC(kernel_thread)
+       
+/*
+ * Start address of a thread created by kernel_thread(), which passes
+ * $child_rip to FAKE_STACK_FRAME.  Calls fn(arg) and then do_exit(0).
+ */
+child_rip:
+       pushq $0                # fake return address
+       CFI_STARTPROC
+       /*
+        * Here we are in the child and the registers are set as they were
+        * at kernel_thread() invocation in the parent.
+        */
+       movq %rdi, %rax         /* rax = fn */
+       movq %rsi, %rdi         /* first argument = arg */
+       call *%rax
+       # exit
+       xorl %edi, %edi         /* exit code 0 */
+       call do_exit
+       CFI_ENDPROC
+ENDPROC(child_rip)
+
+/*
+ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
+ *
+ * C extern interface:
+ *      extern long execve(char *name, char **argv, char **envp)
+ *
+ * asm input arguments:
+ *     rdi: name, rsi: argv, rdx: envp
+ *
+ * We want to fall back into:
+ *     extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
+ *
+ * do_sys_execve asm fallback arguments:
+ *     rdi: name, rsi: argv, rdx: envp, fake frame on the stack
+ *
+ * A zero result from sys_execve leaves through int_ret_from_sys_call;
+ * any other result is returned to the caller.
+ */
+ENTRY(kernel_execve)
+       CFI_STARTPROC
+       FAKE_STACK_FRAME $0
+       SAVE_ALL        
+       call sys_execve
+       movq %rax, RAX(%rsp)            /* result into the saved rax slot */
+       RESTORE_REST
+       testq %rax,%rax
+       je int_ret_from_sys_call        /* result 0: leave via int_ret_from_sys_call */
+       RESTORE_ARGS
+       UNFAKE_STACK_FRAME
+       ret                             /* non-zero result: plain return to the caller */
+       CFI_ENDPROC
+ENDPROC(kernel_execve)
+
+/*
+ * Exception stubs.  errorentry is used where an error code is already on
+ * the stack; zeroentry pushes a dummy 0 error code first.  Both continue
+ * in error_entry with the named handler.
+ */
+KPROBE_ENTRY(page_fault)
+       errorentry do_page_fault
+KPROBE_END(page_fault)
+
+ENTRY(coprocessor_error)
+       zeroentry do_coprocessor_error
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+       zeroentry do_simd_coprocessor_error     
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+       zeroentry math_state_restore
+END(device_not_available)
+
+       /* runs on exception stack */
+       /*
+        * Pushes a dummy 0 error code, then paranoidentry with the
+        * DEBUG_STACK IST slot.  The "paranoidexit" here uses the default
+        * trace=1 and therefore defines the paranoid_exit1 labels that the
+        * other paranoid stubs jump to.
+        */
+KPROBE_ENTRY(debug)
+       INTR_FRAME
+       pushq $0
+       CFI_ADJUST_CFA_OFFSET 8         
+       paranoidentry do_debug, DEBUG_STACK
+       paranoidexit
+KPROBE_END(debug)
+
+       /* runs on exception stack */   
+       /*
+        * Pushes -1, then paranoidentry with ist=0 and irqtrace=0.  With
+        * CONFIG_TRACE_IRQFLAGS the stub gets its own exit path without
+        * irq tracing ("paranoidexit 0"); otherwise it shares paranoid_exit1.
+        */
+KPROBE_ENTRY(nmi)
+       INTR_FRAME
+       pushq $-1
+       CFI_ADJUST_CFA_OFFSET 8
+       paranoidentry do_nmi, 0, 0
+#ifdef CONFIG_TRACE_IRQFLAGS
+       paranoidexit 0
+#else
+       jmp paranoid_exit1
+       CFI_ENDPROC
+#endif
+KPROBE_END(nmi)
+
+       /* Dummy 0 error code, DEBUG_STACK IST slot, shared paranoid_exit1 exit. */
+KPROBE_ENTRY(int3)
+       INTR_FRAME
+       pushq $0
+       CFI_ADJUST_CFA_OFFSET 8
+       paranoidentry do_int3, DEBUG_STACK
+       jmp paranoid_exit1
+       CFI_ENDPROC
+KPROBE_END(int3)
+
+/* Stubs built with zeroentry: a dummy 0 error code is pushed for each. */
+ENTRY(overflow)
+       zeroentry do_overflow
+END(overflow)
+
+ENTRY(bounds)
+       zeroentry do_bounds
+END(bounds)
+
+ENTRY(invalid_op)
+       zeroentry do_invalid_op 
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+       zeroentry do_coprocessor_segment_overrun
+END(coprocessor_segment_overrun)
+
+ENTRY(reserved)
+       zeroentry do_reserved
+END(reserved)
+
+       /* runs on exception stack */
+       /* No dummy error code is pushed; exits through the shared paranoid_exit1. */
+ENTRY(double_fault)
+       XCPT_FRAME
+       paranoidentry do_double_fault
+       jmp paranoid_exit1
+       CFI_ENDPROC
+END(double_fault)
+
+ENTRY(invalid_TSS)
+       errorentry do_invalid_TSS
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+       errorentry do_segment_not_present
+END(segment_not_present)
+
+       /* runs on exception stack */
+       /* No dummy error code is pushed; exits through the shared paranoid_exit1. */
+ENTRY(stack_segment)
+       XCPT_FRAME
+       paranoidentry do_stack_segment
+       jmp paranoid_exit1
+       CFI_ENDPROC
+END(stack_segment)
+
+KPROBE_ENTRY(general_protection)
+       errorentry do_general_protection
+KPROBE_END(general_protection)
+
+ENTRY(alignment_check)
+       errorentry do_alignment_check
+END(alignment_check)
+
+ENTRY(divide_error)
+       zeroentry do_divide_error
+END(divide_error)
+
+ENTRY(spurious_interrupt_bug)
+       zeroentry do_spurious_interrupt_bug
+END(spurious_interrupt_bug)
+
+#ifdef CONFIG_X86_MCE
+       /* runs on exception stack */
+       /*
+        * Built only with CONFIG_X86_MCE.  Pushes a dummy 0 error code,
+        * uses paranoidentry with the default ist=0 and exits through the
+        * shared paranoid_exit1.
+        */
+ENTRY(machine_check)
+       INTR_FRAME
+       pushq $0
+       CFI_ADJUST_CFA_OFFSET 8 
+       paranoidentry do_machine_check
+       jmp paranoid_exit1
+       CFI_ENDPROC
+END(machine_check)
+#endif
+
+/*
+ * Call softirq on interrupt stack. Interrupts are off.
+ *
+ * Runs __do_softirq with %rsp switched to %gs:pda_irqstackptr when the
+ * increment of %gs:pda_irqcount yields zero; otherwise the current stack
+ * is kept.  %rbp holds the old stack pointer across the call and leaveq
+ * restores it.
+ */
+ENTRY(call_softirq)
+       CFI_STARTPROC
+       push %rbp
+       CFI_ADJUST_CFA_OFFSET   8
+       CFI_REL_OFFSET rbp,0
+       mov  %rsp,%rbp
+       CFI_DEF_CFA_REGISTER rbp
+       incl %gs:pda_irqcount
+       cmove %gs:pda_irqstackptr,%rsp  /* taken only if the incl above set ZF */
+       push  %rbp                      # backlink for old unwinder
+       call __do_softirq
+       leaveq                          /* rsp = rbp, then pop the saved rbp */
+       CFI_DEF_CFA_REGISTER    rsp
+       CFI_ADJUST_CFA_OFFSET   -8
+       decl %gs:pda_irqcount           /* undo the increment above */
+       ret
+       CFI_ENDPROC
+ENDPROC(call_softirq)
+
+/*
+ * Entry point that does no work: loads -ENOSYS into %eax and returns
+ * immediately with sysret.
+ *
+ * NOTE(review): this is opened with KPROBE_ENTRY but closed with ENDPROC,
+ * whereas the other KPROBE_ENTRY users in this file are closed with
+ * KPROBE_END - confirm the pairing is intentional.
+ */
+KPROBE_ENTRY(ignore_sysret)
+       CFI_STARTPROC
+       mov $-ENOSYS,%eax
+       sysret
+       CFI_ENDPROC
+ENDPROC(ignore_sysret)
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
new file mode 100644 (file)
index 0000000..47496a4
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch probe layer.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include <asm/genapic.h>
+
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+
+/*
+ * which logical CPU number maps to which CPU (physical APIC ID)
+ * Every entry starts out as BAD_APICID.
+ */
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
+                                       = { [0 ... NR_CPUS-1] = BAD_APICID };
+EXPORT_SYMBOL(x86_cpu_to_apicid);
+
+/* Logical APIC ID per CPU number; every entry starts out as BAD_APICID. */
+u8 x86_cpu_to_log_apicid[NR_CPUS]      = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+/* APIC driver currently in use; starts as apic_flat, see setup_apic_routing(). */
+struct genapic __read_mostly *genapic = &apic_flat;
+
+/*
+ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
+ *
+ * Physical flat mode is selected when the ACPI FADT requests it or when
+ * more than 8 CPUs are possible; otherwise logical flat mode is used.
+ * The choice is stored in genapic and reported with printk.
+ */
+void __init setup_apic_routing(void)
+{
+       int fadt_wants_physical = 0;
+
+#ifdef CONFIG_ACPI
+       /*
+        * Quirk: some x86_64 machines can only use physical APIC mode
+        * regardless of how many processors are present (x86_64 ES7000
+        * is an example).
+        */
+       fadt_wants_physical =
+               acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+               (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL);
+#endif
+
+       if (fadt_wants_physical || cpus_weight(cpu_possible_map) > 8)
+               genapic = &apic_physflat;
+       else
+               genapic = &apic_flat;
+
+       printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+}
+
+/*
+ * Same for both flat and physical.
+ *
+ * Send @vector to the calling CPU using the APIC_DEST_SELF shortcut.
+ */
+
+void send_IPI_self(int vector)
+{
+       __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
new file mode 100644 (file)
index 0000000..ecb01ee
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Flat APIC subarch code.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include <asm/genapic.h>
+
+/* target_cpus hook for flat mode: returns cpu_online_map. */
+static cpumask_t flat_target_cpus(void)
+{
+       return cpu_online_map;
+}
+
+/*
+ * vector_allocation_domain hook for flat mode.  @cpu is not used: the
+ * returned mask always has APIC_ALL_CPUS in its first word.
+ */
+static cpumask_t flat_vector_allocation_domain(int cpu)
+{
+       /* Careful. Some cpus do not strictly honor the set of cpus
+        * specified in the interrupt destination when using lowest
+        * priority interrupt delivery mode.
+        *
+        * In particular there was a hyperthreading cpu observed to
+        * deliver interrupts to the wrong hyperthread when only one
+        * hyperthread was specified in the interrupt destination.
+        */
+       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+       return domain;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116).  So here it goes...
+ *
+ * The current CPU gets the one-hot logical ID (1 << cpu number).  It is
+ * recorded in x86_cpu_to_log_apicid[] and written into the LDR after the
+ * DFR has been set to APIC_DFR_FLAT.
+ *
+ * NOTE(review): x86_cpu_to_log_apicid[] holds u8 entries, so the stored ID
+ * is truncated to 0 for CPU numbers >= 8.  apic_physflat installs this
+ * function as its init_apic_ldr hook too - confirm that is harmless there.
+ */
+static void flat_init_apic_ldr(void)
+{
+       unsigned long val;
+       unsigned long num, id;
+
+       num = smp_processor_id();
+       id = 1UL << num;
+       x86_cpu_to_log_apicid[num] = id;
+       apic_write(APIC_DFR, APIC_DFR_FLAT);
+       val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;     /* keep the bits outside the logical ID field */
+       val |= SET_APIC_LOGICAL_ID(id);
+       apic_write(APIC_LDR, val);
+}
+
+/*
+ * Send @vector to the CPUs in @cpumask with a single logical-mode IPI.
+ * Only the first word of the mask is used as the destination field.
+ * Local interrupts are disabled while the IPI is issued.
+ */
+static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
+{
+       unsigned long irq_state;
+       unsigned long dest_bits;
+
+       dest_bits = cpus_addr(cpumask)[0];
+
+       local_irq_save(irq_state);
+       __send_IPI_dest_field(dest_bits, vector, APIC_DEST_LOGICAL);
+       local_irq_restore(irq_state);
+}
+
+/*
+ * Send @vector to every online CPU except the caller.
+ *
+ * With CONFIG_HOTPLUG_CPU, or for NMI_VECTOR, the online map minus the
+ * current CPU is targeted explicitly.  Otherwise the APIC_DEST_ALLBUT
+ * shortcut is used, provided more than one CPU is online.
+ */
+static void flat_send_IPI_allbutself(int vector)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+       int use_mask = 1;
+#else
+       int use_mask = (vector == NMI_VECTOR);
+#endif
+
+       if (use_mask) {
+               cpumask_t others = cpu_online_map;
+
+               cpu_clear(smp_processor_id(), others);
+
+               if (!cpus_empty(others))
+                       flat_send_IPI_mask(others, vector);
+               return;
+       }
+
+       if (num_online_cpus() > 1)
+               __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
+}
+
+/*
+ * Send @vector to every online CPU, the caller included.  NMI_VECTOR is
+ * sent through the online mask; everything else uses the
+ * APIC_DEST_ALLINC shortcut.
+ */
+static void flat_send_IPI_all(int vector)
+{
+       if (vector != NMI_VECTOR) {
+               __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+               return;
+       }
+
+       flat_send_IPI_mask(cpu_online_map, vector);
+}
+
+/*
+ * Report whether the APIC ID read from this CPU's APIC_ID register is
+ * set in phys_cpu_present_map.
+ */
+static int flat_apic_id_registered(void)
+{
+       return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
+}
+
+/*
+ * Convert @cpumask to a flat-mode destination: the first word of the
+ * mask, limited to the APIC_ALL_CPUS bits.
+ */
+static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+       return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
+}
+
+/*
+ * Return hard_smp_processor_id() shifted right by @index_msb bits.
+ * Used as the phys_pkg_id hook of both apic_flat and apic_physflat.
+ */
+static unsigned int phys_pkg_id(int index_msb)
+{
+       return hard_smp_processor_id() >> index_msb;
+}
+
+/*
+ * APIC driver for flat mode: lowest-priority delivery with logical
+ * destinations, implemented by the flat_* helpers above.
+ */
+struct genapic apic_flat =  {
+       .name = "flat",
+       .int_delivery_mode = dest_LowestPrio,
+       .int_dest_mode = (APIC_DEST_LOGICAL != 0),
+       .target_cpus = flat_target_cpus,
+       .vector_allocation_domain = flat_vector_allocation_domain,
+       .apic_id_registered = flat_apic_id_registered,
+       .init_apic_ldr = flat_init_apic_ldr,
+       .send_IPI_all = flat_send_IPI_all,
+       .send_IPI_allbutself = flat_send_IPI_allbutself,
+       .send_IPI_mask = flat_send_IPI_mask,
+       .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+       .phys_pkg_id = phys_pkg_id,
+};
+
+/*
+ * Physflat mode is used when there are more than 8 CPUs on an AMD system.
+ * We cannot use logical delivery in this case because the mask
+ * overflows, so use physical mode.
+ */
+
+/* target_cpus hook for physflat mode: returns cpu_online_map. */
+static cpumask_t physflat_target_cpus(void)
+{
+       return cpu_online_map;
+}
+
+/*
+ * vector_allocation_domain hook for physflat mode: the domain of @cpu is
+ * a mask containing only @cpu.
+ */
+static cpumask_t physflat_vector_allocation_domain(int cpu)
+{
+       cpumask_t only_this_cpu = CPU_MASK_NONE;
+
+       cpu_set(cpu, only_this_cpu);
+
+       return only_this_cpu;
+}
+
+
+/* Send @vector to the CPUs in @cpumask by way of send_IPI_mask_sequence(). */
+static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
+{
+       send_IPI_mask_sequence(cpumask, vector);
+}
+
+/*
+ * Send @vector to every online CPU except the caller.  The mask is passed
+ * on even when it turns out to be empty.
+ */
+static void physflat_send_IPI_allbutself(int vector)
+{
+       cpumask_t others = cpu_online_map;
+       int self = smp_processor_id();
+
+       cpu_clear(self, others);
+       physflat_send_IPI_mask(others, vector);
+}
+
+/* Send @vector to every CPU in cpu_online_map, the caller included. */
+static void physflat_send_IPI_all(int vector)
+{
+       physflat_send_IPI_mask(cpu_online_map, vector);
+}
+
+/*
+ * Convert @cpumask to a physical destination.  Returns the
+ * x86_cpu_to_apicid[] entry of the first CPU in the mask, or BAD_APICID
+ * when first_cpu() yields no index below NR_CPUS.
+ */
+static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       int first = first_cpu(cpumask);
+
+       if ((unsigned)first >= NR_CPUS)
+               return BAD_APICID;
+
+       return x86_cpu_to_apicid[first];
+}
+
+/*
+ * APIC driver for physical flat mode: fixed delivery with physical
+ * destinations.  It reuses flat_apic_id_registered, flat_init_apic_ldr
+ * and phys_pkg_id from the flat driver.
+ */
+struct genapic apic_physflat =  {
+       .name = "physical flat",
+       .int_delivery_mode = dest_Fixed,
+       .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+       .target_cpus = physflat_target_cpus,
+       .vector_allocation_domain = physflat_vector_allocation_domain,
+       .apic_id_registered = flat_apic_id_registered,
+       .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
+       .send_IPI_all = physflat_send_IPI_all,
+       .send_IPI_allbutself = physflat_send_IPI_allbutself,
+       .send_IPI_mask = physflat_send_IPI_mask,
+       .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+       .phys_pkg_id = phys_pkg_id,
+};
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
new file mode 100644 (file)
index 0000000..6c34bdd
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ *  linux/arch/x86_64/kernel/head64.c -- prepare to run common code
+ *
+ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/percpu.h>
+
+#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/bootsetup.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+
+/*
+ * Clear the kernel page-table top-level entry that covers virtual
+ * address 0 and flush the TLB.  The caller uses this to make NULL
+ * pointers fault.
+ */
+static void __init zap_identity_mappings(void)
+{
+       pgd_t *pgd = pgd_offset_k(0UL);
+       pgd_clear(pgd);
+       __flush_tlb();
+}
+
+/*
+ * Zero everything from __bss_start up to __bss_stop.
+ * Don't add a printk in here: printk relies on the PDA, which is not
+ * initialized yet.
+ */
+static void __init clear_bss(void)
+{
+       unsigned long bss_bytes;
+
+       bss_bytes = (unsigned long) __bss_stop - (unsigned long) __bss_start;
+       memset(__bss_start, 0, bss_bytes);
+}
+
+/* Offsets and magic value used by copy_bootdata() to locate the command line. */
+#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
+#define OLD_CL_MAGIC_ADDR      0x20    /* offset of the 16-bit magic value */
+#define OLD_CL_MAGIC            0xA33F /* magic value expected at OLD_CL_MAGIC_ADDR */
+#define OLD_CL_OFFSET           0x22   /* offset of the 16-bit command line offset */
+
+/*
+ * Copy the boot parameters at @real_mode_data into x86_boot_params and
+ * the kernel command line into boot_command_line.
+ *
+ * The command line address is the 32-bit value at NEW_CL_POINTER.  When
+ * that is zero, the OLD_CL_MAGIC / OLD_CL_OFFSET pair is tried instead;
+ * if the magic does not match, boot_command_line is left untouched.
+ */
+static void __init copy_bootdata(char *real_mode_data)
+{
+       unsigned long cl_phys;
+
+       memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
+
+       cl_phys = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
+       if (cl_phys == 0) {
+               if (*(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR) != OLD_CL_MAGIC)
+                       return;
+
+               cl_phys = __pa(real_mode_data) +
+                         *(u16 *)(real_mode_data + OLD_CL_OFFSET);
+       }
+
+       memcpy(boot_command_line, __va(cl_phys), COMMAND_LINE_SIZE);
+}
+
+/*
+ * C entry point that prepares the boot CPU and then calls start_kernel().
+ *
+ * @real_mode_data is converted with __va() before it is handed to
+ * copy_bootdata().  The order of the steps below matters; see the
+ * comments on the individual steps.
+ */
+void __init x86_64_start_kernel(char * real_mode_data)
+{
+       int i;
+
+       /* clear bss before set_intr_gate with early_idt_handler */
+       clear_bss();
+
+       /* Make NULL pointers segfault */
+       zap_identity_mappings();
+
+       /* Point every IDT entry at early_idt_handler, then load the IDT. */
+       for (i = 0; i < IDT_ENTRIES; i++)
+               set_intr_gate(i, early_idt_handler);
+       asm volatile("lidt %0" :: "m" (idt_descr));
+
+       early_printk("Kernel alive\n");
+
+       /* Every CPU's PDA pointer initially refers to its boot_cpu_pda[] slot. */
+       for (i = 0; i < NR_CPUS; i++)
+               cpu_pda(i) = &boot_cpu_pda[i];
+
+       pda_init(0);
+       copy_bootdata(__va(real_mode_data));
+#ifdef CONFIG_SMP
+       /* Mark CPU 0 online. */
+       cpu_set(0, cpu_online_map);
+#endif
+       start_kernel();
+}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
new file mode 100644 (file)
index 0000000..b6167fe
--- /dev/null
@@ -0,0 +1,416 @@
+/*
+ *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ *
+ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
+ *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ *  Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
+ */
+
+
+#include <linux/linkage.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/cache.h>
+
+/* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
+ * because we need identity-mapped pages.
+ *
+ */
+
+/*
+ * startup_64: boot-CPU entry point.
+ *
+ * Computes the load offset in %rbp, validates it, relocates the physical
+ * addresses stored in the static boot page tables by that offset, adds an
+ * extra identity mapping when needed, and then jumps to
+ * secondary_startup_64.
+ */
+       .text
+       .section .text.head
+       .code64
+       .globl startup_64
+startup_64:
+
+       /*
+        * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+        * and someone has loaded an identity mapped page table
+        * for us.  These identity mapped page tables map all of the
+        * kernel pages and possibly all of memory.
+        *
+        * %esi holds a physical pointer to real_mode_data.
+        *
+        * We come here either directly from a 64bit bootloader, or from
+        * arch/x86_64/boot/compressed/head.S.
+        *
+        * We only come here initially at boot nothing else comes here.
+        *
+        * Since we may be loaded at an address different from what we were
+        * compiled to run at we first fixup the physical addresses in our page
+        * tables and then reload them.
+        */
+
+       /* Compute the delta between the address I am compiled to run at and the
+        * address I am actually running at.
+        */
+       leaq    _text(%rip), %rbp
+       subq    $_text - __START_KERNEL_map, %rbp
+
+       /* Is the address not 2M aligned? */
+       movq    %rbp, %rax
+       andl    $~LARGE_PAGE_MASK, %eax
+       testl   %eax, %eax
+       jnz     bad_address
+
+       /* Is the address too large? */
+       leaq    _text(%rip), %rdx
+       movq    $PGDIR_SIZE, %rax
+       cmpq    %rax, %rdx
+       jae     bad_address
+
+       /* Fixup the physical addresses in the page table
+        */
+       /* init_level4_pgt: its three populated slots are 0, 258 and 511. */
+       addq    %rbp, init_level4_pgt + 0(%rip)
+       addq    %rbp, init_level4_pgt + (258*8)(%rip)
+       addq    %rbp, init_level4_pgt + (511*8)(%rip)
+
+       /* level3_ident_pgt slot 0 points at level2_ident_pgt. */
+       addq    %rbp, level3_ident_pgt + 0(%rip)
+
+       /* level3_kernel_pgt slots 510/511: level2_kernel_pgt, level2_fixmap_pgt. */
+       addq    %rbp, level3_kernel_pgt + (510*8)(%rip)
+       addq    %rbp, level3_kernel_pgt + (511*8)(%rip)
+
+       /* level2_fixmap_pgt slot 506 points at level1_fixmap_pgt. */
+       addq    %rbp, level2_fixmap_pgt + (506*8)(%rip)
+
+       /* Add an Identity mapping if I am above 1G */
+       leaq    _text(%rip), %rdi
+       andq    $LARGE_PAGE_MASK, %rdi
+
+       /* PUD index of the load address; index 0 is already mapped. */
+       movq    %rdi, %rax
+       shrq    $PUD_SHIFT, %rax
+       andq    $(PTRS_PER_PUD - 1), %rax
+       jz      ident_complete
+
+       /* Hook level2_spare_pgt into level3_ident_pgt at that PUD index. */
+       leaq    (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+       leaq    level3_ident_pgt(%rip), %rbx
+       movq    %rdx, 0(%rbx, %rax, 8)
+
+       /* Write one large-page entry for the load address into the spare PMD. */
+       movq    %rdi, %rax
+       shrq    $PMD_SHIFT, %rax
+       andq    $(PTRS_PER_PMD - 1), %rax
+       leaq    __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+       leaq    level2_spare_pgt(%rip), %rbx
+       movq    %rdx, 0(%rbx, %rax, 8)
+ident_complete:
+
+       /* Fixup the kernel text+data virtual addresses
+        */
+       /* Walk all 4096/8 entries of level2_kernel_pgt; %r8 is the end. */
+       leaq    level2_kernel_pgt(%rip), %rdi
+       leaq    4096(%rdi), %r8
+       /* See if it is a valid page table entry */
+1:     testq   $1, 0(%rdi)
+       jz      2f
+       addq    %rbp, 0(%rdi)
+       /* Go to the next page */
+2:     addq    $8, %rdi
+       cmp     %r8, %rdi
+       jne     1b
+
+       /* Fixup phys_base */
+       addq    %rbp, phys_base(%rip)
+
+#ifdef CONFIG_SMP
+       addq    %rbp, trampoline_level4_pgt + 0(%rip)
+       addq    %rbp, trampoline_level4_pgt + (511*8)(%rip)
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+       addq    %rbp, wakeup_level4_pgt + 0(%rip)
+       addq    %rbp, wakeup_level4_pgt + (511*8)(%rip)
+#endif
+
+       /* Due to ENTRY(), sometimes the empty space gets filled with
+        * zeros. Better take a jmp than relying on empty space being
+        * filled with 0x90 (nop)
+        */
+       jmp secondary_startup_64
+/*
+ * secondary_startup_64: common 64-bit bring-up path.
+ *
+ * Loads CR4/CR3/EFER/CR0, switches to the boot stack and the kernel GDT,
+ * reloads the data segment registers, installs a dummy GS base and finally
+ * far-returns to the address stored in initial_code with %rdi holding the
+ * real mode data pointer taken from %esi.
+ */
+ENTRY(secondary_startup_64)
+       /*
+        * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
+        * and someone has loaded a mapped page table.
+        *
+        * %esi holds a physical pointer to real_mode_data.
+        *
+        * We come here either from startup_64 (using physical addresses)
+        * or from trampoline.S (using virtual addresses).
+        *
+        * Using virtual addresses from trampoline.S removes the need
+        * to have any identity mapped pages in the kernel page table
+        * after the boot processor executes this code.
+        */
+
+       /* Enable PAE mode and PGE */
+       /* (CR4 is rebuilt from zero with only bits 5 and 7 set.) */
+       xorq    %rax, %rax
+       btsq    $5, %rax
+       btsq    $7, %rax
+       movq    %rax, %cr4
+
+       /* Setup early boot stage 4 level pagetables. */
+       /* CR3 = link-time physical address of init_level4_pgt + phys_base. */
+       movq    $(init_level4_pgt - __START_KERNEL_map), %rax
+       addq    phys_base(%rip), %rax
+       movq    %rax, %cr3
+
+       /* Ensure I am executing from virtual addresses */
+       movq    $1f, %rax
+       jmp     *%rax
+1:
+
+       /* Check if nx is implemented */
+       /* CPUID leaf 0x80000001: %edx is saved in %edi for the bit test below. */
+       movl    $0x80000001, %eax
+       cpuid
+       movl    %edx,%edi
+
+       /* Setup EFER (Extended Feature Enable Register) */
+       movl    $MSR_EFER, %ecx
+       rdmsr
+       btsl    $_EFER_SCE, %eax        /* Enable System Call */
+       btl     $20,%edi                /* No Execute supported? */
+       jnc     1f
+       btsl    $_EFER_NX, %eax
+1:     wrmsr                           /* Make changes effective */
+
+       /* Setup cr0 */
+#define CR0_PM                         1               /* protected mode */
+#define CR0_MP                         (1<<1)
+#define CR0_ET                         (1<<4)
+#define CR0_NE                         (1<<5)
+#define CR0_WP                         (1<<16)
+#define CR0_AM                         (1<<18)
+#define CR0_PAGING                     (1<<31)
+       movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
+       /* Make changes effective */
+       movq    %rax, %cr0
+
+       /* Setup a boot time stack */
+       movq init_rsp(%rip),%rsp
+
+       /* zero EFLAGS after setting rsp */
+       pushq $0
+       popfq
+
+       /*
+        * We must switch to a new descriptor in kernel space for the GDT
+        * because soon the kernel won't have access anymore to the userspace
+        * addresses where we're currently running on. We have to do that here
+        * because in 32bit we couldn't load a 64bit linear address.
+        */
+       lgdt    cpu_gdt_descr(%rip)
+
+       /* set up data segments. actually 0 would do too */
+       movl $__KERNEL_DS,%eax
+       movl %eax,%ds
+       movl %eax,%ss
+       movl %eax,%es
+
+       /*
+        * We don't really need to load %fs or %gs, but load them anyway
+        * to kill any stale realmode selectors.  This allows execution
+        * under VT hardware.
+        */
+       movl %eax,%fs
+       movl %eax,%gs
+
+       /*
+        * Set up a dummy PDA. This is just for some early bootup code
+        * that does in_interrupt().
+        *
+        * MSR_GS_BASE is written with the address of empty_zero_page,
+        * split into %edx:%eax for wrmsr.
+        */
+       movl    $MSR_GS_BASE,%ecx
+       movq    $empty_zero_page,%rax
+       movq    %rax,%rdx
+       shrq    $32,%rdx
+       wrmsr   
+
+       /* esi is pointer to real mode structure with interesting info.
+          pass it to C */
+       movl    %esi, %edi
+       
+       /* Finally jump to run C code and to be on real kernel address
+        * Since we are running on identity-mapped space we have to jump
+        * to the full 64bit address, this is only possible as indirect
+        * jump.  In addition we need to ensure %cs is set so we make this
+        * a far return.
+        */
+       /* lretq pops the target address first, then the new %cs. */
+       movq    initial_code(%rip),%rax
+       pushq   $0              # fake return address to stop unwinder
+       pushq   $__KERNEL_CS    # set correct cs
+       pushq   %rax            # target address in negative space
+       lretq
+
+       /* SMP bootup changes these two */
+       /*
+        * initial_code: address secondary_startup_64 far-returns to
+        *               (x86_64_start_kernel by default).
+        * init_rsp:     stack pointer loaded by secondary_startup_64
+        *               (8 bytes below the top of init_thread_union by default).
+        *
+        * initial_code is placed in .init.data unless CONFIG_HOTPLUG_CPU is set.
+        */
+#ifndef CONFIG_HOTPLUG_CPU
+       .pushsection .init.data
+#endif
+       .align  8
+       .globl  initial_code
+initial_code:
+       .quad   x86_64_start_kernel
+#ifndef CONFIG_HOTPLUG_CPU
+       .popsection
+#endif
+       .globl init_rsp
+init_rsp:
+       .quad  init_thread_union+THREAD_SIZE-8
+
+/*
+ * startup_64 branches here when the load address is not 2M aligned or is
+ * too large; spin forever.
+ */
+bad_address:
+       jmp bad_address
+
+/*
+ * early_idt_handler: installed in every IDT slot by x86_64_start_kernel().
+ *
+ * Prints a panic message through early_printk(), dumps the stack and
+ * (with CONFIG_KALLSYMS) the symbol for the faulting rip, then halts
+ * forever.  early_recursion_flag counts entries; once it reaches 2 the
+ * handler skips straight to the halt loop.
+ *
+ * NOTE(review): (%rsp) is printed as the error code and 8(%rsp) as rip,
+ * which presumably is only right for exceptions that push an error
+ * code -- confirm.
+ */
+ENTRY(early_idt_handler)
+       cmpl $2,early_recursion_flag(%rip)
+       jz  1f
+       incl early_recursion_flag(%rip)
+       /* Arguments for early_printk(early_idt_msg, rip, error, cr2). */
+       xorl %eax,%eax
+       movq 8(%rsp),%rsi       # get rip
+       movq (%rsp),%rdx
+       movq %cr2,%rcx
+       leaq early_idt_msg(%rip),%rdi
+       call early_printk
+       /* Re-check: a nested fault during the printk bumps the flag. */
+       cmpl $2,early_recursion_flag(%rip)
+       jz  1f
+       call dump_stack
+#ifdef CONFIG_KALLSYMS 
+       leaq early_idt_ripmsg(%rip),%rdi
+       movq 8(%rsp),%rsi       # get rip again
+       call __print_symbol
+#endif
+1:     hlt
+       jmp 1b
+early_recursion_flag:
+       .long 0
+
+early_idt_msg:
+       .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
+early_idt_ripmsg:
+       .asciz "RIP %s\n"
+
+.balign PAGE_SIZE
+
+/* Start a new page-aligned, ENTRY()-declared object called 'name'. */
+#define NEXT_PAGE(name) \
+       .balign PAGE_SIZE; \
+ENTRY(name)
+
+/* Automate the creation of 1 to 1 mapping pmd entries */
+/* Emits COUNT quads; entry i is START + (i << 21) + PERM (2MB steps). */
+#define PMDS(START, PERM, COUNT)               \
+       i = 0 ;                                 \
+       .rept (COUNT) ;                         \
+       .quad   (START) + (i << 21) + (PERM) ;  \
+       i = i + 1 ;                             \
+       .endr
+
+       /*
+        * This default setting generates an ident mapping at address 0x100000
+        * and a mapping for the kernel that precisely maps virtual address
+        * 0xffffffff80000000 to physical address 0x000000. (always using
+        * 2Mbyte large pages provided by PAE mode)
+        */
+/* Top level: slots 0 and 258 -> level3_ident_pgt, slot 511 -> level3_kernel_pgt. */
+NEXT_PAGE(init_level4_pgt)
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .fill   257,8,0
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .fill   252,8,0
+       /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+/* Only slot 0 is populated here; startup_64 may add one more at run time. */
+NEXT_PAGE(level3_ident_pgt)
+       .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .fill   511,8,0
+
+NEXT_PAGE(level3_kernel_pgt)
+       .fill   510,8,0
+       /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
+       .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+
+NEXT_PAGE(level2_fixmap_pgt)
+       .fill   506,8,0
+       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+       /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+       .fill   5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+       .fill   512,8,0
+
+NEXT_PAGE(level2_ident_pgt)
+       /* Since I easily can, map the first 1G.
+        * Don't set NX because code runs from these pages.
+        */
+       PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
+
+NEXT_PAGE(level2_kernel_pgt)
+       /* 40MB kernel mapping. The kernel code cannot be bigger than that.
+          When you change this change KERNEL_TEXT_SIZE in page.h too. */
+       /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
+       PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE)
+       /* Module mapping starts here */
+       .fill   (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
+
+/* Empty PMD page; startup_64 fills one entry when it adds an identity mapping. */
+NEXT_PAGE(level2_spare_pgt)
+       .fill   512,8,0
+
+#undef PMDS
+#undef NEXT_PAGE
+
+       .data
+       .align 16
+       .globl cpu_gdt_descr
+/*
+ * GDT descriptor loaded by lgdt in secondary_startup_64: a 16-bit limit
+ * (size of cpu_gdt_table minus one) followed by the 64-bit table address.
+ * With CONFIG_SMP, NR_CPUS-1 further zeroed limit/base pairs follow.
+ */
+cpu_gdt_descr:
+       .word   gdt_end-cpu_gdt_table-1
+gdt:
+       .quad   cpu_gdt_table
+#ifdef CONFIG_SMP
+       .rept   NR_CPUS-1
+       .word   0
+       .quad   0
+       .endr
+#endif
+
+/* Added to by startup_64 with the load offset; read when building CR3. */
+ENTRY(phys_base)
+       /* This must match the first entry in level2_kernel_pgt */
+       .quad   0x0000000000000000
+
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types  kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+                               
+       .section .data.page_aligned, "aw"
+       .align PAGE_SIZE
+
+/* The TLS descriptors are currently at a different place compared to i386.
+   Hopefully nobody expects them at a fixed place (Wine?) */
+       
+ENTRY(cpu_gdt_table)
+       .quad   0x0000000000000000      /* NULL descriptor */
+       .quad   0x00cf9b000000ffff      /* __KERNEL32_CS */
+       .quad   0x00af9b000000ffff      /* __KERNEL_CS */
+       .quad   0x00cf93000000ffff      /* __KERNEL_DS */
+       .quad   0x00cffb000000ffff      /* __USER32_CS */
+       .quad   0x00cff3000000ffff      /* __USER_DS, __USER32_DS  */
+       .quad   0x00affb000000ffff      /* __USER_CS */
+       .quad   0x0                     /* unused */
+       .quad   0,0                     /* TSS */
+       .quad   0,0                     /* LDT */
+       .quad   0,0,0                   /* three TLS descriptors */ 
+       .quad   0x0000f40000000000      /* node/CPU stored in limit */
+gdt_end:       
+       /* asm/segment.h:GDT_ENTRIES must match this */
+       /* This should be a multiple of the cache line size */
+       /* GDTs of other CPUs are now dynamically allocated */
+
+       /* zero the remaining page */
+       .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
+
+       /* IDT storage: 256 entries of 16 bytes each, in .bss. */
+       .section .bss, "aw", @nobits
+       .align L1_CACHE_BYTES
+ENTRY(idt_table)
+       .skip 256 * 16
+
+       /* One page in .bss; also used as the dummy GS base during early boot. */
+       .section .bss.page_aligned, "aw", @nobits
+       .align PAGE_SIZE
+ENTRY(empty_zero_page)
+       .skip PAGE_SIZE
diff --git a/arch/x86/kernel/hpet_64.c b/arch/x86/kernel/hpet_64.c
new file mode 100644 (file)
index 0000000..e2d1b91
--- /dev/null
@@ -0,0 +1,493 @@
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/mc146818rtc.h>
+#include <linux/time.h>
+#include <linux/clocksource.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/hpet.h>
+#include <asm/pgtable.h>
+#include <asm/vsyscall.h>
+#include <asm/timex.h>
+#include <asm/hpet.h>
+
+/* Counter mask and mult/shift scaling used by clocksource_hpet. */
+#define HPET_MASK      0xFFFFFFFF
+#define HPET_SHIFT     22
+
+/* FSEC = 10^-15 NSEC = 10^-9 */
+#define FSEC_PER_NSEC  1000000
+
+/* Set to 1 by the "nohpet" command line option (see nohpet_setup()). */
+int nohpet __initdata;
+
+/* Physical address of the HPET block; 0 is treated as "no HPET". */
+unsigned long hpet_address;
+unsigned long hpet_period;     /* fsecs / HPET clock */
+unsigned long hpet_tick;       /* HPET clocks / interrupt */
+
+int hpet_use_timer;            /* Use counter of hpet for time keeping,
+                                * otherwise PIT
+                                */
+
+#ifdef CONFIG_HPET
+/*
+ * Describe the platform HPET to the hpet driver through hpet_alloc().
+ *
+ * Does nothing when hpet_address is zero.  Timer 0 (and timer 1 with
+ * CONFIG_HPET_EMULATE_RTC) are reserved for the platform; the interrupt
+ * routing of timers 2 and up is read from each timer's config register.
+ * Always returns 0.
+ */
+static __init int late_hpet_init(void)
+{
+       struct hpet_data        hd;
+       struct hpet             *hpet;
+       unsigned int            id, ntimer;
+       int                     i;
+
+       if (!hpet_address)
+               return 0;
+
+       memset(&hd, 0, sizeof(hd));
+
+       /* Timer count is the HPET_ID_NUMBER field of the ID register plus one. */
+       id = hpet_readl(HPET_ID);
+       ntimer = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+
+       /*
+        * Register with driver.
+        * Timer0 and Timer1 are used by the platform.
+        */
+       hd.hd_phys_address = hpet_address;
+       hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
+       hd.hd_nirqs = ntimer;
+       hd.hd_flags = HPET_DATA_PLATFORM;
+       hpet_reserve_timer(&hd, 0);
+#ifdef CONFIG_HPET_EMULATE_RTC
+       hpet_reserve_timer(&hd, 1);
+#endif
+       hd.hd_irq[0] = HPET_LEGACY_8254;
+       hd.hd_irq[1] = HPET_LEGACY_RTC;
+
+       /* The loop body never runs when there are only two timers. */
+       hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
+       for (i = 2; i < ntimer; i++)
+               hd.hd_irq[i] = (hpet->hpet_timers[i].hpet_config &
+                               Tn_INT_ROUTE_CNF_MASK) >>
+                       Tn_INT_ROUTE_CNF_SHIFT;
+
+       hpet_alloc(&hd);
+       return 0;
+}
+fs_initcall(late_hpet_init);
+#endif
+
+/*
+ * Stop the HPET, zero its main counter, optionally program timer 0 as a
+ * periodic interrupt source, and start the HPET again.
+ *
+ * @tick: not referenced by the body; the global hpet_tick is what gets
+ *        programmed into the comparator.
+ *
+ * Always returns 0.
+ */
+int hpet_timer_stop_set_go(unsigned long tick)
+{
+       unsigned int cfg;
+
+/*
+ * Stop the timers and reset the main counter.
+ */
+
+       cfg = hpet_readl(HPET_CFG);
+       cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+       hpet_writel(cfg, HPET_CFG);
+       /* The counter is cleared as two 32-bit halves. */
+       hpet_writel(0, HPET_COUNTER);
+       hpet_writel(0, HPET_COUNTER + 4);
+
+/*
+ * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
+ * and period also hpet_tick.
+ */
+       if (hpet_use_timer) {
+               hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+                   HPET_TN_32BIT, HPET_T0_CFG);
+               /* Same register written twice on purpose: first value, then period. */
+               hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
+               hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
+               cfg |= HPET_CFG_LEGACY;
+       }
+/*
+ * Go!
+ */
+
+       cfg |= HPET_CFG_ENABLE;
+       hpet_writel(cfg, HPET_CFG);
+
+       return 0;
+}
+
+/* clocksource .read hook: return the current HPET_COUNTER register value. */
+static cycle_t read_hpet(void)
+{
+       cycle_t now = (cycle_t)hpet_readl(HPET_COUNTER);
+
+       return now;
+}
+
+/*
+ * clocksource .vread hook: read the HPET through the VSYSCALL_HPET fixmap
+ * slot.  Offset 0xf0 is presumably the main counter register
+ * (HPET_COUNTER) -- TODO confirm.
+ */
+static cycle_t __vsyscall_fn vread_hpet(void)
+{
+       void __iomem *hpet_base = (void __iomem *)fix_to_virt(VSYSCALL_HPET);
+
+       return readl(hpet_base + 0xf0);
+}
+
+/*
+ * HPET clocksource descriptor.  .mult is left at zero here and filled in
+ * by hpet_arch_init() before the clocksource is registered.
+ */
+struct clocksource clocksource_hpet = {
+       .name           = "hpet",
+       .rating         = 250,
+       .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
+       .mask           = (cycle_t)HPET_MASK,
+       .shift          = HPET_SHIFT,
+       .mult           = 0, /* computed in hpet_arch_init() */
+       .read           = read_hpet,
+       .vread          = vread_hpet,
+};
+
+/*
+ * Map the HPET, validate it, derive hpet_period/hpet_tick, register the
+ * HPET clocksource and start the timer.
+ *
+ * Returns -1 if hpet_address is zero, if the ID register has a zero vendor
+ * or timer-number field, or if the reported period is outside
+ * [100000, 100000000]; otherwise returns the result of
+ * hpet_timer_stop_set_go().
+ */
+int __init hpet_arch_init(void)
+{
+       unsigned int id;
+       u64 tmp;
+
+       if (!hpet_address)
+               return -1;
+       /* Two mappings: one for the kernel, one for the vsyscall reader. */
+       set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
+       __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+
+/*
+ * Read the period, compute tick and quotient.
+ */
+
+       id = hpet_readl(HPET_ID);
+
+       if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
+               return -1;
+
+       hpet_period = hpet_readl(HPET_PERIOD);
+       if (hpet_period < 100000 || hpet_period > 100000000)
+               return -1;
+
+       /* HPET clocks per tick, rounded to nearest. */
+       hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
+
+       hpet_use_timer = (id & HPET_ID_LEGSUP);
+
+       /*
+        * hpet period is in femto seconds per cycle
+        * so we need to convert this to ns/cyc units
+        * approximated by mult/2^shift
+        *
+        *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+        *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult
+        *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
+        *  (fsec/cyc << shift)/1000000 = mult
+        *  (hpet_period << shift)/FSEC_PER_NSEC = mult
+        */
+       tmp = (u64)hpet_period << HPET_SHIFT;
+       do_div(tmp, FSEC_PER_NSEC);
+       clocksource_hpet.mult = (u32)tmp;
+       clocksource_register(&clocksource_hpet);
+
+       return hpet_timer_stop_set_go(hpet_tick);
+}
+
+/*
+ * Restart the HPET with the previously computed hpet_tick.  Returns
+ * whatever hpet_timer_stop_set_go() returns.
+ */
+int hpet_reenable(void)
+{
+       return hpet_timer_stop_set_go(hpet_tick);
+}
+
+/*
+ * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
+ * it to the HPET timer of known frequency.
+ */
+
+#define TICK_COUNT 100000000
+#define SMI_THRESHOLD 50000
+#define MAX_TRIES  5
+
+/*
+ * Sample the HPET counter bracketed by two TSC reads.
+ *
+ * Some platforms take periodic SMI interrupts with 5ms duration.  To make
+ * sure none occurred between the reads, the sample is retried (at most
+ * MAX_TRIES samples in total) until the two TSC reads are less than
+ * SMI_THRESHOLD apart.  The last sample taken is returned either way.
+ *
+ * @hpet: receives the HPET counter value.
+ * @tsc:  receives the second (later) TSC value.
+ */
+static void __init read_hpet_tsc(int *hpet, int *tsc)
+{
+       int tsc_before, tsc_after, counter;
+       int attempt = 0;
+
+       do {
+               tsc_before = get_cycles_sync();
+               counter = hpet_readl(HPET_COUNTER);
+               tsc_after = get_cycles_sync();
+       } while ((tsc_after - tsc_before) >= SMI_THRESHOLD &&
+                ++attempt < MAX_TRIES);
+
+       *hpet = counter;
+       *tsc = tsc_after;
+}
+
+/*
+ * Measure the TSC rate against the HPET.
+ *
+ * Takes a start sample, then keeps sampling until either the TSC or the
+ * HPET counter has advanced by at least TICK_COUNT.  The return value is
+ *   tsc_delta * 10^9 / (hpet_delta * hpet_period / 1000)
+ * which, with hpet_period in femtoseconds per HPET clock, works out to TSC
+ * cycles per millisecond (kHz).
+ */
+unsigned int __init hpet_calibrate_tsc(void)
+{
+       int tsc_start, hpet_start;
+       int tsc_now, hpet_now;
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       read_hpet_tsc(&hpet_start, &tsc_start);
+
+       /*
+        * Each sample is taken with interrupts disabled; the saved state is
+        * restored after every sample, so interrupts can run in between.
+        */
+       do {
+               local_irq_disable();
+               read_hpet_tsc(&hpet_now, &tsc_now);
+               local_irq_restore(flags);
+       } while ((tsc_now - tsc_start) < TICK_COUNT &&
+               (hpet_now - hpet_start) < TICK_COUNT);
+
+       return (tsc_now - tsc_start) * 1000000000L
+               / ((hpet_now - hpet_start) * hpet_period / 1000);
+}
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+/* HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
+ * is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
+ *    is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
+ * overhead. (DEFAULT_RTC_INT_FREQ)
+ * For (3), we use interrupts at 64Hz or user specified periodic
+ * frequency, whichever is higher.
+ */
+#include <linux/rtc.h>
+
+/* Polling/interrupt frequency (Hz) used unless a higher PIE_freq is active. */
+#define DEFAULT_RTC_INT_FREQ   64
+/* Shifted left by 8 into the flag word handed to rtc_interrupt(). */
+#define RTC_NUM_INTS           1
+
+/* Update interrupt: enabled flag and the seconds value last reported. */
+static unsigned long UIE_on;
+static unsigned long prev_update_sec;
+
+/* Alarm interrupt: enabled flag and the hour/min/sec to match against. */
+static unsigned long AIE_on;
+static struct rtc_time alarm_time;
+
+/* Periodic interrupt: enabled flag, requested frequency, and a counter of
+ * timer interrupts since the last periodic event was delivered. */
+static unsigned long PIE_on;
+static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
+static unsigned long PIE_count;
+
+static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
+static unsigned int hpet_t1_cmp; /* cached comparator register */
+
+/* Return 1 when an HPET address has been recorded, 0 otherwise. */
+int is_hpet_enabled(void)
+{
+       return hpet_address ? 1 : 0;
+}
+
+/*
+ * Timer 1 for RTC, we do not use periodic interrupt feature,
+ * even if HPET supports periodic interrupts on Timer 1.
+ * The reason being, to set up a periodic interrupt in HPET, we need to
+ * stop the main counter. And if we do that every time someone
+ * disables/enables RTC, we will have adverse effect on main kernel timer
+ * running on Timer 0.
+ * So, for the time being, simulate the periodic interrupt in software.
+ *
+ * hpet_rtc_timer_init() is called for the first time and during subsequent
+ * interrupts reinit happens through hpet_rtc_timer_reinit().
+ *
+ * Returns 0 if the HPET is not in use, 1 once timer 1 has been armed.
+ */
+int hpet_rtc_timer_init(void)
+{
+       unsigned int t1_cfg, next_cmp;
+       unsigned long irq_state;
+
+       if (!is_hpet_enabled())
+               return 0;
+
+       /* Use the periodic frequency only when it exceeds the default. */
+       hpet_rtc_int_freq = (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) ?
+                               PIE_freq : DEFAULT_RTC_INT_FREQ;
+
+       local_irq_save(irq_state);
+
+       /* Set the timer 1 comparator one interval past the current count. */
+       next_cmp = hpet_readl(HPET_COUNTER);
+       next_cmp += ((hpet_tick*HZ)/hpet_rtc_int_freq);
+       hpet_writel(next_cmp, HPET_T1_CMP);
+       hpet_t1_cmp = next_cmp;
+
+       /* Enable timer 1 as a 32-bit, non-periodic timer. */
+       t1_cfg = hpet_readl(HPET_T1_CFG);
+       t1_cfg &= ~HPET_TN_PERIODIC;
+       t1_cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+       hpet_writel(t1_cfg, HPET_T1_CFG);
+
+       local_irq_restore(irq_state);
+
+       return 1;
+}
+
+/*
+ * Re-arm timer 1 for the next emulated RTC interrupt.
+ *
+ * If no RTC interrupt type is enabled any more, timer 1 is disabled
+ * instead.  Otherwise the cached comparator is advanced by one interval;
+ * if that already lies in the past, it is pushed forward by the number of
+ * missed intervals plus a safety margin and the loss is logged.
+ */
+static void hpet_rtc_timer_reinit(void)
+{
+       unsigned int cfg, cnt, ticks_per_int, lost_ints;
+
+       if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
+               cfg = hpet_readl(HPET_T1_CFG);
+               cfg &= ~HPET_TN_ENABLE;
+               hpet_writel(cfg, HPET_T1_CFG);
+               return;
+       }
+
+       if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
+               hpet_rtc_int_freq = PIE_freq;
+       else
+               hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
+
+       /* It is more accurate to use the comparator value than current count.*/
+       ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+       hpet_t1_cmp += ticks_per_int;
+       hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+       /*
+        * If the interrupt handler was delayed too long, the write above tries
+        * to schedule the next interrupt in the past and the hardware would
+        * not interrupt until the counter had wrapped around.
+        * So we have to check that the comparator wasn't set to a past time.
+        */
+       cnt = hpet_readl(HPET_COUNTER);
+       /* Signed view of the 32-bit difference copes with counter wrap. */
+       if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+               lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+               /* Make sure that, even with the time needed to execute
+                * this code, the next scheduled interrupt has been moved
+                * back to the future: */
+               lost_ints++;
+
+               hpet_t1_cmp += lost_ints * ticks_per_int;
+               hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+               if (PIE_on)
+                       PIE_count += lost_ints;
+
+               /* NOTE(review): hpet_rtc_int_freq is unsigned long but is
+                * printed with %ld. */
+               if (printk_ratelimit())
+                       printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+                              hpet_rtc_int_freq);
+       }
+}
+
+/*
+ * The functions below are called from rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+/*
+ * Turn off the emulated RTC interrupt types selected in @bit_mask
+ * (RTC_UIE, RTC_PIE, RTC_AIE).  Returns 0 if the HPET is not in use,
+ * 1 otherwise.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+       if (is_hpet_enabled()) {
+               if (bit_mask & RTC_UIE)
+                       UIE_on = 0;
+               if (bit_mask & RTC_PIE)
+                       PIE_on = 0;
+               if (bit_mask & RTC_AIE)
+                       AIE_on = 0;
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Turn on the emulated RTC interrupt types selected in @bit_mask
+ * (RTC_UIE, RTC_PIE, RTC_AIE).  If none was enabled beforehand, timer 1 is
+ * armed via hpet_rtc_timer_init().  Returns 0 if the HPET is not in use,
+ * 1 otherwise.
+ */
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+       int was_idle;
+
+       if (!is_hpet_enabled())
+               return 0;
+
+       /* Sample the "nothing enabled" state before changing any flag. */
+       was_idle = !(PIE_on | AIE_on | UIE_on);
+
+       if (bit_mask & RTC_UIE)
+               UIE_on = 1;
+       if (bit_mask & RTC_PIE) {
+               PIE_on = 1;
+               PIE_count = 0;
+       }
+       if (bit_mask & RTC_AIE)
+               AIE_on = 1;
+
+       if (was_idle)
+               hpet_rtc_timer_init();
+
+       return 1;
+}
+
+/*
+ * Record the alarm time that hpet_rtc_interrupt() compares against.
+ * Returns 0 if the HPET is not in use, 1 otherwise.
+ */
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
+{
+       if (is_hpet_enabled()) {
+               alarm_time.tm_hour = hrs;
+               alarm_time.tm_min = min;
+               alarm_time.tm_sec = sec;
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Store the requested periodic interrupt frequency and restart the
+ * periodic counter.  Returns 0 if the HPET is not in use, 1 otherwise.
+ */
+int hpet_set_periodic_freq(unsigned long freq)
+{
+       if (is_hpet_enabled()) {
+               PIE_freq = freq;
+               PIE_count = 0;
+               return 1;
+       }
+
+       return 0;
+}
+
+/* Returns 1 when the HPET is in use, 0 otherwise; no other effect. */
+int hpet_rtc_dropped_irq(void)
+{
+       return is_hpet_enabled() ? 1 : 0;
+}
+
+/*
+ * Timer 1 interrupt handler emulating the RTC interrupt.
+ *
+ * Re-arms the timer, then works out which of the enabled RTC events
+ * (update, periodic, alarm) are due and, if any is, forwards them to
+ * rtc_interrupt() with the matching RTC_UF/RTC_PF/RTC_AF flags.
+ * Always returns IRQ_HANDLED.
+ */
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
+{
+       struct rtc_time now;
+       unsigned long events = 0;
+       int deliver = 0;
+
+       hpet_rtc_timer_reinit();
+
+       /* The wall-clock time is only needed for update and alarm checks. */
+       if (UIE_on | AIE_on)
+               rtc_get_rtc_time(&now);
+
+       /* Update event: the seconds value changed since the last report. */
+       if (UIE_on && now.tm_sec != prev_update_sec) {
+               deliver = 1;
+               events = RTC_UF;
+               prev_update_sec = now.tm_sec;
+       }
+
+       /* Periodic event: fire once every hpet_rtc_int_freq/PIE_freq ticks. */
+       if (PIE_on) {
+               PIE_count++;
+               if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
+                       deliver = 1;
+                       events |= RTC_PF;
+                       PIE_count = 0;
+               }
+       }
+
+       /* Alarm event: hour, minute and second all match the alarm time. */
+       if (AIE_on &&
+           (now.tm_sec == alarm_time.tm_sec) &&
+           (now.tm_min == alarm_time.tm_min) &&
+           (now.tm_hour == alarm_time.tm_hour)) {
+               deliver = 1;
+               events |= RTC_AF;
+       }
+
+       if (deliver) {
+               events |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+               rtc_interrupt(events, dev_id);
+       }
+       return IRQ_HANDLED;
+}
+#endif
+
+/*
+ * Handler for the "nohpet" kernel command line option: sets the nohpet
+ * flag.  The option argument @s is ignored.  Returns 1.
+ */
+static int __init nohpet_setup(char *s)
+{
+       nohpet = 1;
+       return 1;
+}
+
+__setup("nohpet", nohpet_setup);
diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c
new file mode 100644 (file)
index 0000000..1d58c13
--- /dev/null
@@ -0,0 +1,151 @@
+/*
+ *  linux/arch/x86_64/kernel/i387.c
+ *
+ *  Copyright (C) 1994 Linus Torvalds
+ *  Copyright (C) 2002 Andi Kleen, SuSE Labs
+ *
+ *  Pentium III FXSR, SSE support
+ *  General FPU state handling cleanups
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ * 
+ *  x86-64 rework 2002 Andi Kleen. 
+ *  Does direct fxsave in and out of user space now for signal handlers.
+ *  All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
+ *  the 64bit user space sees a FXSAVE frame directly. 
+ */
+
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/ptrace.h>
+#include <asm/uaccess.h>
+
+unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
+
+/*
+ * Narrow mxcsr_feature_mask to the MXCSR bits the CPU reports as supported.
+ *
+ * The current task's fxsave area is zeroed, filled by FXSAVE, and its
+ * mxcsr_mask field is ANDed into mxcsr_feature_mask.  A reported mask of
+ * zero is replaced by the default 0x0000ffbf.  CR0.TS is cleared around
+ * the FXSAVE (clts) and set again afterwards (stts).
+ */
+void mxcsr_feature_mask_init(void)
+{
+       unsigned int mask;
+
+       clts();
+       memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
+       /*
+        * FXSAVE writes to its memory operand, so the operand must be
+        * declared read-write ("+m").  With an input-only "m" constraint
+        * the compiler may assume the buffer still holds the zeros stored
+        * by the memset above and fold the mxcsr_mask read to 0, which
+        * would always select the fallback mask.
+        */
+       asm volatile("fxsave %0" : "+m" (current->thread.i387.fxsave));
+       mask = current->thread.i387.fxsave.mxcsr_mask;
+       if (mask == 0)
+               mask = 0x0000ffbf;
+       mxcsr_feature_mask &= mask;
+       stts();
+}
+
+/*
+ * Called at bootup to set up the initial FPU state that is later cloned
+ * into all processes.
+ *
+ * Sets the OSFXSR and OSXMMEXCPT bits in CR4, clears TS and EM in CR0,
+ * initializes mxcsr_feature_mask and resets the current thread's
+ * FPU-related status.
+ */
+void __cpuinit fpu_init(void)
+{
+       unsigned long oldcr0 = read_cr0();
+       /* Never defined: referencing it turns a misaligned fxsave area into
+        * a link error (presumably; the definition is not visible here). */
+       extern void __bad_fxsave_alignment(void);
+               
+       if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
+               __bad_fxsave_alignment();
+       set_in_cr4(X86_CR4_OSFXSR);
+       set_in_cr4(X86_CR4_OSXMMEXCPT);
+
+       write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
+
+       mxcsr_feature_mask_init();
+       /* clean state in init */
+       current_thread_info()->status = 0;
+       clear_used_math();
+}
+
+/*
+ * Make sure @child has an FPU state in memory.
+ *
+ * If the task has already used math, its state is kept (and, when @child
+ * is the current task, saved via unlazy_fpu()).  Otherwise the fxsave area
+ * is reset to zeros with cwd = 0x37f and mxcsr = 0x1f80, and the task is
+ * marked as having used math.
+ */
+void init_fpu(struct task_struct *child)
+{
+       if (!tsk_used_math(child)) {
+               memset(&child->thread.i387.fxsave, 0,
+                      sizeof(struct i387_fxsave_struct));
+               child->thread.i387.fxsave.cwd = 0x37f;
+               child->thread.i387.fxsave.mxcsr = 0x1f80;
+               /* only the device not available exception or ptrace can call init_fpu */
+               set_stopped_child_used_math(child);
+               return;
+       }
+
+       if (child == current)
+               unlazy_fpu(child);
+}
+
+/*
+ * Signal frame handlers.
+ */
+
+/*
+ * Write the current task's FPU state to the user-space signal frame @buf.
+ *
+ * Returns 0 if the task has not used math (nothing is written), 1 on
+ * success, the error from save_i387_checking() if the direct save fails,
+ * or -1 if copying the in-memory state to user space fails.  A @buf that
+ * is not 16-byte aligned is only reported with a printk.
+ */
+int save_i387(struct _fpstate __user *buf)
+{
+       struct task_struct *tsk = current;
+       int err = 0;
+
+       BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
+                       sizeof(tsk->thread.i387.fxsave));
+
+       if ((unsigned long)buf % 16)
+               printk("save_i387: bad fpstate %p\n",buf);
+
+       if (!used_math())
+               return 0;
+
+       clear_used_math(); /* trigger finit */
+
+       if (task_thread_info(tsk)->status & TS_USEDFPU) {
+               /* State is live in the FPU: save it straight to user space. */
+               err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
+               if (err)
+                       return err;
+               stts();
+       } else if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
+                                 sizeof(struct i387_fxsave_struct))) {
+               /* State is already in memory but the copy-out failed. */
+               return -1;
+       }
+
+       return 1;
+}
+
+/*
+ * ptrace request handlers.
+ */
+
+/*
+ * ptrace helper: copy @tsk's fxsave area to the user buffer @buf after
+ * init_fpu() has made sure one exists.  Returns 0 on success, -EFAULT if
+ * the copy fails.
+ */
+int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
+{
+       init_fpu(tsk);
+
+       if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
+                          sizeof(struct user_i387_struct)))
+               return -EFAULT;
+
+       return 0;
+}
+
+/*
+ * ptrace helper: overwrite @tsk's fxsave area from the user buffer @buf.
+ * Returns 0 on success, -EFAULT if the copy fails.
+ */
+int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
+{
+       return __copy_from_user(&tsk->thread.i387.fxsave, buf,
+                               sizeof(struct user_i387_struct)) ? -EFAULT : 0;
+}
+
+/*
+ * FPU state for core dumps.
+ */
+
+/*
+ * Core-dump helper: copy the current task's FPU state into @fpu.
+ * @regs is not used.  Returns 0 if the task has not used math (nothing is
+ * copied), 1 otherwise.
+ */
+int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
+{
+       struct task_struct *me = current;
+
+       if (used_math()) {
+               /* Flush live FPU registers to memory before copying. */
+               unlazy_fpu(me);
+               memcpy(fpu, &me->thread.i387.fxsave,
+                      sizeof(struct user_i387_struct));
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Core-dump helper: copy @tsk's FPU state into @fpu.
+ * Returns 0 if the task has not used math (nothing is copied), 1 otherwise.
+ */
+int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
+{
+       if (!tsk_used_math(tsk))
+               return 0;
+
+       /* Only the current task can have state still live in the FPU. */
+       if (tsk == current)
+               unlazy_fpu(tsk);
+       memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
+
+       return 1;
+}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
new file mode 100644 (file)
index 0000000..948cae6
--- /dev/null
@@ -0,0 +1,544 @@
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/acpi.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+
+/*
+ * Common place to define all x86 IRQ vectors
+ *
+ * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ *
+ * These macros create the low-level assembly IRQ routines that save
+ * register context and call do_IRQ(). do_IRQ() then does all the
+ * operations that are needed to keep the AT (or SMP IOAPIC)
+ * interrupt-controller happy.
+ *
+ * BI(x,y) token-pastes x and y into one number (BI(0x2,a) becomes 0x2a)
+ * and hands it to BUILD_IRQ(), which is defined outside this file.
+ * BUILD_16_IRQS(x) invokes BI for all sixteen second digits 0-f.
+ */
+
+#define BI(x,y) \
+       BUILD_IRQ(x##y)
+
+#define BUILD_16_IRQS(x) \
+       BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+       BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+       BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+       BI(x,c) BI(x,d) BI(x,e) BI(x,f)
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these 
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ *
+ * The fourteen invocations below cover the high digits 0x2 through 0xf,
+ * i.e. 14 * 16 = 224 BUILD_IRQ() expansions for 0x20 through 0xff.  The
+ * leading gap on the first line only keeps the columns aligned.  Both
+ * helper macros are #undef'ed once the stubs have been generated.
+ */
+                                     BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
+BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
+BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
+BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
+
+#undef BUILD_16_IRQS
+#undef BI
+
+/*
+ * IRQ(x,y) expands to the identifier IRQ<x><y>_interrupt (for example
+ * IRQ0x2a_interrupt); IRQLIST_16(x) lists the sixteen identifiers that
+ * share the high digit x.  Both macros are #undef'ed after the table.
+ */
+#define IRQ(x,y) \
+       IRQ##x##y##_interrupt
+
+#define IRQLIST_16(x) \
+       IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+       IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+       IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+       IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
+
+/*
+ * Entry-stub table for the irq vectors: 224 initialisers, 0x20 .. 0xff.
+ * init_IRQ() installs interrupt[i] as the gate for vector
+ * FIRST_EXTERNAL_VECTOR + i.
+ */
+static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
+                                         IRQLIST_16(0x2), IRQLIST_16(0x3),
+       IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
+       IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
+       IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
+};
+
+#undef IRQ
+#undef IRQLIST_16
+
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86 specific things if generic specifics makes
+ * any sense at all.
+ * this file should become arch/i386/kernel/irq.c when the old irq.c
+ * moves to arch independent land
+ *
+ * State defined right below:
+ *  - i8259A_auto_eoi remembers the auto_eoi argument of the most recent
+ *    init_8259A() call so that i8259A_resume() can replay it.
+ *  - i8259A_lock is held around every 8259A port access sequence made by
+ *    the enable/disable/pending/mask-and-ack/init helpers in this file.
+ *  - i8259A_chip is the irq_chip installed for the sixteen legacy IRQs.
+ *    Its .mask_ack member is rewritten by init_8259A(): auto-EOI mode
+ *    uses disable_8259A_irq, otherwise mask_and_ack_8259A.
+ */
+
+static int i8259A_auto_eoi;
+DEFINE_SPINLOCK(i8259A_lock);
+static void mask_and_ack_8259A(unsigned int);
+
+static struct irq_chip i8259A_chip = {
+       .name           = "XT-PIC",
+       .mask           = disable_8259A_irq,
+       .disable        = disable_8259A_irq,
+       .unmask         = enable_8259A_irq,
+       .mask_ack       = mask_and_ack_8259A,
+};
+
+/*
+ * 8259A PIC functions to handle ISA devices:
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers.
+ *
+ * Bit n set means IRQ n is masked; the initial 0xffff masks everything.
+ * __byte(x,y) addresses byte x of object y, so on (little-endian) x86
+ * cached_21 is bits 0-7, the value written to port 0x21, and cached_A1
+ * is bits 8-15, the value written to port 0xA1.
+ */
+static unsigned int cached_irq_mask = 0xffff;
+
+#define __byte(x,y)    (((unsigned char *)&(y))[x])
+#define cached_21      (__byte(0,cached_irq_mask))
+#define cached_A1      (__byte(1,cached_irq_mask))
+
+/*
+ * Not all IRQs can be routed through the IO-APIC, e.g. on certain (older)
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
+ *
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * This 'mixed mode' IRQ handling costs nothing because it's only used
+ * at IRQ setup time.
+ *
+ * make_8259A_irq() clears an IRQ's bit when it hands that IRQ back to
+ * the 8259A.
+ */
+unsigned long io_apic_irqs;
+/*
+ * disable_8259A_irq - mask one legacy IRQ at the 8259A.
+ *
+ * Sets the IRQ's bit in cached_irq_mask and writes the affected half of
+ * the cache to the matching mask port: 0xA1 when bit 3 of @irq is set
+ * (IRQ 8-15), 0x21 otherwise.  Runs under i8259A_lock with interrupts off.
+ */
+void disable_8259A_irq(unsigned int irq)
+{
+       unsigned int mask = 1 << irq;
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       cached_irq_mask |= mask;
+       if (irq & 8)
+               outb(cached_A1,0xA1);
+       else
+               outb(cached_21,0x21);
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+/*
+ * enable_8259A_irq - unmask one legacy IRQ at the 8259A.
+ *
+ * Mirror image of disable_8259A_irq(): clears the IRQ's bit in
+ * cached_irq_mask and writes the affected half to port 0xA1 or 0x21.
+ */
+void enable_8259A_irq(unsigned int irq)
+{
+       unsigned int mask = ~(1 << irq);
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       cached_irq_mask &= mask;
+       if (irq & 8)
+               outb(cached_A1,0xA1);
+       else
+               outb(cached_21,0x21);
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+/*
+ * i8259A_irq_pending - test an IRQ's bit in the 8259A command-port read.
+ *
+ * Reads port 0x20 (IRQ 0-7) or 0xA0 (IRQ 8-15) under i8259A_lock and
+ * returns the masked bit: non-zero when set, 0 when clear.  The result is
+ * the raw bit value, not normalised to 1.
+ * NOTE(review): the read presumably returns the IRR, since
+ * i8259A_irq_real() always switches the chip "back to the IRR register".
+ */
+int i8259A_irq_pending(unsigned int irq)
+{
+       unsigned int mask = 1<<irq;
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       if (irq < 8)
+               ret = inb(0x20) & mask;
+       else
+               ret = inb(0xA0) & (mask >> 8);
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+
+       return ret;
+}
+/*
+ * make_8259A_irq - hand an IRQ (back) to the 8259A.
+ *
+ * With the IRQ disabled, clears its bit in io_apic_irqs and installs
+ * i8259A_chip with the level flow handler under the name "XT", then
+ * re-enables the IRQ.
+ */
+void make_8259A_irq(unsigned int irq)
+{
+       disable_irq_nosync(irq);
+       io_apic_irqs &= ~(1<<irq);
+       set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
+                                     "XT");
+       enable_irq(irq);
+}
+
+/*
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ * This has to be protected by the irq controller spinlock
+ * before being called.
+ *
+ * Returns non-zero if @irq's bit is set in the In-Service Register of
+ * the controller that owns it (master for IRQ 0-7, slave for IRQ 8-15),
+ * 0 otherwise.  The command 0x0B selects the ISR for the next read and
+ * 0x0A switches the chip back to the IRR afterwards.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+       int value;
+       int irqmask = 1<<irq;
+
+       if (irq < 8) {
+               outb(0x0B,0x20);                /* ISR register */
+               value = inb(0x20) & irqmask;
+               outb(0x0A,0x20);                /* back to the IRR register */
+               return value;
+       }
+       outb(0x0B,0xA0);                /* ISR register */
+       value = inb(0xA0) & (irqmask >> 8);
+       outb(0x0A,0xA0);                /* back to the IRR register */
+       return value;
+}
+
+/*
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI, and the order of EOI
+ * to the two 8259s is important!)
+ *
+ * Flow: an IRQ that arrives while its bit is already set in
+ * cached_irq_mask is suspicious and takes the slow path, which consults
+ * the In-Service Register.  If the IRQ is not in service either, it is
+ * counted in irq_err_count and reported once per IRQ number.  In every
+ * case control ends up at handle_real_irq, which writes the mask and
+ * sends the specific EOI (slave first, then master IRQ2, for IRQ 8-15).
+ * On the spurious path cached_irq_mask itself is left unchanged, as the
+ * bit is already set there.
+ */
+static void mask_and_ack_8259A(unsigned int irq)
+{
+       unsigned int irqmask = 1 << irq;
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+       /*
+        * Lightweight spurious IRQ detection. We do not want
+        * to overdo spurious IRQ handling - it's usually a sign
+        * of hardware problems, so we only do the checks we can
+        * do without slowing down good hardware unnecessarily.
+        *
+        * Note that IRQ7 and IRQ15 (the two spurious IRQs
+        * usually resulting from the 8259A-1|2 PICs) occur
+        * even if the IRQ is masked in the 8259A. Thus we
+        * can check spurious 8259A IRQs without doing the
+        * quite slow i8259A_irq_real() call for every IRQ.
+        * This does not cover 100% of spurious interrupts,
+        * but should be enough to warn the user that there
+        * is something bad going on ...
+        */
+       if (cached_irq_mask & irqmask)
+               goto spurious_8259A_irq;
+       cached_irq_mask |= irqmask;
+
+handle_real_irq:
+       if (irq & 8) {
+               inb(0xA1);              /* DUMMY - (do we need this?) */
+               outb(cached_A1,0xA1);
+               outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
+               outb(0x62,0x20);        /* 'Specific EOI' to master-IRQ2 */
+       } else {
+               inb(0x21);              /* DUMMY - (do we need this?) */
+               outb(cached_21,0x21);
+               outb(0x60+irq,0x20);    /* 'Specific EOI' to master */
+       }
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+       return;
+
+spurious_8259A_irq:
+       /*
+        * this is the slow path - should happen rarely.
+        */
+       if (i8259A_irq_real(irq))
+               /*
+                * oops, the IRQ _is_ in service according to the
+                * 8259A - not spurious, go handle it.
+                */
+               goto handle_real_irq;
+
+       {
+               static int spurious_irq_mask;
+               /*
+                * At this point we can be sure the IRQ is spurious,
+                * lets ACK and report it. [once per IRQ]
+                */
+               if (!(spurious_irq_mask & irqmask)) {
+                       printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+                       spurious_irq_mask |= irqmask;
+               }
+               atomic_inc(&irq_err_count);
+               /*
+                * Theoretically we do not have to handle this IRQ,
+                * but in Linux this does not cause problems and is
+                * simpler for us.
+                */
+               goto handle_real_irq;
+       }
+}
+/*
+ * init_8259A - (re)program both 8259A controllers.
+ * @auto_eoi: non-zero selects Auto-EOI mode on the master
+ *
+ * Records @auto_eoi for a later resume, masks every line, runs the
+ * ICW1..ICW4 sequence on master and slave, picks the matching
+ * i8259A_chip.mask_ack handler, waits 100us and finally restores the
+ * masks from cached_irq_mask.  Everything happens under i8259A_lock.
+ * The slave is always programmed for normal EOI.
+ */
+void init_8259A(int auto_eoi)
+{
+       unsigned long flags;
+
+       i8259A_auto_eoi = auto_eoi;
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+
+       outb(0xff, 0x21);       /* mask all of 8259A-1 */
+       outb(0xff, 0xA1);       /* mask all of 8259A-2 */
+
+       /*
+        * outb_p - this has to work on a wide range of PC hardware.
+        */
+       outb_p(0x11, 0x20);     /* ICW1: select 8259A-1 init */
+       outb_p(IRQ0_VECTOR, 0x21);      /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
+       outb_p(0x04, 0x21);     /* 8259A-1 (the master) has a slave on IR2 */
+       if (auto_eoi)
+               outb_p(0x03, 0x21);     /* master does Auto EOI */
+       else
+               outb_p(0x01, 0x21);     /* master expects normal EOI */
+
+       outb_p(0x11, 0xA0);     /* ICW1: select 8259A-2 init */
+       outb_p(IRQ8_VECTOR, 0xA1);      /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
+       outb_p(0x02, 0xA1);     /* 8259A-2 is a slave on master's IR2 */
+       outb_p(0x01, 0xA1);     /* (slave's support for AEOI in flat mode
+                                   is to be investigated) */
+
+       if (auto_eoi)
+               /*
+                * in AEOI mode we just have to mask the interrupt
+                * when acking.
+                */
+               i8259A_chip.mask_ack = disable_8259A_irq;
+       else
+               i8259A_chip.mask_ack = mask_and_ack_8259A;
+
+       udelay(100);            /* wait for 8259A to initialize */
+
+       outb(cached_21, 0x21);  /* restore master IRQ mask */
+       outb(cached_A1, 0xA1);  /* restore slave IRQ mask */
+
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static char irq_trigger[2];
+/*
+ * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ.
+ *
+ * irq_trigger[] is the save area used across suspend/resume: element 0
+ * belongs to port 0x4d0, element 1 to port 0x4d1.
+ *
+ * restore_ELCR() writes the two saved bytes in @trigger back to the
+ * registers.
+ */
+static void restore_ELCR(char *trigger)
+{
+       outb(trigger[0], 0x4d0);
+       outb(trigger[1], 0x4d1);
+}
+/* Read both ELCR ports into @trigger, dropping the reserved IRQ bits. */
+static void save_ELCR(char *trigger)
+{
+       /*
+        * IRQ 0,1,2,8,13 are marked as reserved: 0xF8 clears bits 0-2 of
+        * the first byte, 0xDE clears bits 0 and 5 (IRQ 8 and 13) of the
+        * second one.
+        */
+       trigger[0] = inb(0x4d0) & 0xF8;
+       trigger[1] = inb(0x4d1) & 0xDE;
+}
+/*
+ * sysdev resume hook: reprogram the 8259A pair with the remembered
+ * auto-EOI setting, then write back the ELCR bytes captured at suspend.
+ * Always returns 0; @dev is not used.
+ */
+static int i8259A_resume(struct sys_device *dev)
+{
+       init_8259A(i8259A_auto_eoi);
+       restore_ELCR(irq_trigger);
+       return 0;
+}
+/*
+ * sysdev suspend hook: capture the ELCR bytes into irq_trigger[].
+ * Always returns 0; @dev and @state are not used.
+ */
+static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
+{
+       save_ELCR(irq_trigger);
+       return 0;
+}
+/*
+ * sysdev shutdown hook: mask every line on both controllers.  The writes
+ * are done without taking i8259A_lock and without updating
+ * cached_irq_mask.  Always returns 0.
+ */
+static int i8259A_shutdown(struct sys_device *dev)
+{
+       /* Put the i8259A into a quiescent state that
+        * the kernel initialization code can get it
+        * out of.
+        */
+       outb(0xff, 0x21);       /* mask all of 8259A-1 */
+       outb(0xff, 0xA1);       /* mask all of 8259A-2 */
+       return 0;
+}
+/*
+ * sysdev class "i8259" wiring up the suspend/resume/shutdown hooks, and
+ * the single device (id 0) that belongs to it.
+ */
+static struct sysdev_class i8259_sysdev_class = {
+       set_kset_name("i8259"),
+       .suspend = i8259A_suspend,
+       .resume = i8259A_resume,
+       .shutdown = i8259A_shutdown,
+};
+
+static struct sys_device device_i8259A = {
+       .id     = 0,
+       .cls    = &i8259_sysdev_class,
+};
+/*
+ * Register the "i8259" sysdev class and, if that worked, its device.
+ * Returns the first error encountered or 0.  Runs as a device_initcall.
+ * If sysdev_register() fails the class stays registered.
+ */
+static int __init i8259A_init_sysfs(void)
+{
+       int error = sysdev_class_register(&i8259_sysdev_class);
+       if (!error)
+               error = sysdev_register(&device_i8259A);
+       return error;
+}
+
+device_initcall(i8259A_init_sysfs);
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ *
+ * irq2 is the irqaction that init_IRQ() installs on IRQ 2 when
+ * acpi_ioapic is not set.  It is initialised positionally;
+ * NOTE(review): the field order (handler, flags, mask, name, ...) is
+ * assumed from the values used - confirm against struct irqaction.
+ *
+ * vector_irq is a per-CPU table indexed by vector number: the entries at
+ * IRQ0_VECTOR .. IRQ15_VECTOR hold 0 .. 15, every other entry holds -1.
+ */
+
+static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+       [0 ... IRQ0_VECTOR - 1] = -1,
+       [IRQ0_VECTOR] = 0,
+       [IRQ1_VECTOR] = 1,
+       [IRQ2_VECTOR] = 2,
+       [IRQ3_VECTOR] = 3,
+       [IRQ4_VECTOR] = 4,
+       [IRQ5_VECTOR] = 5,
+       [IRQ6_VECTOR] = 6,
+       [IRQ7_VECTOR] = 7,
+       [IRQ8_VECTOR] = 8,
+       [IRQ9_VECTOR] = 9,
+       [IRQ10_VECTOR] = 10,
+       [IRQ11_VECTOR] = 11,
+       [IRQ12_VECTOR] = 12,
+       [IRQ13_VECTOR] = 13,
+       [IRQ14_VECTOR] = 14,
+       [IRQ15_VECTOR] = 15,
+       [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+/*
+ * init_ISA_irqs - put all irq descriptors into their boot-time state.
+ *
+ * After init_bsp_APIC() and init_8259A(0) (normal EOI), every irq_desc
+ * is marked IRQ_DISABLED with no action and depth 1.  IRQ 0-15 get
+ * i8259A_chip with the level flow handler; all others get no_irq_chip.
+ */
+void __init init_ISA_irqs (void)
+{
+       int i;
+
+       init_bsp_APIC();
+       init_8259A(0);
+
+       for (i = 0; i < NR_IRQS; i++) {
+               irq_desc[i].status = IRQ_DISABLED;
+               irq_desc[i].action = NULL;
+               irq_desc[i].depth = 1;
+
+               if (i < 16) {
+                       /*
+                        * 16 old-style INTA-cycle interrupts:
+                        */
+                       set_irq_chip_and_handler_name(i, &i8259A_chip,
+                                                     handle_level_irq, "XT");
+               } else {
+                       /*
+                        * 'high' PCI IRQs filled in on demand
+                        */
+                       irq_desc[i].chip = &no_irq_chip;
+               }
+       }
+}
+/*
+ * Program the timer: control word 0x34 to port 0x43, then the low and
+ * high byte of LATCH to port 0x40, with a 10us pause between the writes.
+ */
+static void setup_timer_hardware(void)
+{
+       outb_p(0x34,0x43);              /* binary, mode 2, LSB/MSB, ch 0 */
+       udelay(10);
+       outb_p(LATCH & 0xff , 0x40);    /* LSB */
+       udelay(10);
+       outb(LATCH >> 8 , 0x40);        /* MSB */
+}
+/* sysdev resume hook for the timer: reprogram it; always returns 0. */
+static int timer_resume(struct sys_device *dev)
+{
+       setup_timer_hardware();
+       return 0;
+}
+/* Non-static entry point that reprograms the timer the same way. */
+void i8254_timer_resume(void)
+{
+       setup_timer_hardware();
+}
+/* sysdev class "timer_pit" (resume hook only) and its single device. */
+static struct sysdev_class timer_sysclass = {
+       set_kset_name("timer_pit"),
+       .resume         = timer_resume,
+};
+
+static struct sys_device device_timer = {
+       .id             = 0,
+       .cls            = &timer_sysclass,
+};
+/*
+ * Register the "timer_pit" sysdev class and, if that worked, its device.
+ * Returns the first error encountered or 0.  Runs as a device_initcall.
+ */
+static int __init init_timer_sysfs(void)
+{
+       int error = sysdev_class_register(&timer_sysclass);
+       if (!error)
+               error = sysdev_register(&device_timer);
+       return error;
+}
+
+device_initcall(init_timer_sysfs);
+/*
+ * init_IRQ - boot-time interrupt setup.
+ *
+ * Order of operations:
+ *  1. init_ISA_irqs() resets the descriptors and programs the 8259A.
+ *  2. Every vector from FIRST_EXTERNAL_VECTOR up gets its stub from
+ *     interrupt[], except IA32_SYSCALL_VECTOR, which is left alone.
+ *  3. The dedicated IPI/APIC vectors are then set again with their own
+ *     handlers, replacing the generic stub installed in step 2.
+ *  4. The timer is programmed and, unless acpi_ioapic is set, the
+ *     cascade action is installed on IRQ 2.
+ */
+void __init init_IRQ(void)
+{
+       int i;
+
+       init_ISA_irqs();
+       /*
+        * Cover the whole vector space, no vector can escape
+        * us. (some of these will be overridden and become
+        * 'special' SMP interrupts)
+        */
+       for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+               int vector = FIRST_EXTERNAL_VECTOR + i;
+               if (vector != IA32_SYSCALL_VECTOR)
+                       set_intr_gate(vector, interrupt[i]);
+       }
+
+#ifdef CONFIG_SMP
+       /*
+        * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+        * IPI, driven by wakeup.
+        */
+       set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+       /* IPIs for invalidation */
+       set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+       set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+       set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+       set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+       set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+       set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+       set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+       set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+       /* IPI for generic function call */
+       set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+       /* Low priority IPI to cleanup after moving an irq */
+       set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+#endif
+       set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+       set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+
+       /* self generated IPI for local APIC timer */
+       set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+       /* IPI vectors for APIC spurious and error interrupts */
+       set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+       set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+       /*
+        * Set the clock to HZ Hz, we already have a valid
+        * vector now:
+        */
+       setup_timer_hardware();
+
+       if (!acpi_ioapic)
+               setup_irq(2, &irq2);
+}
diff --git a/arch/x86/kernel/init_task_64.c b/arch/x86/kernel/init_task_64.c
new file mode 100644 (file)
index 0000000..4ff33d4
--- /dev/null
@@ -0,0 +1,54 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+#include <linux/mqueue.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+/*
+ * Statically initialised fs, files, signal, sighand and mm structures,
+ * each built from its INIT_* macro.  Only init_mm is non-static and
+ * exported.  NOTE(review): the static ones are presumably referenced by
+ * the INIT_TASK() expansion used for init_task in this file - confirm.
+ */
+static struct fs_struct init_fs = INIT_FS;
+static struct files_struct init_files = INIT_FILES;
+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+struct mm_struct init_mm = INIT_MM(init_mm);
+
+EXPORT_SYMBOL(init_mm);
+
+/*
+ * Initial thread union (thread_info plus stack) for init_task.
+ *
+ * We need to make sure that this is 8192-byte aligned due to the
+ * way process stacks are handled. This is done by having a special
+ * "init_task" linker map entry.
+ *
+ * The attribute below only selects the ".data.init_task" section; the
+ * alignment itself has to come from how that section is laid out.
+ */
+union thread_union init_thread_union 
+       __attribute__((__section__(".data.init_task"))) =
+               { INIT_THREAD_INFO(init_task) };
+
+/*
+ * Initial task structure, statically built from INIT_TASK() and
+ * exported.
+ *
+ * All other task structs will be allocated on slabs in fork.c
+ */
+struct task_struct init_task = INIT_TASK(init_task);
+
+EXPORT_SYMBOL(init_task);
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data.cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ *
+ * Each CPU's copy starts out as INIT_TSS.
+ */ 
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+
+/* Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.  There is no initialiser, so
+ * the per-CPU copies are not filled in by this file.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+/* NOTE(review): despite its name this only selects the .data.init_task section (no aligned attribute), and nothing in this file uses it. */
+#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
new file mode 100644 (file)
index 0000000..966fa10
--- /dev/null
@@ -0,0 +1,2202 @@
+/*
+ *     Intel IO-APIC support for multi-Pentium hosts.
+ *
+ *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ *
+ *     Many thanks to Stig Venaas for trying out countless experimental
+ *     patches and reporting/debugging problems patiently!
+ *
+ *     (c) 1999, Multiple IO-APIC support, developed by
+ *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ *     further tested and cleaned up by Zach Brown <zab@redhat.com>
+ *     and Ingo Molnar <mingo@redhat.com>
+ *
+ *     Fixes
+ *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
+ *                                     thanks to Eric Gilmore
+ *                                     and Rolf G. Tews
+ *                                     for testing these extensively
+ *     Paul Diefenbaugh        :       Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
+#include <linux/sysdev.h>
+#include <linux/msi.h>
+#include <linux/htirq.h>
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+
+#include <asm/idle.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/mach_apic.h>
+#include <asm/acpi.h>
+#include <asm/dma.h>
+#include <asm/nmi.h>
+#include <asm/msidef.h>
+#include <asm/hypertransport.h>
+/*
+ * Per-IRQ vector bookkeeping.
+ *
+ * @domain and @vector are what set_ioapic_affinity_irq() reads to pick
+ * the destination and the vector written into the routing entries.
+ * NOTE(review): @old_domain, @move_cleanup_count and @move_in_progress
+ * presumably track an IRQ that is being moved between CPUs - confirm
+ * against assign_irq_vector() and the move-cleanup IPI handler.
+ */
+struct irq_cfg {
+       cpumask_t domain;
+       cpumask_t old_domain;
+       unsigned move_cleanup_count;
+       u8 vector;
+       u8 move_in_progress : 1;
+};
+
+/*
+ * irq_cfg is indexed by the sum of all RTEs in all I/O APICs.
+ *
+ * Only the sixteen legacy IRQs are pre-set here: each gets an all-CPUs
+ * domain and its IRQn_VECTOR constant.  The remaining NR_IRQS - 16
+ * entries start out zeroed.
+ */
+struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+       [0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+       [1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+       [2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+       [3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+       [4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+       [5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+       [6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+       [7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+       [8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+       [9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+       [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+       [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+       [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+       [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+       [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+       [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+};
+/* Forward declaration; the definition is not in this part of the file. */
+static int assign_irq_vector(int irq, cpumask_t mask);
+/* Annotation for APIC debug code; here simply an alias for __init. */
+#define __apicdebuginit  __init
+
+int sis_apic_bug; /* not actually supported, dummy for compile */
+/* NOTE(review): no reader or writer of no_timer_check is visible in this part of the file. */
+static int no_timer_check;
+/* Set to 1 by the "disable_timer_pin_1" boot option. */
+static int disable_timer_pin_1 __initdata;
+/* Defaults to 1; "disable_8254_timer" stores -1, "enable_8254_timer" stores 2. */
+int timer_over_8254 __initdata = 1;
+
+/* Where, if anywhere, the i8259 is connected in external int mode (-1 = not set) */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+static DEFINE_SPINLOCK(ioapic_lock);
+DEFINE_SPINLOCK(vector_lock);
+
+/*
+ * # of IRQ routing registers, one count per IO-APIC (indexed by the apic
+ * number).  clear_IO_APIC() uses it as the number of pins to walk.
+ */
+int nr_ioapic_registers[MAX_IO_APICS];
+
+/*
+ * Rough estimation of how many shared IRQs there are, can
+ * be changed anytime.
+ *
+ * PIN_MAP_SIZE is the size of irq_2_pin[]: NR_IRQS head entries plus
+ * MAX_PLUS_SHARED_IRQS overflow entries, i.e. 2 * NR_IRQS as defined.
+ */
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ *
+ * irq_2_pin[irq] (irq < NR_IRQS) is the first (apic, pin) pair of an IRQ.
+ * Further pins of the same IRQ are chained through @next, which is an
+ * index into irq_2_pin[]; 0 ends the chain.  The walkers in this file
+ * treat pin == -1 as "no pin here".
+ * NOTE(review): the array is zero-initialised as declared, so the -1
+ * markers must be written by setup code outside this view - confirm.
+ */
+
+static struct irq_pin_list {
+       short apic, pin, next;
+} irq_2_pin[PIN_MAP_SIZE];
+/*
+ * Register window of one IO-APIC as accessed by the helpers below: the
+ * register number is written to @index (offset 0x00) and the register
+ * contents are then read or written through @data (offset 0x10).
+ */
+struct io_apic {
+       unsigned int index;
+       unsigned int unused[3];
+       unsigned int data;
+};
+/*
+ * Virtual address of IO-APIC @idx's register window: the fixmap slot
+ * FIX_IO_APIC_BASE_0 + idx plus the bits of the APIC's physical address
+ * that survive "& ~PAGE_MASK".
+ */
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+       return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+               + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+}
+/*
+ * Read IO-APIC register @reg: select it through the index register, then
+ * read the data register.  No locking is done here; the callers in this
+ * file wrap the sequence in ioapic_lock.
+ */
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+       writel(reg, &io_apic->index);
+       return readl(&io_apic->data);
+}
+/*
+ * Write @value to IO-APIC register @reg: select it through the index
+ * register, then write the data register.  No locking is done here.
+ */
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+       writel(reg, &io_apic->index);
+       writel(value, &io_apic->data);
+}
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Only the data register is written, so this is correct only if nothing
+ * has changed the index register since the preceding io_apic_read().
+ */
+static inline void io_apic_modify(unsigned int apic, unsigned int value)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+       writel(value, &io_apic->data);
+}
+/*
+ * Return 1 if bit 14 (remote IRR) is set in the low routing word
+ * (register 0x10 + 2 * pin) of any pin chained to @irq, else 0.  The
+ * whole chain is walked under ioapic_lock.
+ */
+static int io_apic_level_ack_pending(unsigned int irq)
+{
+       struct irq_pin_list *entry;
+       unsigned long flags;
+       int pending = 0;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       entry = irq_2_pin + irq;
+       for (;;) {
+               unsigned int reg;
+               int pin;
+
+               pin = entry->pin;
+               if (pin == -1)
+                       break;
+               reg = io_apic_read(entry->apic, 0x10 + pin*2);
+               /* Is the remote IRR bit set? */
+               pending |= (reg >> 14) & 1;
+               if (!entry->next)
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+       return pending;
+}
+
+/*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ *
+ * The data register is read and the value discarded; the index register
+ * is left as it was.
+ */
+static inline void io_apic_sync(unsigned int apic)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+       readl(&io_apic->data);
+}
+/*
+ * __DO_ACTION expands to a complete function body that expects a
+ * variable named "irq" in scope.  For every pin chained to that irq it
+ * reads routing register 0x10 + R + pin*2, applies "reg ACTION;", writes
+ * the result back with io_apic_modify() and then runs FINAL (which may
+ * refer to "entry").  It takes no lock itself.
+ */
+#define __DO_ACTION(R, ACTION, FINAL)                                  \
+                                                                       \
+{                                                                      \
+       int pin;                                                        \
+       struct irq_pin_list *entry = irq_2_pin + irq;                   \
+                                                                       \
+       BUG_ON(irq >= NR_IRQS);                                         \
+       for (;;) {                                                      \
+               unsigned int reg;                                       \
+               pin = entry->pin;                                       \
+               if (pin == -1)                                          \
+                       break;                                          \
+               reg = io_apic_read(entry->apic, 0x10 + R + pin*2);      \
+               reg ACTION;                                             \
+               io_apic_modify(entry->apic, reg);                       \
+               FINAL;                                                  \
+               if (!entry->next)                                       \
+                       break;                                          \
+               entry = irq_2_pin + entry->next;                        \
+       }                                                               \
+}
+/*
+ * Overlay of a routing entry with the two 32-bit words it is transferred
+ * as: w1 goes through register 0x10 + 2*pin, w2 through 0x11 + 2*pin.
+ */
+union entry_union {
+       struct { u32 w1, w2; };
+       struct IO_APIC_route_entry entry;
+};
+/*
+ * Read the routing entry of (@apic, @pin): low word first, then high
+ * word, both under ioapic_lock.  The entry is returned by value.
+ */
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+       union entry_union eu;
+       unsigned long flags;
+       spin_lock_irqsave(&ioapic_lock, flags);
+       eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+       eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+       return eu.entry;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ *
+ * This variant takes no lock; ioapic_write_entry() is the wrapper that
+ * acquires ioapic_lock around it.
+ */
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+       union entry_union eu;
+       eu.entry = e;
+       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+/* Write routing entry @e to (@apic, @pin) with ioapic_lock held. */
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+       unsigned long flags;
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __ioapic_write_entry(apic, pin, e);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ *
+ * The entry written has only the mask bit set; every other field is
+ * zero, so the previous routing information of the pin is discarded.
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+       unsigned long flags;
+       union entry_union eu = { .entry.mask = 1 };
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+/*
+ * __target_IO_APIC_irq (SMP only) - retarget every pin chained to @irq.
+ *
+ * For each pin the high routing word is set to @dest, then the low 8
+ * bits of the low word are replaced by @vector with a read-modify-write.
+ * No lock is taken here; set_ioapic_affinity_irq() calls this with
+ * ioapic_lock held.
+ */
+#ifdef CONFIG_SMP
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+       int apic, pin;
+       struct irq_pin_list *entry = irq_2_pin + irq;
+
+       BUG_ON(irq >= NR_IRQS);
+       for (;;) {
+               unsigned int reg;
+               apic = entry->apic;
+               pin = entry->pin;
+               if (pin == -1)
+                       break;
+               io_apic_write(apic, 0x11 + pin*2, dest);
+               reg = io_apic_read(apic, 0x10 + pin*2);
+               reg &= ~0x000000ff;
+               reg |= vector;
+               io_apic_modify(apic, reg);
+               if (!entry->next)
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+}
+/*
+ * set_ioapic_affinity_irq - move @irq to the CPUs in @mask.
+ *
+ * Returns without doing anything when @mask contains no online CPU or
+ * when assign_irq_vector() returns non-zero.  Otherwise the destination
+ * is computed from the intersection of the IRQ's vector domain and
+ * @mask, the routing entries are retargeted under ioapic_lock and
+ * irq_desc[irq].affinity is set to @mask.
+ */
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       unsigned long flags;
+       unsigned int dest;
+       cpumask_t tmp;
+
+       cpus_and(tmp, mask, cpu_online_map);
+       if (cpus_empty(tmp))
+               return;
+
+       if (assign_irq_vector(irq, mask))
+               return;
+
+       cpus_and(tmp, cfg->domain, mask);
+       dest = cpu_mask_to_apicid(tmp);
+
+       /*
+        * Only the high 8 bits are valid.
+        */
+       dest = SET_APIC_LOGICAL_ID(dest);
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __target_IO_APIC_irq(irq, dest, cfg->vector);
+       irq_desc[irq].affinity = mask;
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+#endif
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ *
+ * The chain of @irq is followed to its last entry.  If that entry
+ * already holds a pin, a fresh overflow slot is linked in; overflow
+ * slots are handed out from index NR_IRQS upwards and the kernel panics
+ * once the counter reaches PIN_MAP_SIZE.  (@apic, @pin) is then stored
+ * in the entry that was found or allocated.
+ */
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+       static int first_free_entry = NR_IRQS;
+       struct irq_pin_list *entry = irq_2_pin + irq;
+
+       BUG_ON(irq >= NR_IRQS);
+       while (entry->next)
+               entry = irq_2_pin + entry->next;
+
+       if (entry->pin != -1) {
+               entry->next = first_free_entry;
+               entry = irq_2_pin + entry->next;
+               if (++first_free_entry >= PIN_MAP_SIZE)
+                       panic("io_apic.c: ran out of irq_2_pin entries!");
+       }
+       entry->apic = apic;
+       entry->pin = pin;
+}
+
+/*
+ * DO_ACTION defines "static void <name>_IO_APIC_irq(unsigned int irq)"
+ * with __DO_ACTION as its body.  Two functions are generated below:
+ * __mask_IO_APIC_irq() sets bit 16 of each low routing word and follows
+ * every write with io_apic_sync(); __unmask_IO_APIC_irq() clears bit 16.
+ * Neither takes ioapic_lock.
+ */
+#define DO_ACTION(name,R,ACTION, FINAL)                                        \
+                                                                       \
+       static void name##_IO_APIC_irq (unsigned int irq)               \
+       __DO_ACTION(R, ACTION, FINAL)
+
+DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
+                                               /* mask = 1 */
+DO_ACTION( __unmask,           0, &= 0xfffeffff, )
+                                               /* mask = 0 */
+/* Mask every IO-APIC pin of @irq, holding ioapic_lock for the duration. */
+static void mask_IO_APIC_irq (unsigned int irq)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __mask_IO_APIC_irq(irq);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+/* Unmask every IO-APIC pin of @irq, holding ioapic_lock for the duration. */
+static void unmask_IO_APIC_irq (unsigned int irq)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __unmask_IO_APIC_irq(irq);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+/*
+ * Reset one routing entry to "masked, everything else zero", unless the
+ * pin is currently programmed for SMI delivery, in which case it is left
+ * untouched.  The read and the write each take ioapic_lock separately.
+ */
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+       struct IO_APIC_route_entry entry;
+
+       /* Check delivery_mode to be sure we're not clearing an SMI pin */
+       entry = ioapic_read_entry(apic, pin);
+       if (entry.delivery_mode == dest_SMI)
+               return;
+       /*
+        * Disable it in the IO-APIC irq-routing table:
+        */
+       ioapic_mask_entry(apic, pin);
+}
+
+/* Run clear_IO_APIC_pin() over every pin of every registered IO-APIC. */
+static void clear_IO_APIC (void)
+{
+       int ioapic;
+
+       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
+               int line = 0;
+
+               while (line < nr_ioapic_registers[ioapic]) {
+                       clear_IO_APIC_pin(ioapic, line);
+                       line++;
+               }
+       }
+}
+
+/*
+ * Global IO-APIC setup switches; they are only defined here, their users
+ * live outside this part of the file.
+ */
+int skip_ioapic_setup;
+int ioapic_force;
+
+/*
+ * early_param handler for "noapic": calls disable_ioapic_setup() and
+ * returns 0.  The option's argument string is ignored.
+ */
+static int __init parse_noapic(char *str)
+{
+       disable_ioapic_setup();
+       return 0;
+}
+early_param("noapic", parse_noapic);
+
+/*
+ * Actually the next is obsolete, but keep it for paranoid reasons -AK
+ *
+ * __setup handler for "disable_timer_pin_1": sets disable_timer_pin_1
+ * and returns 1.  The argument string is ignored.
+ */
+static int __init disable_timer_pin_setup(char *arg)
+{
+       disable_timer_pin_1 = 1;
+       return 1;
+}
+__setup("disable_timer_pin_1", disable_timer_pin_setup);
+
+/*
+ * __setup handler for "disable_8254_timer": sets timer_over_8254 to -1.
+ * NOTE(review): the meaning of the individual timer_over_8254 values is
+ * decided by its consumer, which is outside this part of the file.
+ */
+static int __init setup_disable_8254_timer(char *s)
+{
+       timer_over_8254 = -1;
+       return 1;
+}
+/* __setup handler for "enable_8254_timer": sets timer_over_8254 to 2. */
+static int __init setup_enable_8254_timer(char *s)
+{
+       timer_over_8254 = 2;
+       return 1;
+}
+
+__setup("disable_8254_timer", setup_disable_8254_timer);
+__setup("enable_8254_timer", setup_enable_8254_timer);
+
+
+/*
+ * Look up the MP-table interrupt entry of kind @type that is routed to
+ * pin @pin of IO-APIC @apic.  Entries addressed to MP_APIC_ALL match any
+ * IO-APIC.
+ *
+ * Returns the index into mp_irqs[] of the first match, or -1 if none.
+ */
+static int find_irq_entry(int apic, int pin, int type)
+{
+       int idx;
+
+       for (idx = 0; idx < mp_irq_entries; idx++) {
+               if (mp_irqs[idx].mpc_irqtype != type)
+                       continue;
+               if (mp_irqs[idx].mpc_dstapic != mp_ioapics[apic].mpc_apicid &&
+                   mp_irqs[idx].mpc_dstapic != MP_APIC_ALL)
+                       continue;
+               if (mp_irqs[idx].mpc_dstirq == pin)
+                       return idx;
+       }
+
+       return -1;
+}
+
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected.
+ *
+ * Scans the MP table for the first entry of kind @type on a non-PCI bus
+ * whose source bus IRQ equals @irq and returns its destination pin, or
+ * -1 when there is no such entry.
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+       int idx;
+
+       for (idx = 0; idx < mp_irq_entries; idx++) {
+               int srcbus = mp_irqs[idx].mpc_srcbus;
+
+               if (!test_bit(srcbus, mp_bus_not_pci))
+                       continue;
+               if (mp_irqs[idx].mpc_irqtype != type)
+                       continue;
+               if (mp_irqs[idx].mpc_srcbusirq != irq)
+                       continue;
+
+               return mp_irqs[idx].mpc_dstirq;
+       }
+       return -1;
+}
+
+/*
+ * Find the IO-APIC to which IRQ[irq] (ISA) is connected.
+ *
+ * Uses the same MP-table match as find_isa_irq_pin() and then translates
+ * the entry's destination APIC id into an index into mp_ioapics[].
+ * Returns that index, or -1 if no entry or no IO-APIC matches.
+ */
+static int __init find_isa_irq_apic(int irq, int type)
+{
+       int idx, ioapic;
+
+       /* Locate the first non-PCI MP entry for this source IRQ and type. */
+       for (idx = 0; idx < mp_irq_entries; idx++) {
+               int srcbus = mp_irqs[idx].mpc_srcbus;
+
+               if (!test_bit(srcbus, mp_bus_not_pci))
+                       continue;
+               if (mp_irqs[idx].mpc_irqtype == type &&
+                   mp_irqs[idx].mpc_srcbusirq == irq)
+                       break;
+       }
+       if (idx >= mp_irq_entries)
+               return -1;
+
+       /* Map the entry's destination APIC id onto an IO-APIC index. */
+       for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
+               if (mp_ioapics[ioapic].mpc_apicid == mp_irqs[idx].mpc_dstapic)
+                       return ioapic;
+
+       return -1;
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ *
+ * @bus:  MP-table bus number of the device
+ * @slot: PCI slot; compared with bits 6:2 of an entry's mpc_srcbusirq
+ * @pin:  interrupt pin; compared with bits 1:0 of mpc_srcbusirq
+ *
+ * Returns the IRQ of the entry matching bus, slot and pin exactly.  If
+ * no entry matches the pin, the IRQ of the first entry matching bus and
+ * slot is returned as a best guess.  Returns -1 if @bus has no PCI bus
+ * mapping or nothing matches at all.
+ */
+static int pin_2_irq(int idx, int apic, int pin);
+
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+       int apic, i, best_guess = -1;
+
+       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+               bus, slot, pin);
+       if (mp_bus_id_to_pci_bus[bus] == -1) {
+               apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+               return -1;
+       }
+       for (i = 0; i < mp_irq_entries; i++) {
+               int lbus = mp_irqs[i].mpc_srcbus;
+
+               /*
+                * Resolve the entry's destination to an IO-APIC index.
+                * NOTE(review): if nothing matches, apic is left equal to
+                * nr_ioapics and is still handed to pin_2_irq() below.
+                */
+               for (apic = 0; apic < nr_ioapics; apic++)
+                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+                               break;
+
+               /* PCI bus, irqtype 0, same bus and same slot */
+               if (!test_bit(lbus, mp_bus_not_pci) &&
+                   !mp_irqs[i].mpc_irqtype &&
+                   (bus == lbus) &&
+                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
+                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+
+                       /* On IO-APIC 0, skip IRQs rejected by IO_APIC_IRQ() */
+                       if (!(apic || IO_APIC_IRQ(irq)))
+                               continue;
+
+                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+                               return irq;
+                       /*
+                        * Use the first all-but-pin matching entry as a
+                        * best-guess fuzzy result for broken mptables.
+                        */
+                       if (best_guess < 0)
+                               best_guess = irq;
+               }
+       }
+       BUG_ON(best_guess >= NR_IRQS);
+       return best_guess;
+}
+
+/*
+ * ISA interrupts listed as "conforming" in the MP table are always
+ * edge triggered (trigger 0) with polarity zero (active high).
+ */
+
+#define default_ISA_trigger(idx)       (0)
+#define default_ISA_polarity(idx)      (0)
+
+/*
+ * PCI interrupts listed as "conforming" in the MP table are always
+ * level triggered (trigger 1) with polarity one (active low).
+ */
+
+#define default_PCI_trigger(idx)       (1)
+#define default_PCI_polarity(idx)      (1)
+
+/*
+ * Determine the line polarity of MP-table interrupt entry @idx from
+ * bits 1:0 of its mpc_irqflag.
+ *
+ * Returns 0 for active high and 1 for active low.  A "conforming" entry
+ * takes the default of its bus type (ISA vs. PCI); a reserved encoding
+ * is reported as a broken BIOS and treated as active low.
+ */
+static int __init MPBIOS_polarity(int idx)
+{
+       int bus = mp_irqs[idx].mpc_srcbus;
+
+       switch (mp_irqs[idx].mpc_irqflag & 3) {
+       case 0:         /* conforms, ie. bus-type dependent polarity */
+               return test_bit(bus, mp_bus_not_pci) ?
+                       default_ISA_polarity(idx) : default_PCI_polarity(idx);
+       case 1:         /* high active */
+               return 0;
+       case 3:         /* low active */
+               return 1;
+       case 2:         /* reserved */
+       default:        /* invalid */
+               printk(KERN_WARNING "broken BIOS!!\n");
+               return 1;
+       }
+}
+
+/*
+ * Determine the trigger mode of MP-table interrupt entry @idx from
+ * bits 3:2 of its mpc_irqflag.
+ *
+ * Returns 0 for edge and 1 for level.  A "conforming" entry takes the
+ * default of its bus type (ISA vs. PCI); a reserved encoding is reported
+ * as a broken BIOS and treated as level triggered.
+ */
+static int MPBIOS_trigger(int idx)
+{
+       int bus = mp_irqs[idx].mpc_srcbus;
+
+       switch ((mp_irqs[idx].mpc_irqflag >> 2) & 3) {
+       case 0:         /* conforms, ie. bus-type dependent */
+               return test_bit(bus, mp_bus_not_pci) ?
+                       default_ISA_trigger(idx) : default_PCI_trigger(idx);
+       case 1:         /* edge */
+               return 0;
+       case 2:         /* reserved */
+               printk(KERN_WARNING "broken BIOS!!\n");
+               return 1;
+       case 3:         /* level */
+               return 1;
+       default:        /* invalid */
+               printk(KERN_WARNING "broken BIOS!!\n");
+               return 0;
+       }
+}
+
+/* Polarity of MP-table entry @idx; forwards to MPBIOS_polarity(). */
+static inline int irq_polarity(int idx)
+{
+       return MPBIOS_polarity(idx);
+}
+
+/* Trigger mode of MP-table entry @idx; forwards to MPBIOS_trigger(). */
+static inline int irq_trigger(int idx)
+{
+       return MPBIOS_trigger(idx);
+}
+
+/*
+ * Translate pin @pin of IO-APIC @apic, described by MP-table entry @idx,
+ * into an IRQ number.
+ *
+ * Entries on a non-PCI bus use the source bus IRQ directly.  PCI entries
+ * are numbered in order: the pin count of every lower-numbered IO-APIC
+ * plus @pin.  BUGs if the result is not below NR_IRQS.
+ */
+static int pin_2_irq(int idx, int apic, int pin)
+{
+       int bus = mp_irqs[idx].mpc_srcbus;
+       int irq;
+
+       /*
+        * Debugging check, we are in big trouble if this message pops up!
+        */
+       if (mp_irqs[idx].mpc_dstirq != pin)
+               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+       if (test_bit(bus, mp_bus_not_pci)) {
+               irq = mp_irqs[idx].mpc_srcbusirq;
+       } else {
+               int n;
+
+               /* PCI IRQs are mapped in order */
+               irq = pin;
+               for (n = 0; n < apic; n++)
+                       irq += nr_ioapic_registers[n];
+       }
+
+       BUG_ON(irq >= NR_IRQS);
+       return irq;
+}
+
+/*
+ * Choose an interrupt vector for @irq that is usable on at least one
+ * online CPU in @mask and record it in irq_cfg[irq] and in the per-cpu
+ * vector_irq tables.  assign_irq_vector() calls this with vector_lock
+ * held.
+ *
+ * Returns 0 on success, or if the IRQ already has a vector whose domain
+ * intersects @mask; -EBUSY while a previous move of this IRQ is still in
+ * progress or being cleaned up; -ENOSPC if no free vector was found.
+ */
+static int __assign_irq_vector(int irq, cpumask_t mask)
+{
+       /*
+        * NOTE! The local APIC isn't very good at handling
+        * multiple interrupts at the same interrupt level.
+        * As the interrupt level is determined by taking the
+        * vector number and shifting that right by 4, we
+        * want to spread these out a bit so that they don't
+        * all fall in the same interrupt level.
+        *
+        * Also, we've got to be careful not to trash gate
+        * 0x80, because int 0x80 is hm, kind of importantish. ;)
+        */
+       /* Search cursor, kept across calls so each search resumes here */
+       static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+       unsigned int old_vector;
+       int cpu;
+       struct irq_cfg *cfg;
+
+       BUG_ON((unsigned)irq >= NR_IRQS);
+       cfg = &irq_cfg[irq];
+
+       /* Only try and allocate irqs on cpus that are online */
+       cpus_and(mask, mask, cpu_online_map);
+
+       if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+               return -EBUSY;
+
+       /* Existing vector already reaches a requested cpu: nothing to do */
+       old_vector = cfg->vector;
+       if (old_vector) {
+               cpumask_t tmp;
+               cpus_and(tmp, cfg->domain, mask);
+               if (!cpus_empty(tmp))
+                       return 0;
+       }
+
+       for_each_cpu_mask(cpu, mask) {
+               cpumask_t domain, new_mask;
+               int new_cpu;
+               int vector, offset;
+
+               domain = vector_allocation_domain(cpu);
+               cpus_and(new_mask, domain, cpu_online_map);
+
+               vector = current_vector;
+               offset = current_offset;
+next:
+               /* Step by 8 so consecutive allocations spread out */
+               vector += 8;
+               if (vector >= FIRST_SYSTEM_VECTOR) {
+                       /* If we run out of vectors on large boxen, must share them. */
+                       offset = (offset + 1) % 8;
+                       vector = FIRST_DEVICE_VECTOR + offset;
+               }
+               /* Back at the starting point: no free vector for this cpu */
+               if (unlikely(current_vector == vector))
+                       continue;
+               if (vector == IA32_SYSCALL_VECTOR)
+                       goto next;
+               /* The vector must be unused (-1) on every cpu of the domain */
+               for_each_cpu_mask(new_cpu, new_mask)
+                       if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+                               goto next;
+               /* Found one! */
+               current_vector = vector;
+               current_offset = offset;
+               /* Replacing an old vector: remember its domain for cleanup */
+               if (old_vector) {
+                       cfg->move_in_progress = 1;
+                       cfg->old_domain = cfg->domain;
+               }
+               for_each_cpu_mask(new_cpu, new_mask)
+                       per_cpu(vector_irq, new_cpu)[vector] = irq;
+               cfg->vector = vector;
+               cfg->domain = domain;
+               return 0;
+       }
+       return -ENOSPC;
+}
+
+/*
+ * Locked wrapper: run __assign_irq_vector() while holding vector_lock
+ * and return its result.
+ */
+static int assign_irq_vector(int irq, cpumask_t mask)
+{
+       int err;
+       unsigned long flags;
+
+       spin_lock_irqsave(&vector_lock, flags);
+       err = __assign_irq_vector(irq, mask);
+       spin_unlock_irqrestore(&vector_lock, flags);
+       return err;
+}
+
+/*
+ * Release the vector held by @irq: mark it free (-1) in vector_irq on
+ * every online cpu of the IRQ's domain, then reset the IRQ's vector to 0
+ * and its domain to CPU_MASK_NONE.  BUGs if @irq is out of range or has
+ * no vector assigned.
+ * NOTE(review): takes no lock itself; presumably callers hold
+ * vector_lock -- confirm against the call sites.
+ */
+static void __clear_irq_vector(int irq)
+{
+       struct irq_cfg *cfg;
+       cpumask_t mask;
+       int cpu, vector;
+
+       BUG_ON((unsigned)irq >= NR_IRQS);
+       cfg = &irq_cfg[irq];
+       BUG_ON(!cfg->vector);
+
+       vector = cfg->vector;
+       cpus_and(mask, cfg->domain, cpu_online_map);
+       for_each_cpu_mask(cpu, mask)
+               per_cpu(vector_irq, cpu)[vector] = -1;
+
+       cfg->vector = 0;
+       cfg->domain = CPU_MASK_NONE;
+}
+
+/*
+ * Bring the vector_irq table of @cpu in line with irq_cfg[]:
+ * every IRQ whose domain contains @cpu gets its vector entered, and
+ * every remaining entry that refers to an IRQ whose domain does not
+ * contain @cpu is reset to -1 (free).
+ */
+void __setup_vector_irq(int cpu)
+{
+       /* Initialize vector_irq on a new cpu */
+       /* This function must be called with vector_lock held */
+       int irq, vector;
+
+       /* Mark the inuse vectors */
+       for (irq = 0; irq < NR_IRQS; ++irq) {
+               if (!cpu_isset(cpu, irq_cfg[irq].domain))
+                       continue;
+               vector = irq_cfg[irq].vector;
+               per_cpu(vector_irq, cpu)[vector] = irq;
+       }
+       /* Mark the free vectors */
+       for (vector = 0; vector < NR_VECTORS; ++vector) {
+               irq = per_cpu(vector_irq, cpu)[vector];
+               if (irq < 0)
+                       continue;
+               if (!cpu_isset(cpu, irq_cfg[irq].domain))
+                       per_cpu(vector_irq, cpu)[vector] = -1;
+       }
+}
+
+
+static struct irq_chip ioapic_chip;
+
+/*
+ * Attach ioapic_chip and a flow handler to @irq according to @trigger:
+ * non-zero (level) sets IRQ_LEVEL and installs handle_fasteoi_irq,
+ * zero (edge) clears IRQ_LEVEL and installs handle_edge_irq.
+ */
+static void ioapic_register_intr(int irq, unsigned long trigger)
+{
+       if (trigger) {
+               irq_desc[irq].status |= IRQ_LEVEL;
+               set_irq_chip_and_handler_name(irq, &ioapic_chip,
+                                             handle_fasteoi_irq, "fasteoi");
+       } else {
+               irq_desc[irq].status &= ~IRQ_LEVEL;
+               set_irq_chip_and_handler_name(irq, &ioapic_chip,
+                                             handle_edge_irq, "edge");
+       }
+}
+
+/*
+ * Program the redirection entry for pin @pin of IO-APIC @apic so that it
+ * delivers @irq with the given @trigger and @polarity.
+ *
+ * Does nothing if IO_APIC_IRQ(irq) is false or if no vector can be
+ * assigned.  Level-triggered entries are written masked; edge-triggered
+ * entries are written unmasked.
+ */
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+                             int trigger, int polarity)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       struct IO_APIC_route_entry entry;
+       cpumask_t mask;
+
+       if (!IO_APIC_IRQ(irq))
+               return;
+
+       mask = TARGET_CPUS;
+       if (assign_irq_vector(irq, mask))
+               return;
+
+       /* Restrict the destination to the domain the vector was placed in */
+       cpus_and(mask, cfg->domain, mask);
+
+       apic_printk(APIC_VERBOSE,KERN_DEBUG
+                   "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
+                   "IRQ %d Mode:%i Active:%i)\n",
+                   apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+                   irq, trigger, polarity);
+
+       /*
+        * add it to the IO-APIC irq-routing table:
+        */
+       memset(&entry,0,sizeof(entry));
+
+       entry.delivery_mode = INT_DELIVERY_MODE;
+       entry.dest_mode = INT_DEST_MODE;
+       entry.dest = cpu_mask_to_apicid(mask);
+       entry.mask = 0;                         /* enable IRQ */
+       entry.trigger = trigger;
+       entry.polarity = polarity;
+       entry.vector = cfg->vector;
+
+       /* Mask level triggered irqs.
+        * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+        */
+       if (trigger)
+               entry.mask = 1;
+
+       ioapic_register_intr(irq, trigger);
+       /* IRQs below 16 are also disabled at the 8259A */
+       if (irq < 16)
+               disable_8259A_irq(irq);
+
+       ioapic_write_entry(apic, pin, entry);
+}
+
+/*
+ * Walk every pin of every IO-APIC.  For each pin that has an mp_INT
+ * entry in the MP table, derive its IRQ, record the IRQ->pin mapping and
+ * program the redirection entry.  Pins without an entry are collected
+ * into a single verbose "not connected" log line.
+ */
+static void __init setup_IO_APIC_irqs(void)
+{
+       int apic, pin, idx, irq, first_notcon = 1;
+
+       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+               idx = find_irq_entry(apic,pin,mp_INT);
+               if (idx == -1) {
+                       /* First unconnected pin opens the log line */
+                       if (first_notcon) {
+                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+                               first_notcon = 0;
+                       } else
+                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+                       continue;
+               }
+
+               irq = pin_2_irq(idx, apic, pin);
+               add_pin_to_irq(irq, apic, pin);
+
+               setup_IO_APIC_irq(apic, pin, irq,
+                                 irq_trigger(idx), irq_polarity(idx));
+       }
+       }
+
+       /* Close the log line if at least one pin was unconnected */
+       if (!first_notcon)
+               apic_printk(APIC_VERBOSE," not connected.\n");
+}
+
+/*
+ * Set up the 8259A-master output pin as broadcast to all
+ * CPUs.
+ *
+ * @apic:   index of the IO-APIC the pin belongs to
+ * @pin:    pin the timer interrupt is routed through
+ * @vector: vector to deliver
+ *
+ * IRQ0 is disabled at the 8259A and LVT0 is masked while the pin is
+ * programmed as an unmasked, edge-triggered, active-high entry; IRQ0 is
+ * re-enabled at the 8259A afterwards.
+ */
+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+{
+       struct IO_APIC_route_entry entry;
+
+       memset(&entry,0,sizeof(entry));
+
+       disable_8259A_irq(0);
+
+       /* mask LVT0 */
+       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+
+       /*
+        * We use logical delivery to get the timer IRQ
+        * to the first CPU.
+        */
+       entry.dest_mode = INT_DEST_MODE;
+       entry.mask = 0;                                 /* unmask IRQ now */
+       entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
+       entry.delivery_mode = INT_DELIVERY_MODE;
+       entry.polarity = 0;
+       entry.trigger = 0;
+       entry.vector = vector;
+
+       /*
+        * The timer IRQ doesn't have to know that behind the
+        * scene we have a 8259A-master in AEOI mode ...
+        */
+       set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
+
+       /*
+        * Add it to the IO-APIC irq-routing table.  Use the same helper as
+        * setup_IO_APIC_irq() and disable_IO_APIC() instead of writing the
+        * two register words by hand through pointer casts.
+        */
+       ioapic_write_entry(apic, pin, entry);
+
+       enable_8259A_irq(0);
+}
+
+/*
+ * Dump the state of all IO-APICs to the kernel log: registers #00-#02,
+ * the complete redirection table of each IO-APIC, and finally the
+ * IRQ -> (apic:pin) chains held in irq_2_pin[].
+ *
+ * Prints nothing when apic_verbosity is APIC_QUIET.
+ */
+void __apicdebuginit print_IO_APIC(void)
+{
+       int apic, i;
+       union IO_APIC_reg_00 reg_00;
+       union IO_APIC_reg_01 reg_01;
+       union IO_APIC_reg_02 reg_02;
+       unsigned long flags;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+       for (i = 0; i < nr_ioapics; i++)
+               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+
+       /*
+        * We are a bit conservative about what we expect.  We have to
+        * know about every hardware change ASAP.
+        */
+       printk(KERN_INFO "testing the IO APIC.......................\n");
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+
+       /* Register #02 is only read (and printed) for version >= 0x10 */
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(apic, 0);
+       reg_01.raw = io_apic_read(apic, 1);
+       if (reg_01.bits.version >= 0x10)
+               reg_02.raw = io_apic_read(apic, 2);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       printk("\n");
+       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
+
+       /* Use the union's raw view, as for registers #00 and #02 */
+       printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
+       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
+
+       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
+       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
+
+       if (reg_01.bits.version >= 0x10) {
+               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
+       }
+
+       printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+       printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+                         " Stat Dmod Deli Vect:   \n");
+
+       /* bits.entries is the highest entry index, hence "<=" */
+       for (i = 0; i <= reg_01.bits.entries; i++) {
+               struct IO_APIC_route_entry entry;
+
+               entry = ioapic_read_entry(apic, i);
+
+               printk(KERN_DEBUG " %02x %03X ",
+                       i,
+                       entry.dest
+               );
+
+               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
+                       entry.mask,
+                       entry.trigger,
+                       entry.irr,
+                       entry.polarity,
+                       entry.delivery_status,
+                       entry.dest_mode,
+                       entry.delivery_mode,
+                       entry.vector
+               );
+       }
+       }
+       printk(KERN_DEBUG "IRQ to pin mappings:\n");
+       for (i = 0; i < NR_IRQS; i++) {
+               struct irq_pin_list *entry = irq_2_pin + i;
+               /* A negative pin marks an IRQ with no pin attached */
+               if (entry->pin < 0)
+                       continue;
+               printk(KERN_DEBUG "IRQ%d ", i);
+               /* Follow the ->next chain; 0 terminates it */
+               for (;;) {
+                       printk("-> %d:%d", entry->apic, entry->pin);
+                       if (!entry->next)
+                               break;
+                       entry = irq_2_pin + entry->next;
+               }
+               printk("\n");
+       }
+
+       printk(KERN_INFO ".................................... done.\n");
+}
+
+#if 0
+
+/*
+ * Print eight 32-bit local APIC registers, located at @base,
+ * @base + 0x10, ... @base + 0x70, one per line as a string of '0'/'1'
+ * characters with bit 0 first.  Silent when apic_verbosity is
+ * APIC_QUIET.  Currently compiled out by the enclosing #if 0.
+ */
+static __apicdebuginit void print_APIC_bitfield (int base)
+{
+       unsigned int v;
+       int i, j;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
+       for (i = 0; i < 8; i++) {
+               v = apic_read(base + i*0x10);
+               for (j = 0; j < 32; j++) {
+                       /* NOTE(review): 1<<31 shifts into the sign bit of int */
+                       if (v & (1<<j))
+                               printk("1");
+                       else
+                               printk("0");
+               }
+               printk("\n");
+       }
+}
+
+/*
+ * Dump the local APIC registers of the executing CPU to the kernel log.
+ * @dummy is unused; the signature matches what on_each_cpu() expects
+ * (see print_all_local_APICs()).  Silent when apic_verbosity is
+ * APIC_QUIET.  Currently compiled out by the enclosing #if 0.
+ */
+void __apicdebuginit print_local_APIC(void * dummy)
+{
+       unsigned int v, ver, maxlvt;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+               smp_processor_id(), hard_smp_processor_id());
+       v = apic_read(APIC_ID);
+       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
+       v = apic_read(APIC_LVR);
+       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+       /* NOTE(review): ver is assigned but never read in this function */
+       ver = GET_APIC_VERSION(v);
+       maxlvt = get_maxlvt();
+
+       v = apic_read(APIC_TASKPRI);
+       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+       v = apic_read(APIC_ARBPRI);
+       printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+               v & APIC_ARBPRI_MASK);
+       v = apic_read(APIC_PROCPRI);
+       printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+
+       v = apic_read(APIC_EOI);
+       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+       v = apic_read(APIC_RRR);
+       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+       v = apic_read(APIC_LDR);
+       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+       v = apic_read(APIC_DFR);
+       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+       v = apic_read(APIC_SPIV);
+       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+       printk(KERN_DEBUG "... APIC ISR field:\n");
+       print_APIC_bitfield(APIC_ISR);
+       printk(KERN_DEBUG "... APIC TMR field:\n");
+       print_APIC_bitfield(APIC_TMR);
+       printk(KERN_DEBUG "... APIC IRR field:\n");
+       print_APIC_bitfield(APIC_IRR);
+
+       v = apic_read(APIC_ESR);
+       printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+
+       v = apic_read(APIC_ICR);
+       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
+       v = apic_read(APIC_ICR2);
+       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+
+       v = apic_read(APIC_LVTT);
+       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+       if (maxlvt > 3) {                       /* PC is LVT#4. */
+               v = apic_read(APIC_LVTPC);
+               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+       }
+       v = apic_read(APIC_LVT0);
+       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+       v = apic_read(APIC_LVT1);
+       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+       if (maxlvt > 2) {                       /* ERR is LVT#3. */
+               v = apic_read(APIC_LVTERR);
+               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+       }
+
+       v = apic_read(APIC_TMICT);
+       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+       v = apic_read(APIC_TMCCT);
+       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+       v = apic_read(APIC_TDCR);
+       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+       printk("\n");
+}
+
+/*
+ * Run print_local_APIC() through on_each_cpu().  Currently compiled out
+ * by the enclosing #if 0.
+ */
+void print_all_local_APICs (void)
+{
+       on_each_cpu(print_local_APIC, NULL, 1, 1);
+}
+
+/*
+ * Dump the IMR, IRR, ISR and ELCR values of the two 8259A PICs to the
+ * kernel log, each as a 16-bit value with the port 0xa0/0xa1 (or 0x4d1)
+ * byte in the high half.  Silent when apic_verbosity is APIC_QUIET.
+ * Currently compiled out by the enclosing #if 0.
+ */
+void __apicdebuginit print_PIC(void)
+{
+       unsigned int v;
+       unsigned long flags;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+       spin_lock_irqsave(&i8259A_lock, flags);
+
+       v = inb(0xa1) << 8 | inb(0x21);
+       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
+
+       v = inb(0xa0) << 8 | inb(0x20);
+       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
+
+       /* Write 0x0b to both command ports, read (ISR), then write 0x0a back */
+       outb(0x0b,0xa0);
+       outb(0x0b,0x20);
+       v = inb(0xa0) << 8 | inb(0x20);
+       outb(0x0a,0xa0);
+       outb(0x0a,0x20);
+
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+
+       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
+
+       v = inb(0x4d1) << 8 | inb(0x4d0);
+       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+
+#endif  /*  0  */
+
+/*
+ * Boot-time IO-APIC preparation:
+ *  - reset the whole irq_2_pin[] table to "no pin" (pin = -1, next = 0),
+ *  - read the pin count of every IO-APIC into nr_ioapic_registers[],
+ *  - find the pin the i8259 is wired to, either from an unmasked ExtINT
+ *    entry already present in the hardware or, failing that, from the
+ *    MP table, and store it in ioapic_i8259,
+ *  - finally clear every redirection entry via clear_IO_APIC().
+ */
+static void __init enable_IO_APIC(void)
+{
+       union IO_APIC_reg_01 reg_01;
+       int i8259_apic, i8259_pin;
+       int i, apic;
+       unsigned long flags;
+
+       for (i = 0; i < PIN_MAP_SIZE; i++) {
+               irq_2_pin[i].pin = -1;
+               irq_2_pin[i].next = 0;
+       }
+
+       /*
+        * The number of IO-APIC IRQ registers (== #pins):
+        */
+       for (apic = 0; apic < nr_ioapics; apic++) {
+               spin_lock_irqsave(&ioapic_lock, flags);
+               reg_01.raw = io_apic_read(apic, 1);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+               /* bits.entries is the highest index, so the count is +1 */
+               nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+       }
+       for(apic = 0; apic < nr_ioapics; apic++) {
+               int pin;
+               /* See if any of the pins is in ExtINT mode */
+               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+                       struct IO_APIC_route_entry entry;
+                       entry = ioapic_read_entry(apic, pin);
+
+                       /* If the interrupt line is enabled and in ExtInt mode
+                        * I have found the pin where the i8259 is connected.
+                        */
+                       if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+                               ioapic_i8259.apic = apic;
+                               ioapic_i8259.pin  = pin;
+                               goto found_i8259;
+                       }
+               }
+       }
+ found_i8259:
+       /* Look to see if the MP table has reported the ExtINT */
+       i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
+       i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+       /* Trust the MP table if nothing is setup in the hardware */
+       if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+               ioapic_i8259.pin  = i8259_pin;
+               ioapic_i8259.apic = i8259_apic;
+       }
+       /* Complain if the MP table and the hardware disagree */
+       if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+               (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+       {
+               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+       }
+
+       /*
+        * Do not trust the IO-APIC being empty at bootup
+        */
+       clear_IO_APIC();
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ *
+ * Clears all IO-APIC redirection entries.  If an i8259 pin is known
+ * (ioapic_i8259.pin != -1), that pin is reprogrammed as an unmasked
+ * ExtINT entry aimed at the local APIC id of the executing CPU.  Finally
+ * disconnect_bsp_APIC() is called and told whether such a pin exists.
+ */
+void disable_IO_APIC(void)
+{
+       /*
+        * Clear the IO-APIC before rebooting:
+        */
+       clear_IO_APIC();
+
+       /*
+        * If the i8259 is routed through an IOAPIC
+        * Put that IOAPIC in virtual wire mode
+        * so legacy interrupts can be delivered.
+        */
+       if (ioapic_i8259.pin != -1) {
+               struct IO_APIC_route_entry entry;
+
+               memset(&entry, 0, sizeof(entry));
+               entry.mask            = 0; /* Enabled */
+               entry.trigger         = 0; /* Edge */
+               entry.irr             = 0;
+               entry.polarity        = 0; /* High */
+               entry.delivery_status = 0;
+               entry.dest_mode       = 0; /* Physical */
+               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
+               entry.vector          = 0;
+               entry.dest          = GET_APIC_ID(apic_read(APIC_ID));
+
+               /*
+                * Add it to the IO-APIC irq-routing table:
+                */
+               ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+       }
+
+       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+}
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ *     - timer IRQ defaults to IO-APIC IRQ
+ *     - if this function detects that timer IRQs are defunct, then we fall
+ *       back to ISA timer IRQs
+ *
+ * Returns 1 if more than four jiffies elapsed during a delay of ten tick
+ * periods, 0 otherwise.  Local interrupts are enabled by this function
+ * and are left enabled on return.
+ */
+static int __init timer_irq_works(void)
+{
+       unsigned long t1 = jiffies;
+
+       local_irq_enable();
+       /* Let ten ticks pass... */
+       mdelay((10 * 1000) / HZ);
+
+       /*
+        * Expect a few ticks at least, to be sure some possible
+        * glue logic does not lock up after one or two first
+        * ticks in a non-ExtINT mode.  Also the local APIC
+        * might have cached one ExtINT interrupt.  Finally, at
+        * least one tick may be lost due to delays.
+        */
+
+       /*
+        * A jiffies wrap is harmless here: t1 is unsigned long, so the
+        * difference is computed modulo the word size (assuming jiffies
+        * itself is an unsigned long counter -- confirm).
+        */
+       if (jiffies - t1 > 4)
+               return 1;
+       return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Starting up an edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+
+/*
+ * Unmask @irq at the IO-APIC.  For IRQs below 16 the line is first
+ * disabled at the 8259A and checked there for a pending interrupt.
+ * Returns 1 if such a pending interrupt was seen, 0 otherwise.
+ */
+static unsigned int startup_ioapic_irq(unsigned int irq)
+{
+       int was_pending = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       if (irq < 16) {
+               disable_8259A_irq(irq);
+               if (i8259A_irq_pending(irq))
+                       was_pending = 1;
+       }
+       __unmask_IO_APIC_irq(irq);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return was_pending;
+}
+
+/*
+ * Re-raise @irq by sending an IPI carrying the IRQ's vector to the first
+ * CPU of its vector domain.  Runs under vector_lock; always returns 1.
+ */
+static int ioapic_retrigger_irq(unsigned int irq)
+{
+       struct irq_cfg *cfg = &irq_cfg[irq];
+       cpumask_t mask;
+       unsigned long flags;
+
+       spin_lock_irqsave(&vector_lock, flags);
+       /* Build a mask holding only the first cpu of the domain */
+       cpus_clear(mask);
+       cpu_set(first_cpu(cfg->domain), mask);
+
+       send_IPI_mask(mask, cfg->vector);
+       spin_unlock_irqrestore(&vector_lock, flags);
+
+       return 1;
+}
+
+/*
+ * Level and edge triggered IO-APIC interrupts need different handling,
+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
+ * handled with the level-triggered descriptor, but that one has slightly
+ * more overhead. Level-triggered interrupts cannot be handled with the
+ * edge-triggered handler, without risking IRQ storms and other ugly
+ * races.
+ */
+
+#ifdef CONFIG_SMP
+asmlinkage void smp_irq_move_cleanup_interrupt(void)
+{
+       unsigned vector, me;
+       ack_APIC_irq();
+       exit_idle();
+       irq_enter();
+
+       me = smp_processor_id();
+       for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+               unsigned int irq;
+               struct irq_desc *desc;
+               struct irq_cfg *cfg;
+               irq = __get_cpu_var(vector_irq)[vector];
+               if (irq >= NR_IRQS)
+                       continue;
+
+               desc = irq_desc + irq;
+               cfg = irq_cfg + irq;
+               spin_lock(&desc->lock);
+               if (!cfg->move_cleanup_count)
+                       goto unlock;
+
+               if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+                       goto unlock;
+
+               __get_cpu_var(vector_irq)[vector] = -1;
+               cfg->move_cleanup_count--;
+unlock:
+               spin_unlock(&desc->lock);
+       }
+
+       irq_exit();
+}
+
+static void irq_complete_move(unsigned int irq)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       unsigned vector, me;
+
+       if (likely(!cfg->move_in_progress))
+               return;
+
+       vector = ~get_irq_regs()->orig_rax;
+       me = smp_processor_id();
+       if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
+               cpumask_t cleanup_mask;
+
+               cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+               cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+               cfg->move_in_progress = 0;
+       }
+}
+#else
+static inline void irq_complete_move(unsigned int irq) {}
+#endif
+
+static void ack_apic_edge(unsigned int irq)
+{
+       irq_complete_move(irq);
+       move_native_irq(irq);
+       ack_APIC_irq();
+}
+
+static void ack_apic_level(unsigned int irq)
+{
+       int do_unmask_irq = 0;
+
+       irq_complete_move(irq);
+#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
+       /* If we are moving the irq we need to mask it */
+       if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+               do_unmask_irq = 1;
+               mask_IO_APIC_irq(irq);
+       }
+#endif
+
+       /*
+        * We must acknowledge the irq before we move it or the acknowledge will
+        * not propagate properly.
+        */
+       ack_APIC_irq();
+
+       /* Now we can move and re-enable the irq */
+       if (unlikely(do_unmask_irq)) {
+               /* Only migrate the irq if the ack has been received.
+                *
+                * On rare occasions the broadcast level triggered ack gets
+                * delayed going to ioapics, and if we reprogram the
+                * vector while Remote IRR is still set the irq will never
+                * fire again.
+                *
+                * To prevent this scenario we read the Remote IRR bit
+                * of the ioapic.  This has two effects.
+                * - On any sane system the read of the ioapic will
+                *   flush writes (and acks) going to the ioapic from
+                *   this cpu.
+                * - We get to see if the ACK has actually been delivered.
+                *
+                * Based on failed experiments of reprogramming the
+                * ioapic entry from outside of irq context starting
+                * with masking the ioapic entry and then polling until
+                * Remote IRR was clear before reprogramming the
+                * ioapic I don't trust the Remote IRR bit to be
+                * completely accurate.
+                *
+                * However there appears to be no other way to plug
+                * this race, so if the Remote IRR bit is not
+                * accurate and is causing problems then it is a hardware bug
+                * and you can go talk to the chipset vendor about it.
+                */
+               if (!io_apic_level_ack_pending(irq))
+                       move_masked_irq(irq);
+               unmask_IO_APIC_irq(irq);
+       }
+}
+
+static struct irq_chip ioapic_chip __read_mostly = {
+       .name           = "IO-APIC",
+       .startup        = startup_ioapic_irq,
+       .mask           = mask_IO_APIC_irq,
+       .unmask         = unmask_IO_APIC_irq,
+       .ack            = ack_apic_edge,
+       .eoi            = ack_apic_level,
+#ifdef CONFIG_SMP
+       .set_affinity   = set_ioapic_affinity_irq,
+#endif
+       .retrigger      = ioapic_retrigger_irq,
+};
+
+static inline void init_IO_APIC_traps(void)
+{
+       int irq;
+
+       /*
+        * NOTE! The local APIC isn't very good at handling
+        * multiple interrupts at the same interrupt level.
+        * As the interrupt level is determined by taking the
+        * vector number and shifting that right by 4, we
+        * want to spread these out a bit so that they don't
+        * all fall in the same interrupt level.
+        *
+        * Also, we've got to be careful not to trash gate
+        * 0x80, because int 0x80 is hm, kind of importantish. ;)
+        */
+       for (irq = 0; irq < NR_IRQS ; irq++) {
+               int tmp = irq;
+               if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
+                       /*
+                        * Hmm.. We don't have an entry for this,
+                        * so default to an old-fashioned 8259
+                        * interrupt if we can..
+                        */
+                       if (irq < 16)
+                               make_8259A_irq(irq);
+                       else
+                               /* Strange. Oh, well.. */
+                               irq_desc[irq].chip = &no_irq_chip;
+               }
+       }
+}
+
+static void enable_lapic_irq (unsigned int irq)
+{
+       unsigned long v;
+
+       v = apic_read(APIC_LVT0);
+       apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static void disable_lapic_irq (unsigned int irq)
+{
+       unsigned long v;
+
+       v = apic_read(APIC_LVT0);
+       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void ack_lapic_irq (unsigned int irq)
+{
+       ack_APIC_irq();
+}
+
+static void end_lapic_irq (unsigned int i) { /* nothing */ }
+
+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
+       .name = "local-APIC",
+       .typename = "local-APIC-edge",
+       .startup = NULL, /* startup_irq() not used for IRQ0 */
+       .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
+       .enable = enable_lapic_irq,
+       .disable = disable_lapic_irq,
+       .ack = ack_lapic_irq,
+       .end = end_lapic_irq,
+};
+
+static void setup_nmi (void)
+{
+       /*
+        * Dirty trick to enable the NMI watchdog ...
+        * We put the 8259A master into AEOI mode and
+        * unmask on all local APICs LVT0 as NMI.
+        *
+        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+        * is from Maciej W. Rozycki - so we do not have to EOI from
+        * the NMI handler or the timer interrupt.
+        */ 
+       printk(KERN_INFO "activating NMI Watchdog ...");
+
+       enable_NMI_through_LVT0(NULL);
+
+       printk(" done.\n");
+}
+
+/*
+ * This looks a bit hackish but it's about the only way of sending
+ * a few INTA cycles to 8259As and any associated glue logic.  ICR does
+ * not support the ExtINT mode, unfortunately.  We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA.  --macro
+ */
+static inline void unlock_ExtINT_logic(void)
+{
+       int apic, pin, i;
+       struct IO_APIC_route_entry entry0, entry1;
+       unsigned char save_control, save_freq_select;
+       unsigned long flags;
+
+       pin  = find_isa_irq_pin(8, mp_INT);
+       apic = find_isa_irq_apic(8, mp_INT);
+       if (pin == -1)
+               return;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
+       *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+       clear_IO_APIC_pin(apic, pin);
+
+       memset(&entry1, 0, sizeof(entry1));
+
+       entry1.dest_mode = 0;                   /* physical delivery */
+       entry1.mask = 0;                        /* unmask IRQ now */
+       entry1.dest = hard_smp_processor_id();
+       entry1.delivery_mode = dest_ExtINT;
+       entry1.polarity = entry0.polarity;
+       entry1.trigger = 0;
+       entry1.vector = 0;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
+       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       save_control = CMOS_READ(RTC_CONTROL);
+       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+                  RTC_FREQ_SELECT);
+       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+       i = 100;
+       while (i-- > 0) {
+               mdelay(10);
+               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+                       i -= 10;
+       }
+
+       CMOS_WRITE(save_control, RTC_CONTROL);
+       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+       clear_IO_APIC_pin(apic, pin);
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
+       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
+ * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ *
+ * FIXME: really need to revamp this for modern platforms only.
+ */
+static inline void check_timer(void)
+{
+       struct irq_cfg *cfg = irq_cfg + 0;
+       int apic1, pin1, apic2, pin2;
+
+       /*
+        * get/set the timer IRQ vector:
+        */
+       disable_8259A_irq(0);
+       assign_irq_vector(0, TARGET_CPUS);
+
+       /*
+        * Subtle, code in do_timer_interrupt() expects an AEOI
+        * mode for the 8259A whenever interrupts are routed
+        * through I/O APICs.  Also IRQ0 has to be enabled in
+        * the 8259A which implies the virtual wire has to be
+        * disabled in the local APIC.
+        */
+       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+       init_8259A(1);
+       if (timer_over_8254 > 0)
+               enable_8259A_irq(0);
+
+       pin1  = find_isa_irq_pin(0, mp_INT);
+       apic1 = find_isa_irq_apic(0, mp_INT);
+       pin2  = ioapic_i8259.pin;
+       apic2 = ioapic_i8259.apic;
+
+       apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+               cfg->vector, apic1, pin1, apic2, pin2);
+
+       if (pin1 != -1) {
+               /*
+                * Ok, does IRQ0 through the IOAPIC work?
+                */
+               unmask_IO_APIC_irq(0);
+               if (!no_timer_check && timer_irq_works()) {
+                       nmi_watchdog_default();
+                       if (nmi_watchdog == NMI_IO_APIC) {
+                               disable_8259A_irq(0);
+                               setup_nmi();
+                               enable_8259A_irq(0);
+                       }
+                       if (disable_timer_pin_1 > 0)
+                               clear_IO_APIC_pin(0, pin1);
+                       return;
+               }
+               clear_IO_APIC_pin(apic1, pin1);
+               apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
+                               "connected to IO-APIC\n");
+       }
+
+       apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
+                               "through the 8259A ... ");
+       if (pin2 != -1) {
+               apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
+                       apic2, pin2);
+               /*
+                * legacy devices should be connected to IO APIC #0
+                */
+               setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+               if (timer_irq_works()) {
+                       apic_printk(APIC_VERBOSE," works.\n");
+                       nmi_watchdog_default();
+                       if (nmi_watchdog == NMI_IO_APIC) {
+                               setup_nmi();
+                       }
+                       return;
+               }
+               /*
+                * Cleanup, just in case ...
+                */
+               clear_IO_APIC_pin(apic2, pin2);
+       }
+       apic_printk(APIC_VERBOSE," failed.\n");
+
+       if (nmi_watchdog == NMI_IO_APIC) {
+               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+               nmi_watchdog = 0;
+       }
+
+       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+
+       disable_8259A_irq(0);
+       irq_desc[0].chip = &lapic_irq_type;
+       apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
+       enable_8259A_irq(0);
+
+       if (timer_irq_works()) {
+               apic_printk(APIC_VERBOSE," works.\n");
+               return;
+       }
+       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
+       apic_printk(APIC_VERBOSE," failed.\n");
+
+       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+
+       init_8259A(0);
+       make_8259A_irq(0);
+       apic_write(APIC_LVT0, APIC_DM_EXTINT);
+
+       unlock_ExtINT_logic();
+
+       if (timer_irq_works()) {
+               apic_printk(APIC_VERBOSE," works.\n");
+               return;
+       }
+       apic_printk(APIC_VERBOSE," failed :(.\n");
+       panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+}
+
+static int __init notimercheck(char *s)
+{
+       no_timer_check = 1;
+       return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+/*
+ *
+ * IRQs that are handled by the PIC in the MPS IOAPIC case.
+ * - IRQ2 is the cascade IRQ, and cannot be an io-apic IRQ.
+ *   Linux doesn't really care, as it's not actually used
+ *   for any interrupt handling anyway.
+ */
+#define PIC_IRQS       (1<<2)
+
+void __init setup_IO_APIC(void)
+{
+       enable_IO_APIC();
+
+       if (acpi_ioapic)
+               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
+       else
+               io_apic_irqs = ~PIC_IRQS;
+
+       apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+
+       sync_Arb_IDs();
+       setup_IO_APIC_irqs();
+       init_IO_APIC_traps();
+       check_timer();
+       if (!acpi_ioapic)
+               print_IO_APIC();
+}
+
+struct sysfs_ioapic_data {
+       struct sys_device dev;
+       struct IO_APIC_route_entry entry[0];
+};
+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+
+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+       struct IO_APIC_route_entry *entry;
+       struct sysfs_ioapic_data *data;
+       int i;
+
+       data = container_of(dev, struct sysfs_ioapic_data, dev);
+       entry = data->entry;
+       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
+               *entry = ioapic_read_entry(dev->id, i);
+
+       return 0;
+}
+
+static int ioapic_resume(struct sys_device *dev)
+{
+       struct IO_APIC_route_entry *entry;
+       struct sysfs_ioapic_data *data;
+       unsigned long flags;
+       union IO_APIC_reg_00 reg_00;
+       int i;
+
+       data = container_of(dev, struct sysfs_ioapic_data, dev);
+       entry = data->entry;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(dev->id, 0);
+       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
+               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+               io_apic_write(dev->id, 0, reg_00.raw);
+       }
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+       for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
+               ioapic_write_entry(dev->id, i, entry[i]);
+
+       return 0;
+}
+
+static struct sysdev_class ioapic_sysdev_class = {
+       set_kset_name("ioapic"),
+       .suspend = ioapic_suspend,
+       .resume = ioapic_resume,
+};
+
+static int __init ioapic_init_sysfs(void)
+{
+       struct sys_device * dev;
+       int i, size, error = 0;
+
+       error = sysdev_class_register(&ioapic_sysdev_class);
+       if (error)
+               return error;
+
+       for (i = 0; i < nr_ioapics; i++ ) {
+               size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+                       * sizeof(struct IO_APIC_route_entry);
+               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+               if (!mp_ioapic_data[i]) {
+                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+                       continue;
+               }
+               memset(mp_ioapic_data[i], 0, size);
+               dev = &mp_ioapic_data[i]->dev;
+               dev->id = i;
+               dev->cls = &ioapic_sysdev_class;
+               error = sysdev_register(dev);
+               if (error) {
+                       kfree(mp_ioapic_data[i]);
+                       mp_ioapic_data[i] = NULL;
+                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+                       continue;
+               }
+       }
+
+       return 0;
+}
+
+device_initcall(ioapic_init_sysfs);
+
+/*
+ * Dynamic irq allocation and deallocation
+ */
+int create_irq(void)
+{
+       /* Allocate an unused irq */
+       int irq;
+       int new;
+       unsigned long flags;
+
+       irq = -ENOSPC;
+       spin_lock_irqsave(&vector_lock, flags);
+       for (new = (NR_IRQS - 1); new >= 0; new--) {
+               if (platform_legacy_irq(new))
+                       continue;
+               if (irq_cfg[new].vector != 0)
+                       continue;
+               if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+                       irq = new;
+               break;
+       }
+       spin_unlock_irqrestore(&vector_lock, flags);
+
+       if (irq >= 0) {
+               dynamic_irq_init(irq);
+       }
+       return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+       unsigned long flags;
+
+       dynamic_irq_cleanup(irq);
+
+       spin_lock_irqsave(&vector_lock, flags);
+       __clear_irq_vector(irq);
+       spin_unlock_irqrestore(&vector_lock, flags);
+}
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       int err;
+       unsigned dest;
+       cpumask_t tmp;
+
+       tmp = TARGET_CPUS;
+       err = assign_irq_vector(irq, tmp);
+       if (!err) {
+               cpus_and(tmp, cfg->domain, tmp);
+               dest = cpu_mask_to_apicid(tmp);
+
+               msg->address_hi = MSI_ADDR_BASE_HI;
+               msg->address_lo =
+                       MSI_ADDR_BASE_LO |
+                       ((INT_DEST_MODE == 0) ?
+                               MSI_ADDR_DEST_MODE_PHYSICAL:
+                               MSI_ADDR_DEST_MODE_LOGICAL) |
+                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+                               MSI_ADDR_REDIRECTION_CPU:
+                               MSI_ADDR_REDIRECTION_LOWPRI) |
+                       MSI_ADDR_DEST_ID(dest);
+
+               msg->data =
+                       MSI_DATA_TRIGGER_EDGE |
+                       MSI_DATA_LEVEL_ASSERT |
+                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+                               MSI_DATA_DELIVERY_FIXED:
+                               MSI_DATA_DELIVERY_LOWPRI) |
+                       MSI_DATA_VECTOR(cfg->vector);
+       }
+       return err;
+}
+
+#ifdef CONFIG_SMP
+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       struct msi_msg msg;
+       unsigned int dest;
+       cpumask_t tmp;
+
+       cpus_and(tmp, mask, cpu_online_map);
+       if (cpus_empty(tmp))
+               return;
+
+       if (assign_irq_vector(irq, mask))
+               return;
+
+       cpus_and(tmp, cfg->domain, mask);
+       dest = cpu_mask_to_apicid(tmp);
+
+       read_msi_msg(irq, &msg);
+
+       msg.data &= ~MSI_DATA_VECTOR_MASK;
+       msg.data |= MSI_DATA_VECTOR(cfg->vector);
+       msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+       msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+       write_msi_msg(irq, &msg);
+       irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip msi_chip = {
+       .name           = "PCI-MSI",
+       .unmask         = unmask_msi_irq,
+       .mask           = mask_msi_irq,
+       .ack            = ack_apic_edge,
+#ifdef CONFIG_SMP
+       .set_affinity   = set_msi_irq_affinity,
+#endif
+       .retrigger      = ioapic_retrigger_irq,
+};
+
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+       struct msi_msg msg;
+       int irq, ret;
+       irq = create_irq();
+       if (irq < 0)
+               return irq;
+
+       ret = msi_compose_msg(dev, irq, &msg);
+       if (ret < 0) {
+               destroy_irq(irq);
+               return ret;
+       }
+
+       set_irq_msi(irq, desc);
+       write_msi_msg(irq, &msg);
+
+       set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+
+       return 0;
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+       destroy_irq(irq);
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * Hypertransport interrupt support
+ */
+#ifdef CONFIG_HT_IRQ
+
+#ifdef CONFIG_SMP
+
+static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+       struct ht_irq_msg msg;
+       fetch_ht_irq_msg(irq, &msg);
+
+       msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
+       msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+
+       msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
+       msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+       write_ht_irq_msg(irq, &msg);
+}
+
+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       unsigned int dest;
+       cpumask_t tmp;
+
+       cpus_and(tmp, mask, cpu_online_map);
+       if (cpus_empty(tmp))
+               return;
+
+       if (assign_irq_vector(irq, mask))
+               return;
+
+       cpus_and(tmp, cfg->domain, mask);
+       dest = cpu_mask_to_apicid(tmp);
+
+       target_ht_irq(irq, dest, cfg->vector);
+       irq_desc[irq].affinity = mask;
+}
+#endif
+
+static struct irq_chip ht_irq_chip = {
+       .name           = "PCI-HT",
+       .mask           = mask_ht_irq,
+       .unmask         = unmask_ht_irq,
+       .ack            = ack_apic_edge,
+#ifdef CONFIG_SMP
+       .set_affinity   = set_ht_irq_affinity,
+#endif
+       .retrigger      = ioapic_retrigger_irq,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       int err;
+       cpumask_t tmp;
+
+       tmp = TARGET_CPUS;
+       err = assign_irq_vector(irq, tmp);
+       if (!err) {
+               struct ht_irq_msg msg;
+               unsigned dest;
+
+               cpus_and(tmp, cfg->domain, tmp);
+               dest = cpu_mask_to_apicid(tmp);
+
+               msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+               msg.address_lo =
+                       HT_IRQ_LOW_BASE |
+                       HT_IRQ_LOW_DEST_ID(dest) |
+                       HT_IRQ_LOW_VECTOR(cfg->vector) |
+                       ((INT_DEST_MODE == 0) ?
+                               HT_IRQ_LOW_DM_PHYSICAL :
+                               HT_IRQ_LOW_DM_LOGICAL) |
+                       HT_IRQ_LOW_RQEOI_EDGE |
+                       ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+                               HT_IRQ_LOW_MT_FIXED :
+                               HT_IRQ_LOW_MT_ARBITRATED) |
+                       HT_IRQ_LOW_IRQ_MASKED;
+
+               write_ht_irq_msg(irq, &msg);
+
+               set_irq_chip_and_handler_name(irq, &ht_irq_chip,
+                                             handle_edge_irq, "edge");
+       }
+       return err;
+}
+#endif /* CONFIG_HT_IRQ */
+
+/* --------------------------------------------------------------------------
+                          ACPI-based IOAPIC Configuration
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+#define IO_APIC_MAX_ID         0xFE
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+       union IO_APIC_reg_01    reg_01;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_01.raw = io_apic_read(ioapic, 1);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return reg_01.bits.entries;
+}
+
+
+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
+{
+       if (!IO_APIC_IRQ(irq)) {
+               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+                       ioapic);
+               return -EINVAL;
+       }
+
+       /*
+        * IRQs < 16 are already in the irq_2_pin[] map
+        */
+       if (irq >= 16)
+               add_pin_to_irq(irq, ioapic, pin);
+
+       setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+
+       return 0;
+}
+
+#endif /* CONFIG_ACPI */
+
+
+/*
+ * This function currently is only a helper for the i386 smp boot process where
+ * we need to reprogram the ioredtbls to cater for the cpus which have come online
+ * so mask in all cases should simply be TARGET_CPUS
+ */
+#ifdef CONFIG_SMP
+void __init setup_ioapic_dest(void)
+{
+       int pin, ioapic, irq, irq_entry;
+
+       if (skip_ioapic_setup == 1)
+               return;
+
+       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
+               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+                       if (irq_entry == -1)
+                               continue;
+                       irq = pin_2_irq(irq_entry, ioapic, pin);
+
+                       /* setup_IO_APIC_irqs could fail to get vector for some device
+                        * when you have too many devices, because at that time only boot
+                        * cpu is online.
+                        */
+                       if (!irq_cfg[irq].vector)
+                               setup_IO_APIC_irq(ioapic, pin, irq,
+                                                 irq_trigger(irq_entry),
+                                                 irq_polarity(irq_entry));
+                       else
+                               set_ioapic_affinity_irq(irq, TARGET_CPUS);
+               }
+
+       }
+}
+#endif
diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c
new file mode 100644 (file)
index 0000000..653efa3
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ *     linux/arch/x86_64/kernel/ioport.c
+ *
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/syscalls.h>
+
+/* Set EXTENT bits starting at BASE in BITMAP to NEW_VALUE.  Both directions use
+ * the non-atomic bitops, so BITMAP must not be modified concurrently. */
+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+{
+       unsigned int i;
+       for (i = base; i < base + extent; i++)
+               if (new_value)
+                       __set_bit(i, bitmap);
+               else
+                       __clear_bit(i, bitmap);
+}
+
+/*
+ * this changes the io permissions bitmap in the current task.
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+       unsigned int i, max_long, bytes, bytes_updated;
+       struct thread_struct * t = &current->thread;
+       struct tss_struct * tss;
+       unsigned long *bitmap;
+
+       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+               return -EINVAL;
+       if (turn_on && !capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       /*
+        * If it's the first ioperm() call in this thread's lifetime, set the
+        * IO bitmap up. ioperm() is much less timing critical than clone(),
+        * this is why we delay this operation until now:
+        */
+       if (!t->io_bitmap_ptr) {
+               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+               if (!bitmap)
+                       return -ENOMEM;
+
+               memset(bitmap, 0xff, IO_BITMAP_BYTES);
+               t->io_bitmap_ptr = bitmap;
+               set_thread_flag(TIF_IO_BITMAP);
+       }
+
+       /*
+        * do it in the per-thread copy and in the TSS ...
+        *
+        * Disable preemption via get_cpu() - we must not switch away
+        * because the ->io_bitmap_max value must match the bitmap
+        * contents:
+        */
+       tss = &per_cpu(init_tss, get_cpu());
+
+       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+       /*
+        * Search for a (possibly new) maximum. This is simple and stupid,
+        * to keep it obviously correct:
+        */
+       max_long = 0;
+       for (i = 0; i < IO_BITMAP_LONGS; i++)
+               if (t->io_bitmap_ptr[i] != ~0UL)
+                       max_long = i;
+
+       bytes = (max_long + 1) * sizeof(long);
+       bytes_updated = max(bytes, t->io_bitmap_max);
+
+       t->io_bitmap_max = bytes;
+
+       /* Update the TSS: */
+       memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+
+       put_cpu();
+
+       return 0;
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ *
+ * Here we just change the eflags value on the stack: we allow
+ * only the super-user to do it. This depends on the stack-layout
+ * on system-call entry - see also fork() and the signal handling
+ * code.
+ */
+
+asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+{
+       unsigned int old = (regs->eflags >> 12) & 3;
+
+       if (level > 3)
+               return -EINVAL;
+       /* Trying to gain more privileges? */
+       if (level > old) {
+               if (!capable(CAP_SYS_RAWIO))
+                       return -EPERM;
+       }
+       regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
+       return 0;
+}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
new file mode 100644 (file)
index 0000000..39cb3fa
--- /dev/null
@@ -0,0 +1,213 @@
+/*
+ *     linux/arch/x86_64/kernel/irq.c
+ *
+ *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86_64-specific interrupt
+ * entry and irq statistics code. All the remaining irq logic is
+ * done by the generic kernel/irq/ code and in the
+ * x86_64-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <asm/uaccess.h>
+#include <asm/io_apic.h>
+#include <asm/idle.h>
+#include <asm/smp.h>
+
+atomic_t irq_err_count;        /* printed as the "ERR:" line by show_interrupts() */
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+/*
+ * Probabilistic stack overflow check:
+ *
+ * Only check the stack in process context, because everything else
+ * runs on the big interrupt stacks. Checking reliably is too expensive,
+ * so we just check from interrupts.
+ */
+static inline void stack_overflow_check(struct pt_regs *regs)
+{
+       static unsigned long warned = -60*HZ;
+       u64 curbase = (u64)task_stack_page(current);
+
+       /*
+        * Only an rsp that lies on the current task's stack and has come
+        * within sizeof(struct thread_info) + 128 bytes of its base counts.
+        */
+       if (regs->rsp < curbase || regs->rsp > curbase + THREAD_SIZE ||
+           regs->rsp >= curbase + sizeof(struct thread_info) + 128)
+               return;
+
+       /* Stay quiet until more than 60*HZ jiffies after the last report. */
+       if (!time_after(jiffies, warned + 60*HZ))
+               return;
+
+       printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
+              current->comm, curbase, regs->rsp);
+       show_stack(NULL,NULL);
+       warned = jiffies;
+}
+#endif
+
+/*
+ * Generic, controller-independent functions:
+ */
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+       int irq = *(loff_t *) v, cpu;
+       struct irqaction *act;
+       unsigned long flags;
+
+       /* Record 0 is preceded by the header row, one column per online CPU. */
+       if (irq == 0) {
+               seq_printf(p, "           ");
+               for_each_online_cpu(cpu)
+                       seq_printf(p, "CPU%-8d",cpu);
+               seq_putc(p, '\n');
+       }
+
+       if (irq < NR_IRQS) {
+               /* The descriptor lock is held while its action chain is walked. */
+               spin_lock_irqsave(&irq_desc[irq].lock, flags);
+               act = irq_desc[irq].action;
+               if (act) {
+                       seq_printf(p, "%3d: ",irq);
+#ifndef CONFIG_SMP
+                       seq_printf(p, "%10u ", kstat_irqs(irq));
+#else
+                       for_each_online_cpu(cpu)
+                               seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[irq]);
+#endif
+                       seq_printf(p, " %8s", irq_desc[irq].chip->name);
+                       seq_printf(p, "-%-8s", irq_desc[irq].name);
+
+                       /* Every action registered on the line, comma separated. */
+                       seq_printf(p, "  %s", act->name);
+                       for (act = act->next; act; act = act->next)
+                               seq_printf(p, ", %s", act->name);
+                       seq_putc(p, '\n');
+               }
+               spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
+       } else if (irq == NR_IRQS) {
+               /* Trailer record: the NMI, LOC and ERR counter lines. */
+               seq_printf(p, "NMI: ");
+               for_each_online_cpu(cpu)
+                       seq_printf(p, "%10u ", cpu_pda(cpu)->__nmi_count);
+               seq_putc(p, '\n');
+               seq_printf(p, "LOC: ");
+               for_each_online_cpu(cpu)
+                       seq_printf(p, "%10u ", cpu_pda(cpu)->apic_timer_irqs);
+               seq_putc(p, '\n');
+               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+       }
+       return 0;
+}
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ *
+ * The vector is recovered by complementing regs->orig_rax and translated
+ * to an irq number through this CPU's vector_irq table.  An irq below
+ * NR_IRQS is handed to generic_handle_irq(); otherwise the APIC is acked
+ * (unless disable_apic is set) and a rate-limited message is logged.
+ * Always returns 1.
+ */
+asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+{
+       struct pt_regs *old_regs = set_irq_regs(regs);  /* restored before returning */
+
+       /* high bit used in ret_from_ code  */
+       unsigned vector = ~regs->orig_rax;
+       unsigned irq;
+
+       exit_idle();
+       irq_enter();
+       irq = __get_cpu_var(vector_irq)[vector];        /* per-CPU vector -> irq lookup */
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+       stack_overflow_check(regs);
+#endif
+
+       if (likely(irq < NR_IRQS))
+               generic_handle_irq(irq);
+       else {
+               /* No irq is mapped to this vector: ack it and complain. */
+               if (!disable_apic)
+                       ack_APIC_irq();
+
+               if (printk_ratelimit())
+                       printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
+                               __func__, smp_processor_id(), vector);
+       }
+
+       irq_exit();
+
+       set_irq_regs(old_regs);
+       return 1;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void fixup_irqs(cpumask_t map)  /* re-targets each irq that has an action onto the CPUs in @map */
+{
+       unsigned int irq;
+       static int warned;      /* bumped each time a chip without set_affinity is met */
+
+       for (irq = 0; irq < NR_IRQS; irq++) {
+               cpumask_t mask;
+               int break_affinity = 0;
+               int set_affinity = 1;
+
+               if (irq == 2)   /* NOTE(review): presumably the 8259 cascade line - confirm */
+                       continue;
+
+               /* interrupts are disabled at this point, so a plain
+                * spin_lock() is used on the descriptor */
+               spin_lock(&irq_desc[irq].lock);
+
+               if (!irq_has_action(irq) ||
+                   cpus_equal(irq_desc[irq].affinity, map)) {
+                       spin_unlock(&irq_desc[irq].lock);       /* unused, or already on @map */
+                       continue;
+               }
+
+               cpus_and(mask, irq_desc[irq].affinity, map);
+               if (cpus_empty(mask)) {         /* old affinity and @map share no CPU */
+                       break_affinity = 1;
+                       mask = map;
+               }
+
+               if (irq_desc[irq].chip->mask)   /* mask around the affinity change */
+                       irq_desc[irq].chip->mask(irq);
+
+               if (irq_desc[irq].chip->set_affinity)
+                       irq_desc[irq].chip->set_affinity(irq, mask);
+               else if (!(warned++))           /* only the first such irq gets reported below */
+                       set_affinity = 0;
+
+               if (irq_desc[irq].chip->unmask)
+                       irq_desc[irq].chip->unmask(irq);
+
+               spin_unlock(&irq_desc[irq].lock);
+
+               /* Messages are printed only after the descriptor lock is dropped. */
+               if (break_affinity && set_affinity)
+                       printk("Broke affinity for irq %i\n", irq);
+               else if (!set_affinity)
+                       printk("Cannot set affinity for irq %i\n", irq);
+       }
+
+       /* That doesn't seem sufficient.  Give it 1ms.
+        * Interrupts are enabled for that millisecond and disabled again. */
+       local_irq_enable();
+       mdelay(1);
+       local_irq_disable();
+}
+#endif
+
+extern void call_softirq(void);
+
+/*
+ * Run pending softirqs, if any, unless we are already in interrupt
+ * context.  Local interrupts are off while the pending mask is sampled
+ * and call_softirq() runs; the previous interrupt state is restored
+ * afterwards.
+ */
+asmlinkage void do_softirq(void)
+{
+       unsigned long irqflags;
+       __u32 todo;
+
+       if (in_interrupt())
+               return;
+
+       local_irq_save(irqflags);
+       todo = local_softirq_pending();
+       if (todo) {
+               /* Switch to interrupt stack */
+               call_softirq();
+               WARN_ON_ONCE(softirq_count());
+       }
+       local_irq_restore(irqflags);
+}
+EXPORT_SYMBOL(do_softirq);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
new file mode 100644 (file)
index 0000000..7377ccb
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Shared support code for AMD K8 northbridges and derivatives.
+ * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
+ */
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/k8.h>
+
+int num_k8_northbridges;        /* northbridge count found by cache_k8_northbridges() */
+EXPORT_SYMBOL(num_k8_northbridges);
+
+static u32 *flush_words;        /* per-northbridge config dword 0x9c, read at cache time */
+
+struct pci_device_id k8_nb_ids[] = {    /* PCI ids accepted as a K8 northbridge */
+       { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
+       { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+       {}      /* zero entry: early_is_k8_nb() stops at vendor == 0 */
+};
+EXPORT_SYMBOL(k8_nb_ids);
+
+struct pci_dev **k8_northbridges;       /* NULL-terminated array built by cache_k8_northbridges() */
+EXPORT_SYMBOL(k8_northbridges);
+
+/*
+ * Return the first PCI device after @dev (NULL starts from the beginning)
+ * that matches k8_nb_ids, or NULL once the device list is exhausted.
+ */
+static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+{
+       for (;;) {
+               dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+               if (!dev || pci_match_id(&k8_nb_ids[0], dev))
+                       return dev;
+       }
+}
+
+/*
+ * Build the cached list of K8 northbridges.
+ *
+ * Fills k8_northbridges (a NULL-terminated array of pci_dev pointers),
+ * flush_words (config dword 0x9c of each northbridge) and
+ * num_k8_northbridges.  A call made after northbridges have been cached
+ * returns immediately.
+ *
+ * Returns 0 on success, including when no northbridge exists, or -ENOMEM
+ * when an allocation fails.  On failure the globals are left in the
+ * "nothing cached" state (num_k8_northbridges == 0, k8_northbridges ==
+ * NULL), so a later call retries and k8_flush_garts() cannot walk a
+ * missing or freed table.
+ */
+int cache_k8_northbridges(void)
+{
+       int i, count = 0;
+       struct pci_dev *dev = NULL;
+
+       if (num_k8_northbridges)
+               return 0;
+
+       /*
+        * Count into a local: the global is only published once both tables
+        * exist, so a failed attempt never looks like a completed one.
+        */
+       while ((dev = next_k8_northbridge(dev)) != NULL)
+               count++;
+
+       /* One extra slot for the NULL terminator. */
+       k8_northbridges = kmalloc((count + 1) * sizeof(void *), GFP_KERNEL);
+       if (!k8_northbridges)
+               return -ENOMEM;
+
+       if (!count) {
+               k8_northbridges[0] = NULL;
+               return 0;
+       }
+
+       flush_words = kmalloc(count * sizeof(u32), GFP_KERNEL);
+       if (!flush_words) {
+               /* Do not leave the exported pointer dangling. */
+               kfree(k8_northbridges);
+               k8_northbridges = NULL;
+               return -ENOMEM;
+       }
+
+       /* dev is NULL again after the counting walk, so this restarts it. */
+       i = 0;
+       while ((dev = next_k8_northbridge(dev)) != NULL) {
+               k8_northbridges[i] = dev;
+               pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
+       }
+       k8_northbridges[i] = NULL;
+
+       num_k8_northbridges = count;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+
+/* Ignores subdevice/subvendor but as far as I can figure out
+   they're useless anyways */
+int __init early_is_k8_nb(u32 device)
+{
+       const u32 vendor_id = device & 0xffff;  /* low 16 bits of the argument */
+       const u32 device_id = device >> 16;     /* high 16 bits of the argument */
+       struct pci_device_id *entry = k8_nb_ids;
+
+       /* The table ends at the first entry whose vendor is zero. */
+       while (entry->vendor) {
+               if (vendor_id == entry->vendor && device_id == entry->device)
+                       return 1;
+               entry++;
+       }
+       return 0;
+}
+
+/*
+ * Request a flush on every cached northbridge by writing its saved
+ * flush word with bit 0 set to config offset 0x9c, then wait until each
+ * one reads back with bit 0 clear.
+ */
+void k8_flush_garts(void)
+{
+       static DEFINE_SPINLOCK(gart_lock);
+       unsigned long flags;
+       int nb, kicked = 0;
+
+       /* Avoid races between AGP and IOMMU. In theory it's not needed
+          but I'm not sure if the hardware won't lose flush requests
+          when another is pending. This whole thing is so expensive anyways
+          that it doesn't matter to serialize more. -AK */
+       spin_lock_irqsave(&gart_lock, flags);
+
+       /* First pass: start the flush everywhere. */
+       for (nb = 0; nb < num_k8_northbridges; nb++, kicked++)
+               pci_write_config_dword(k8_northbridges[nb], 0x9c,
+                                      flush_words[nb]|1);
+
+       /* Second pass: make sure the hardware actually executed the flush. */
+       for (nb = 0; nb < num_k8_northbridges; nb++) {
+               u32 w;
+
+               pci_read_config_dword(k8_northbridges[nb], 0x9c, &w);
+               while (w & 1) {
+                       cpu_relax();
+                       pci_read_config_dword(k8_northbridges[nb], 0x9c, &w);
+               }
+       }
+       spin_unlock_irqrestore(&gart_lock, flags);
+
+       if (!kicked)
+               printk("nothing to flush?\n");
+}
+EXPORT_SYMBOL_GPL(k8_flush_garts);
+
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c
new file mode 100644 (file)
index 0000000..a30e004
--- /dev/null
@@ -0,0 +1,749 @@
+/*
+ *  Kernel Probes (KProbes)
+ *  arch/x86_64/kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct    Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ *             Probes initial implementation ( includes contributions from
+ *             Rusty Russell).
+ * 2004-July   Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ *             interface to access function arguments.
+ * 2004-Oct    Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
+ *             <prasanna@in.ibm.com> adapted for x86_64
+ * 2005-Mar    Roland McGrath <roland@redhat.com>
+ *             Fixed to handle %rip-relative addressing mode correctly.
+ * 2005-May     Rusty Lynch <rusty.lynch@intel.com>
+ *              Added function return probes functionality
+ */
+
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/alternative.h>
+
+void jprobe_return_end(void);
+static void __kprobes arch_copy_kprobe(struct kprobe *p);       /* defined below, used by arch_prepare_kprobe() */
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; /* written by set_current_kprobe() */
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);    /* per-CPU status and saved rflags */
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static __always_inline int is_IF_modifier(kprobe_opcode_t *insn)
+{
+       const kprobe_opcode_t first = insn[0];
+
+       if (first == 0xfa ||            /* cli */
+           first == 0xfb ||            /* sti */
+           first == 0xcf ||            /* iret/iretd */
+           first == 0x9d)              /* popf/popfd */
+               return 1;
+
+       /* An iret that follows a 0x40-0x4f prefix byte counts as well. */
+       if (first >= 0x40 && first <= 0x4f && insn[1] == 0xcf)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Obtain an instruction slot for @p and copy the probed instruction into
+ * it.  Returns 0 on success or -ENOMEM when no slot is available.
+ */
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
+{
+       /* insn: must be on special executable page on x86_64. */
+       p->ainsn.insn = get_insn_slot();
+       if (p->ainsn.insn) {
+               arch_copy_kprobe(p);
+               return 0;
+       }
+       return -ENOMEM;
+}
+
+/*
+ * Determine if the instruction uses the %rip-relative addressing mode.
+ * If it does, return the address of the 32-bit displacement word.
+ * If not, return null.
+ *
+ * @insn: first byte of the instruction to decode.
+ *
+ * Legacy prefixes and then one REX prefix are skipped.  The opcode is
+ * looked up in the one-byte table or, after a 0x0f escape byte, in the
+ * two-byte table to learn whether a ModRM byte follows, and that byte is
+ * tested for mod == 00, r/m == 101.  In both tables bit N is set when
+ * opcode N takes a ModRM byte; each W() row packs the flags of sixteen
+ * consecutive opcodes and shifts them to bit (row % 64) of a u64, so four
+ * rows are OR-ed together per array element.
+ */
+static s32 __kprobes *is_riprel(u8 *insn)
+{
+#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf)               \
+       (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
+         (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
+         (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
+         (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
+        << (row % 64))
+       static const u64 onebyte_has_modrm[256 / 64] = {
+               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+               /*      -------------------------------         */
+               W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
+               W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
+               W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
+               W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
+               W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
+               W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
+               W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
+               W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
+               W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
+               W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
+               W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
+               W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
+               W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
+               W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
+               W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
+               W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1)  /* f0 */
+               /*      -------------------------------         */
+               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+       };
+       static const u64 twobyte_has_modrm[256 / 64] = {
+               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+               /*      -------------------------------         */
+               W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
+               W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
+               W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
+               W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
+               W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
+               W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
+               W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
+               W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
+               W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
+               W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
+               W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
+               W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
+               W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
+               W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
+               W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
+               W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0)  /* ff */
+               /*      -------------------------------         */
+               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
+       };
+#undef W
+       int need_modrm;
+
+       /* Skip legacy instruction prefixes.  Any number of them, in any
+        * order, is consumed before the opcode is examined.  */
+       while (1) {
+               switch (*insn) {
+               case 0x66:
+               case 0x67:
+               case 0x2e:
+               case 0x3e:
+               case 0x26:
+               case 0x64:
+               case 0x65:
+               case 0x36:
+               case 0xf0:
+               case 0xf3:
+               case 0xf2:
+                       ++insn;
+                       continue;
+               }
+               break;
+       }
+
+       /* Skip REX instruction prefix.  At most one byte in the range
+        * 0x40-0x4f is consumed here.  */
+       if ((*insn & 0xf0) == 0x40)
+               ++insn;
+
+       if (*insn == 0x0f) {    /* Two-byte opcode.  */
+               ++insn;
+               need_modrm = test_bit(*insn, twobyte_has_modrm);
+       } else {                /* One-byte opcode.  */
+               need_modrm = test_bit(*insn, onebyte_has_modrm);
+       }
+
+       if (need_modrm) {
+               u8 modrm = *++insn;     /* the byte right after the opcode */
+               if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
+                       /* Displacement follows ModRM byte.  */
+                       return (s32 *) ++insn;
+               }
+       }
+
+       /* No %rip-relative addressing mode here.  */
+       return NULL;
+}
+
+static void __kprobes arch_copy_kprobe(struct kprobe *p)
+{
+       s32 *ripdisp;
+       memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);  /* copy MAX_INSN_SIZE bytes from the probe address */
+       ripdisp = is_riprel(p->ainsn.insn);             /* points into the copy, not the original */
+       if (ripdisp) {
+               /*
+                * The copied instruction uses the %rip-relative
+                * addressing mode.  Adjust the displacement for the
+                * difference between the original location of this
+                * instruction and the location of the copy that will
+                * actually be run.  The tricky bit here is making sure
+                * that the sign extension happens correctly in this
+                * calculation, since we need a signed 32-bit result to
+                * be sign-extended to 64 bits when it's added to the
+                * %rip value and yield the same 64-bit result that the
+                * sign-extension of the original signed 32-bit
+                * displacement would have given.
+                */
+               s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
+               BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
+               *ripdisp = disp;        /* store the rebased displacement in the copy */
+       }
+       p->opcode = *p->addr;   /* original first byte, written back by arch_disarm_kprobe() */
+}
+
+/* Arm @p: write the breakpoint instruction over the first byte at p->addr. */
+void __kprobes arch_arm_kprobe(struct kprobe *p)
+{
+       unsigned char bp_insn[] = { BREAKPOINT_INSTRUCTION };
+
+       text_poke(p->addr, bp_insn, 1);
+}
+
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
+{
+       text_poke(p->addr, &p->opcode, 1);      /* put back the byte saved in p->opcode */
+}
+
+void __kprobes arch_remove_kprobe(struct kprobe *p)
+{
+       mutex_lock(&kprobe_mutex);              /* the slot is released under kprobe_mutex */
+       free_insn_slot(p->ainsn.insn, 0);       /* return the slot that held the instruction copy */
+       mutex_unlock(&kprobe_mutex);
+}
+
+/*
+ * Stash the running kprobe together with its status and both rflags
+ * values in kcb->prev_kprobe.
+ */
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+       kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
+       kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
+       kcb->prev_kprobe.status = kcb->kprobe_status;
+       kcb->prev_kprobe.kp = kprobe_running();
+}
+
+/*
+ * Reinstate the probe, status and rflags values that were stashed in
+ * kcb->prev_kprobe.
+ */
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+       kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
+       kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
+       kcb->kprobe_status = kcb->prev_kprobe.status;
+       __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+}
+
+/*
+ * Make @p this CPU's current kprobe and record the TF/IF bits of the
+ * interrupted context in @kcb.
+ */
+static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+                               struct kprobe_ctlblk *kcb)
+{
+       __get_cpu_var(current_kprobe) = p;
+
+       kcb->kprobe_old_rflags = (regs->eflags & (TF_MASK | IF_MASK));
+       kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags;
+
+       /* IF is left out of the saved copy when the instruction changes it. */
+       if (is_IF_modifier(p->ainsn.insn))
+               kcb->kprobe_saved_rflags &= ~IF_MASK;
+}
+
+/*
+ * Set up @regs to single-step the probed instruction: TF set, IF clear,
+ * rip pointing at the instruction to execute.
+ */
+static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+       regs->eflags = (regs->eflags | TF_MASK) & ~IF_MASK;
+
+       /* single step inline if the instruction is an int3 */
+       regs->rip = (p->opcode == BREAKPOINT_INSTRUCTION)
+               ? (unsigned long)p->addr
+               : (unsigned long)p->ainsn.insn;
+}
+
+/*
+ * Called with kretprobe_lock held.
+ *
+ * Records the return address found at regs->rsp in @ri and overwrites
+ * that stack slot with the address of kretprobe_trampoline.
+ */
+void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
+                                     struct pt_regs *regs)
+{
+       unsigned long *ret_slot = (unsigned long *)regs->rsp;
+
+       ri->ret_addr = (kprobe_opcode_t *) ret_slot[0];
+       /* Replace the return addr with trampoline addr */
+       ret_slot[0] = (unsigned long) &kretprobe_trampoline;
+}
+
+int __kprobes kprobe_handler(struct pt_regs *regs)
+{
+       struct kprobe *p;
+       int ret = 0;    /* returned on the no_kprobe path; every other exit returns 1 */
+       kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
+       struct kprobe_ctlblk *kcb;
+
+       /*
+        * We don't want to be preempted for the entire
+        * duration of kprobe processing.  Only the no_kprobe exit
+        * re-enables preemption here; the "return 1" exits leave it
+        * disabled.
+        */
+       preempt_disable();
+       kcb = get_kprobe_ctlblk();
+
+       /* Check we're not actually recursing */
+       if (kprobe_running()) {
+               p = get_kprobe(addr);
+               if (p) {
+                       if (kcb->kprobe_status == KPROBE_HIT_SS &&
+                               *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
+                               regs->eflags &= ~TF_MASK;
+                               regs->eflags |= kcb->kprobe_saved_rflags;
+                               goto no_kprobe;
+                       } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
+                               /* TODO: Provide re-entrancy from
+                                * post_kprobes_handler() and avoid exception
+                                * stack corruption while single-stepping on
+                                * the instruction of the new probe.
+                                */
+                               arch_disarm_kprobe(p);
+                               regs->rip = (unsigned long)p->addr;
+                               reset_current_kprobe();
+                               ret = 1;        /* falls through to "goto no_kprobe" below */
+                       } else {
+                               /* We have reentered the kprobe_handler(), since
+                                * another probe was hit while within the
+                                * handler. We here save the original kprobe
+                                * variables and just single step on instruction
+                                * of the new probe without calling any user
+                                * handlers.
+                                */
+                               save_previous_kprobe(kcb);
+                               set_current_kprobe(p, regs, kcb);
+                               kprobes_inc_nmissed_count(p);
+                               prepare_singlestep(p, regs);
+                               kcb->kprobe_status = KPROBE_REENTER;
+                               return 1;
+                       }
+               } else {
+                       if (*addr != BREAKPOINT_INSTRUCTION) {
+                       /* The breakpoint instruction was removed by
+                        * another cpu right after we hit, no further
+                        * handling of this interrupt is appropriate
+                        */
+                               regs->rip = (unsigned long)addr;
+                               ret = 1;
+                               goto no_kprobe;
+                       }
+                       p = __get_cpu_var(current_kprobe);      /* no probe at addr: ask the running one */
+                       if (p->break_handler && p->break_handler(p, regs)) {
+                               goto ss_probe;
+                       }
+               }
+               goto no_kprobe;
+       }
+
+       p = get_kprobe(addr);
+       if (!p) {
+               if (*addr != BREAKPOINT_INSTRUCTION) {
+                       /*
+                        * The breakpoint instruction was removed right
+                        * after we hit it.  Another cpu has removed
+                        * either a probepoint or a debugger breakpoint
+                        * at this address.  In either case, no further
+                        * handling of this interrupt is appropriate.
+                        * Back up over the (now missing) int3 and run
+                        * the original instruction.
+                        */
+                       regs->rip = (unsigned long)addr;
+                       ret = 1;
+               }
+               /* Not one of ours: let kernel handle it */
+               goto no_kprobe;
+       }
+
+       set_current_kprobe(p, regs, kcb);
+       kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+       if (p->pre_handler && p->pre_handler(p, regs))
+               /* handler has already set things up, so skip ss setup */
+               return 1;
+
+ss_probe:
+       prepare_singlestep(p, regs);
+       kcb->kprobe_status = KPROBE_HIT_SS;
+       return 1;
+
+no_kprobe:
+       preempt_enable_no_resched();
+       return ret;
+}
+
+/*
+ * For function-return probes, init_kprobes() establishes a probepoint
+ * here. When a retprobed function returns, this probe is hit and
+ * trampoline_probe_handler() runs, calling the kretprobe's handler.
+ *
+ * The function body exists only to carry the asm below, which defines
+ * the global label kretprobe_trampoline on a single nop.
+ */
+ void kretprobe_trampoline_holder(void)
+ {
+       asm volatile (  ".global kretprobe_trampoline\n"
+                       "kretprobe_trampoline: \n"
+                       "nop\n");
+ }
+
+/*
+ * Called when we hit the probe point at kretprobe_trampoline
+ *
+ * Runs the handlers of the current task's return-probe instances up to
+ * and including the one holding the real return address, then points
+ * regs->rip at that address.  Always returns 1.
+ */
+int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       struct kretprobe_instance *ri = NULL;
+       struct hlist_head *head, empty_rp;      /* empty_rp collects instances to kfree() below */
+       struct hlist_node *node, *tmp;
+       unsigned long flags, orig_ret_address = 0;
+       unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
+
+       INIT_HLIST_HEAD(&empty_rp);
+       spin_lock_irqsave(&kretprobe_lock, flags);
+       head = kretprobe_inst_table_head(current);
+
+       /*
+        * It is possible to have multiple instances associated with a given
+        * task either because multiple functions in the call path
+        * have a return probe installed on them, and/or more than one
+        * return probe was registered for a target function.
+        *
+        * We can handle this because:
+        *     - instances are always inserted at the head of the list
+        *     - when multiple return probes are registered for the same
+        *       function, the first instance's ret_addr will point to the
+        *       real return address, and all the rest will point to
+        *       kretprobe_trampoline
+        */
+       hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+               if (ri->task != current)
+                       /* another task is sharing our hash bucket */
+                       continue;
+
+               if (ri->rp && ri->rp->handler)
+                       ri->rp->handler(ri, regs);
+
+               orig_ret_address = (unsigned long)ri->ret_addr;
+               recycle_rp_inst(ri, &empty_rp);
+
+               if (orig_ret_address != trampoline_address)
+                       /*
+                        * This is the real return address. Any other
+                        * instances associated with this task are for
+                        * other calls deeper on the call stack
+                        */
+                       break;
+       }
+
+       kretprobe_assert(ri, orig_ret_address, trampoline_address);
+       regs->rip = orig_ret_address;   /* resume at the address recorded in the instance */
+
+       reset_current_kprobe();
+       spin_unlock_irqrestore(&kretprobe_lock, flags);
+       preempt_enable_no_resched();
+
+       /* The instances on empty_rp are freed only after the lock is dropped. */
+       hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+               hlist_del(&ri->hlist);
+               kfree(ri);
+       }
+       /*
+        * By returning a non-zero value, we are telling
+        * kprobe_handler() that we don't want the post_handler
+        * to run (and have re-enabled preemption)
+        */
+       return 1;
+}
+
+/*
+ * Called after single-stepping.  p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int 3"
+ * instruction.  To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction.  The address of this
+ * copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * interrupt.  We have to fix up the stack as follows:
+ *
+ * 0) Except in the case of absolute or indirect jump or call instructions,
+ * the new rip is relative to the copied instruction.  We need to make
+ * it relative to the original instruction.
+ *
+ * 1) If the single-stepped instruction was pushfl, then the TF and IF
+ * flags are set in the just-pushed eflags, and may need to be cleared.
+ *
+ * 2) If the single-stepped instruction was a call, the return address
+ * that is atop the stack is the address following the copied instruction.
+ * We need to make it the address following the original instruction.
+ */
+static void __kprobes resume_execution(struct kprobe *p,
+               struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+{
+       unsigned long *tos = (unsigned long *)regs->rsp;        /* top of the interrupted stack */
+       unsigned long next_rip = 0;     /* non-zero: use as rip without rebasing */
+       unsigned long copy_rip = (unsigned long)p->ainsn.insn;
+       unsigned long orig_rip = (unsigned long)p->addr;
+       kprobe_opcode_t *insn = p->ainsn.insn;
+
+       /*skip the REX prefix*/
+       if (*insn >= 0x40 && *insn <= 0x4f)
+               insn++;
+
+       switch (*insn) {
+       case 0x9c:              /* pushfl */
+               *tos &= ~(TF_MASK | IF_MASK);
+               *tos |= kcb->kprobe_old_rflags; /* pushed image gets the pre-probe TF/IF */
+               break;
+       case 0xc3:              /* ret/lret */
+       case 0xcb:
+       case 0xc2:
+       case 0xca:
+               regs->eflags &= ~TF_MASK;
+               /* rip is already adjusted, no more changes required*/
+               return;
+       case 0xe8:              /* call relative - Fix return addr */
+               *tos = orig_rip + (*tos - copy_rip);
+               break;
+       case 0xff:
+               if ((insn[1] & 0x30) == 0x10) {
+                       /* call absolute, indirect */
+                       /* Fix return addr; rip is correct. */
+                       next_rip = regs->rip;
+                       *tos = orig_rip + (*tos - copy_rip);
+               } else if (((insn[1] & 0x31) == 0x20) ||        /* jmp near, absolute indirect */
+                          ((insn[1] & 0x31) == 0x21)) {        /* jmp far, absolute indirect */
+                       /* rip is correct. */
+                       next_rip = regs->rip;
+               }
+               break;
+       case 0xea:              /* jmp absolute -- rip is correct */
+               next_rip = regs->rip;
+               break;
+       default:
+               break;
+       }
+
+       regs->eflags &= ~TF_MASK;
+       if (next_rip) {
+               regs->rip = next_rip;
+       } else {
+               regs->rip = orig_rip + (regs->rip - copy_rip);  /* same offset, relative to the original */
+       }
+}
+
+/*
+ * Debug-trap half of kprobe handling; kprobe_exceptions_notify() calls it
+ * for DIE_DEBUG.
+ *
+ * Returns 0 when no kprobe is running on this CPU or when TF is still set
+ * in regs->eflags after the fixups, and 1 otherwise.
+ */
+int __kprobes post_kprobe_handler(struct pt_regs *regs)
+{
+       struct kprobe *probe = kprobe_running();
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       if (probe == NULL)
+               return 0;
+
+       /* The post handler is not run for a re-entered probe. */
+       if (kcb->kprobe_status != KPROBE_REENTER && probe->post_handler) {
+               kcb->kprobe_status = KPROBE_HIT_SSDONE;
+               probe->post_handler(probe, regs, 0);
+       }
+
+       resume_execution(probe, regs, kcb);
+       regs->eflags |= kcb->kprobe_saved_rflags;
+
+       /* Restore the original saved kprobes variables and continue. */
+       if (kcb->kprobe_status == KPROBE_REENTER)
+               restore_previous_kprobe(kcb);
+       else
+               reset_current_kprobe();
+       preempt_enable_no_resched();
+
+       /*
+        * If somebody else is singlestepping across a probe point, eflags
+        * will have TF set; in that case let do_debug continue its
+        * processing as if this were not a probe hit.
+        */
+       return (regs->eflags & TF_MASK) ? 0 : 1;
+}
+
+/*
+ * Fault hook for a running kprobe; kprobe_exceptions_notify() calls it for
+ * DIE_GPF and DIE_PAGE_FAULT while kprobe_running() is non-NULL.
+ *
+ * Returns 1 when the fault was dealt with here (by the probe's own
+ * fault_handler or by an exception-table fixup) and 0 when the normal
+ * fault handling should go on.
+ */
+int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+       struct kprobe *cur = kprobe_running();
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+       const struct exception_table_entry *fixup;
+
+       switch(kcb->kprobe_status) {
+       case KPROBE_HIT_SS:
+       case KPROBE_REENTER:
+               /*
+                * We are here because the instruction being single
+                * stepped caused a page fault. We reset the current
+                * kprobe and the rip points back to the probe address
+                * and allow the page fault handler to continue as a
+                * normal page fault.
+                */
+               regs->rip = (unsigned long)cur->addr;
+               regs->eflags |= kcb->kprobe_old_rflags;
+               if (kcb->kprobe_status == KPROBE_REENTER)
+                       restore_previous_kprobe(kcb);
+               else
+                       reset_current_kprobe();
+               preempt_enable_no_resched();
+               break;
+       case KPROBE_HIT_ACTIVE:
+       case KPROBE_HIT_SSDONE:
+               /*
+                * We increment the nmissed count for accounting,
+                * we can also use npre/npostfault count for accounting
+                * these specific fault cases.
+                */
+               kprobes_inc_nmissed_count(cur);
+
+               /*
+                * We come here because instructions in the pre/post
+                * handler caused the page_fault, this could happen
+                * if handler tries to access user space by
+                * copy_from_user(), get_user() etc. Let the
+                * user-specified handler try to fix it first.
+                */
+               if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+                       return 1;
+
+               /*
+                * In case the user-specified fault handler returned
+                * zero, try to fix up.
+                */
+               fixup = search_exception_tables(regs->rip);
+               if (fixup) {
+                       regs->rip = fixup->fixup;
+                       return 1;
+               }
+
+               /*
+                * fixup() could not handle it,
+                * Let do_page_fault() fix it.
+                */
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
+/*
+ * Wrapper routine for handling exceptions.
+ */
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+                                      unsigned long val, void *data)
+{
+       struct die_args *args = data;
+       int handled = 0;
+
+       /* Events raised from user mode are never claimed here. */
+       if (args->regs && user_mode(args->regs))
+               return NOTIFY_DONE;
+
+       switch (val) {
+       case DIE_INT3:
+               handled = kprobe_handler(args->regs);
+               break;
+       case DIE_DEBUG:
+               handled = post_kprobe_handler(args->regs);
+               break;
+       case DIE_GPF:
+       case DIE_PAGE_FAULT:
+               /* kprobe_running() needs smp_processor_id() */
+               preempt_disable();
+               handled = kprobe_running() &&
+                         kprobe_fault_handler(args->regs, args->trapnr);
+               preempt_enable();
+               break;
+       default:
+               break;
+       }
+
+       /* Stop the notifier chain only when a kprobe handler took the event. */
+       return handled ? NOTIFY_STOP : NOTIFY_DONE;
+}
+
+/*
+ * Pre-handler of a jprobe: snapshot the registers and the top of the
+ * stack, then redirect execution to the jprobe's entry function.
+ * Always returns 1.
+ */
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       struct jprobe *jp = container_of(p, struct jprobe, kp);
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+       unsigned long stack_addr;
+
+       kcb->jprobe_saved_regs = *regs;
+       kcb->jprobe_saved_rsp = (long *) regs->rsp;
+       stack_addr = (unsigned long)kcb->jprobe_saved_rsp;
+
+       /*
+        * gcc assumes the callee owns the argument space and may
+        * overwrite it (e.g. for tail calls), as Linus pointed out.
+        * To be absolutely safe, keep a copy of enough stack bytes to
+        * cover the argument area so it can be restored later.
+        */
+       memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)stack_addr,
+              MIN_STACK_SIZE(stack_addr));
+
+       regs->eflags &= ~IF_MASK;
+       regs->rip = (unsigned long)jp->entry;
+       return 1;
+}
+
+/*
+ * Switch the stack pointer to the value recorded in kcb->jprobe_saved_rsp
+ * and raise a breakpoint.  longjmp_break_handler() recognises the trap by
+ * checking that the faulting address lies between jprobe_return and the
+ * jprobe_return_end label emitted here.
+ */
+void __kprobes jprobe_return(void)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+       /*
+        * The "b" constraint places the saved stack pointer in rbx; xchg
+        * then swaps it into rsp before the int3 is executed.
+        */
+       asm volatile ("       xchg   %%rbx,%%rsp     \n"
+                     "       int3                      \n"
+                     "       .globl jprobe_return_end  \n"
+                     "       jprobe_return_end:        \n"
+                     "       nop                       \n"::"b"
+                     (kcb->jprobe_saved_rsp):"memory");
+}
+
+/*
+ * Break handler paired with setjmp_pre_handler().
+ *
+ * When the breakpoint was raised from inside jprobe_return() (the trapping
+ * byte lies between jprobe_return and jprobe_return_end), restore the
+ * registers and the stack bytes saved by setjmp_pre_handler() and return 1.
+ * Any other breakpoint is left alone and 0 is returned.
+ *
+ * A stack pointer that differs from the one saved at entry is reported and
+ * treated as fatal (BUG()).
+ */
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+       /* address of the byte before the reported rip */
+       u8 *addr = (u8 *) (regs->rip - 1);
+       unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
+       struct jprobe *jp = container_of(p, struct jprobe, kp);
+
+       if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
+               if ((long *)regs->rsp != kcb->jprobe_saved_rsp) {
+                       /*
+                        * jprobe_saved_rsp is the saved stack pointer VALUE,
+                        * not the address of a pt_regs.rsp member, so
+                        * container_of() on it would yield a bogus pt_regs.
+                        * The registers saved at entry are kept in
+                        * kcb->jprobe_saved_regs.
+                        */
+                       struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
+
+                       printk("current rsp %p does not match saved rsp %p\n",
+                              (long *)regs->rsp, kcb->jprobe_saved_rsp);
+                       printk("Saved registers for jprobe %p\n", jp);
+                       show_registers(saved_regs);
+                       printk("Current registers\n");
+                       show_registers(regs);
+                       BUG();
+               }
+               *regs = kcb->jprobe_saved_regs;
+               /* put back the stack bytes copied by setjmp_pre_handler() */
+               memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
+                      MIN_STACK_SIZE(stack_addr));
+               preempt_enable_no_resched();
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * kprobe placed on kretprobe_trampoline; trampoline_probe_handler runs as
+ * its pre-handler.  Registered by arch_init_kprobes().
+ */
+static struct kprobe trampoline_p = {
+       .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
+       .pre_handler = trampoline_probe_handler
+};
+
+/*
+ * Arch-specific kprobes initialisation: register the kprobe that sits on
+ * kretprobe_trampoline and hand back register_kprobe()'s result.
+ */
+int __init arch_init_kprobes(void)
+{
+       int ret;
+
+       ret = register_kprobe(&trampoline_p);
+       return ret;
+}
+
+/* Returns 1 when @p is placed on kretprobe_trampoline, 0 otherwise. */
+int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+{
+       return p->addr == (kprobe_opcode_t *)&kretprobe_trampoline;
+}
diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c
new file mode 100644 (file)
index 0000000..bc9ffd5
--- /dev/null
@@ -0,0 +1,252 @@
+/*
+ * linux/arch/x86_64/kernel/ldt.c
+ *
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Andi Kleen
+ * 
+ * This handles calls from both 32bit and 64bit mode.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+
+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
+/*
+ * Cross-CPU callback (see the smp_call_function() call in alloc_ldt()):
+ * reload the LDT from the mm that is active on this CPU, if there is one.
+ * The argument is unused.
+ */
+static void flush_ldt(void *null)
+{
+       struct mm_struct *active = current->active_mm;
+
+       if (!active)
+               return;
+       load_LDT(&active->context);
+}
+#endif
+
+/*
+ * Grow the LDT described by @pc so that it holds at least @mincount entries.
+ *
+ * Returns 0 when the table is already large enough or was grown, and
+ * -ENOMEM when the new table cannot be allocated.  When @reload is
+ * non-zero the new table is loaded on this CPU and, on SMP, flush_ldt()
+ * is run via smp_call_function() if the mm's cpu_vm_mask is not just the
+ * current CPU.
+ */
+static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
+{
+       void *oldldt;
+       void *newldt;
+       unsigned oldsize;
+
+       if (mincount <= (unsigned)pc->size)
+               return 0;
+       oldsize = pc->size;
+       /* round the entry count up to a multiple of 512 */
+       mincount = (mincount+511)&(~511);
+       /* tables larger than a page come from vmalloc, smaller ones from kmalloc */
+       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+       else
+               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+
+       if (!newldt)
+               return -ENOMEM;
+
+       /* carry over the existing entries and zero the newly added tail */
+       if (oldsize)
+               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+       oldldt = pc->ldt;
+       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
+       /* publish the new pointer before the new (larger) size */
+       wmb();
+       pc->ldt = newldt;
+       wmb();
+       pc->size = mincount;
+       wmb();
+       if (reload) {
+#ifdef CONFIG_SMP
+               cpumask_t mask;
+
+               preempt_disable();
+               mask = cpumask_of_cpu(smp_processor_id());
+               load_LDT(pc);
+               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+                       smp_call_function(flush_ldt, NULL, 1, 1);
+               preempt_enable();
+#else
+               load_LDT(pc);
+#endif
+       }
+       /* release the old table with the allocator that matches its size */
+       if (oldsize) {
+               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+                       vfree(oldldt);
+               else
+                       kfree(oldldt);
+       }
+       return 0;
+}
+
+/*
+ * Size @new to hold old->size entries and copy @old's table into it.
+ * Returns 0 on success or the negative error from alloc_ldt().
+ */
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+       int ret;
+
+       ret = alloc_ldt(new, old->size, 0);
+       if (ret >= 0) {
+               memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+               ret = 0;
+       }
+       return ret;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+       struct mm_struct *parent;
+       int ret;
+
+       init_MUTEX(&mm->context.sem);
+       mm->context.size = 0;
+
+       /* Nothing to inherit unless the current mm has LDT entries. */
+       parent = current->mm;
+       if (!parent || parent->context.size <= 0)
+               return 0;
+
+       /* Copy the LDT while holding the source context's semaphore. */
+       down(&parent->context.sem);
+       ret = copy_ldt(&mm->context, &parent->context);
+       up(&parent->context.sem);
+       return ret;
+}
+
+/*
+ * 
+ * Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+       if (!mm->context.size)
+               return;
+
+       /* Same size rule as the allocation: above a page means vmalloc. */
+       if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+               vfree(mm->context.ldt);
+       else
+               kfree(mm->context.ldt);
+       mm->context.size = 0;
+}
+
+/*
+ * Copy the current mm's LDT to user space.
+ *
+ * @bytecount is capped at LDT_ENTRY_SIZE*LDT_ENTRIES; any part of the
+ * request beyond the real table is zero-filled.  Returns 0 when the mm
+ * has no LDT, the (capped) byte count on success and -EFAULT when the
+ * user buffer cannot be written.
+ */
+static int read_ldt(void __user * ptr, unsigned long bytecount)
+{
+       struct mm_struct *mm = current->mm;
+       unsigned long ldt_bytes;
+       int copy_failed;
+
+       if (!mm->context.size)
+               return 0;
+       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+
+       down(&mm->context.sem);
+       ldt_bytes = mm->context.size*LDT_ENTRY_SIZE;
+       if (ldt_bytes > bytecount)
+               ldt_bytes = bytecount;
+       copy_failed = copy_to_user(ptr, mm->context.ldt, ldt_bytes) != 0;
+       up(&mm->context.sem);
+
+       if (copy_failed)
+               return -EFAULT;
+
+       /* zero-fill the rest */
+       if (ldt_bytes != bytecount &&
+           clear_user(ptr+ldt_bytes, bytecount-ldt_bytes) != 0)
+               return -EFAULT;
+
+       return bytecount;
+}
+
+/*
+ * The x86-64 default LDT is all zeros, so "reading" it just clears the
+ * user buffer.  At most 128 bytes (an arbitrary number) are written.
+ * Returns the number of bytes cleared or -EFAULT.
+ */
+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+{
+       unsigned long len = bytecount > 128 ? 128 : bytecount;
+
+       if (clear_user(ptr, len))
+               return -EFAULT;
+       return len;
+}
+
+/*
+ * Install one LDT entry described by the struct user_desc at @ptr.
+ *
+ * @bytecount must equal sizeof(struct user_desc).  @oldmode is 1 when
+ * called for sys_modify_ldt() func 1 and 0 for func 0x11.
+ *
+ * Returns 0 on success, -EINVAL for a bad size, entry number or contents
+ * value, -EFAULT when the descriptor cannot be read from user space, or
+ * the error from alloc_ldt() when the table cannot be grown.
+ */
+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+{
+       struct task_struct *me = current;
+       struct mm_struct * mm = me->mm;
+       __u32 entry_1, entry_2, *lp;
+       int error;
+       struct user_desc ldt_info;
+
+       error = -EINVAL;
+
+       if (bytecount != sizeof(ldt_info))
+               goto out;
+       error = -EFAULT;        
+       if (copy_from_user(&ldt_info, ptr, bytecount))
+               goto out;
+
+       error = -EINVAL;
+       if (ldt_info.entry_number >= LDT_ENTRIES)
+               goto out;
+       /* contents == 3 is refused in oldmode and must be marked not-present */
+       if (ldt_info.contents == 3) {
+               if (oldmode)
+                       goto out;
+               if (ldt_info.seg_not_present == 0)
+                       goto out;
+       }
+
+       down(&mm->context.sem);
+       /* grow the table (and reload it) when the entry lies beyond its end */
+       if (ldt_info.entry_number >= (unsigned)mm->context.size) {
+               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+               if (error < 0)
+                       goto out_unlock;
+       }
+
+       /* each entry is 8 bytes (two __u32 words), hence the shift by 3 */
+       lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
+
+       /* Allow LDTs to be cleared by the user. */
+       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+               if (oldmode || LDT_empty(&ldt_info)) {
+                       entry_1 = 0;
+                       entry_2 = 0;
+                       goto install;
+               }
+       }
+
+       entry_1 = LDT_entry_a(&ldt_info);
+       entry_2 = LDT_entry_b(&ldt_info);
+       /*
+        * NOTE(review): oldmode clears bit 20 of the high word, presumably
+        * the AVL ("useable") bit - confirm against LDT_entry_b().
+        */
+       if (oldmode)
+               entry_2 &= ~(1 << 20);
+
+       /* Install the new entry ...  */
+install:
+       *lp     = entry_1;
+       *(lp+1) = entry_2;
+       error = 0;
+
+out_unlock:
+       up(&mm->context.sem);
+out:
+       return error;
+}
+
+/*
+ * modify_ldt system call: dispatch on @func.
+ *   0    - read the LDT
+ *   1    - write an entry, old mode
+ *   2    - read the default LDT
+ *   0x11 - write an entry, new mode
+ * Any other value yields -ENOSYS.
+ */
+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+{
+       switch (func) {
+       case 0:
+               return read_ldt(ptr, bytecount);
+       case 1:
+               return write_ldt(ptr, bytecount, 1);
+       case 2:
+               return read_default_ldt(ptr, bytecount);
+       case 0x11:
+               return write_ldt(ptr, bytecount, 0);
+       default:
+               return -ENOSYS;
+       }
+}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
new file mode 100644 (file)
index 0000000..c3a5547
--- /dev/null
@@ -0,0 +1,259 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+
+/* Place an object on a page boundary. */
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+/*
+ * Statically allocated, page-aligned tables of 512 64-bit entries each.
+ * machine_kexec() passes their physical and virtual addresses to
+ * relocate_kernel() through page_list[] (PA_PGD/VA_PGD and friends).
+ */
+static u64 kexec_pgd[512] PAGE_ALIGNED;
+static u64 kexec_pud0[512] PAGE_ALIGNED;
+static u64 kexec_pmd0[512] PAGE_ALIGNED;
+static u64 kexec_pte0[512] PAGE_ALIGNED;
+static u64 kexec_pud1[512] PAGE_ALIGNED;
+static u64 kexec_pmd1[512] PAGE_ALIGNED;
+static u64 kexec_pte1[512] PAGE_ALIGNED;
+
+/*
+ * Fill one pmd page so that it maps the PUD_SIZE range starting at @addr
+ * (rounded down to a page boundary) with large executable kernel pages,
+ * one entry per PMD_SIZE step.
+ */
+static void init_level2_page(pmd_t *level2p, unsigned long addr)
+{
+       unsigned long cur, stop;
+
+       cur = addr & PAGE_MASK;
+       stop = cur + PUD_SIZE;
+       for (; cur < stop; cur += PMD_SIZE, level2p++)
+               set_pmd(level2p, __pmd(cur | __PAGE_KERNEL_LARGE_EXEC));
+}
+
+/*
+ * Populate one pud page covering PGDIR_SIZE bytes from @addr.  A pmd page
+ * is allocated from the image's control pages for every PUD_SIZE step
+ * below @last_addr; the remaining entries are cleared.
+ * Returns 0 on success or -ENOMEM when a control page cannot be allocated.
+ */
+static int init_level3_page(struct kimage *image, pud_t *level3p,
+                               unsigned long addr, unsigned long last_addr)
+{
+       unsigned long end_addr;
+
+       addr &= PAGE_MASK;
+       end_addr = addr + PGDIR_SIZE;
+
+       for (; addr < last_addr && addr < end_addr; addr += PUD_SIZE) {
+               struct page *page = kimage_alloc_control_pages(image, 0);
+               pmd_t *level2p;
+
+               if (!page)
+                       return -ENOMEM;
+
+               level2p = (pmd_t *)page_address(page);
+               init_level2_page(level2p, addr);
+               set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+       }
+
+       /* clear the unused entries */
+       for (; addr < end_addr; addr += PUD_SIZE)
+               pud_clear(level3p++);
+
+       return 0;
+}
+
+
+/*
+ * Populate the top-level page table at @level4p for addresses from @addr
+ * up to @last_addr.  A pud page is allocated and filled for every
+ * PGDIR_SIZE step; entries past @last_addr are cleared.
+ * Returns 0 on success, -ENOMEM when a control page cannot be allocated,
+ * or the error reported by init_level3_page().
+ */
+static int init_level4_page(struct kimage *image, pgd_t *level4p,
+                               unsigned long addr, unsigned long last_addr)
+{
+       unsigned long end_addr;
+
+       addr &= PAGE_MASK;
+       end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
+
+       for (; addr < last_addr && addr < end_addr; addr += PGDIR_SIZE) {
+               struct page *page = kimage_alloc_control_pages(image, 0);
+               pud_t *level3p;
+               int err;
+
+               if (!page)
+                       return -ENOMEM;
+
+               level3p = (pud_t *)page_address(page);
+               err = init_level3_page(image, level3p, addr, last_addr);
+               if (err)
+                       return err;
+               set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+       }
+
+       /* clear the unused entries */
+       for (; addr < end_addr; addr += PGDIR_SIZE)
+               pgd_clear(level4p++);
+
+       return 0;
+}
+
+
+/*
+ * Build the page table rooted at physical address @start_pgtable so that
+ * it covers addresses 0 .. end_pfn << PAGE_SHIFT.
+ * Returns the result of init_level4_page().
+ */
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+       pgd_t *level4p = (pgd_t *)__va(start_pgtable);
+       unsigned long last_addr = end_pfn << PAGE_SHIFT;
+
+       return init_level4_page(image, level4p, 0, last_addr);
+}
+
+/*
+ * Load the IDT register with base @newidt and limit @limit.
+ */
+static void set_idt(void *newidt, u16 limit)
+{
+       struct desc_ptr curidt;
+
+       /* x86-64 supports unaligned loads & stores */
+       curidt.size    = limit;
+       curidt.address = (unsigned long)newidt;
+
+       __asm__ __volatile__ (
+               "lidtq %0\n"
+               : : "m" (curidt)
+               );
+}
+
+
+/*
+ * Load the GDT register with base @newgdt and limit @limit.
+ */
+static void set_gdt(void *newgdt, u16 limit)
+{
+       struct desc_ptr curgdt;
+
+       /* x86-64 supports unaligned loads & stores */
+       curgdt.size    = limit;
+       curgdt.address = (unsigned long)newgdt;
+
+       __asm__ __volatile__ (
+               "lgdtq %0\n"
+               : : "m" (curgdt)
+               );
+}
+
+/*
+ * Load the __KERNEL_DS selector into ds, es, ss, fs and gs.
+ * machine_kexec() calls this before it invalidates the GDT.
+ */
+static void load_segments(void)
+{
+       /* the "a" constraint supplies __KERNEL_DS in eax as operand %0 */
+       __asm__ __volatile__ (
+               "\tmovl %0,%%ds\n"
+               "\tmovl %0,%%es\n"
+               "\tmovl %0,%%ss\n"
+               "\tmovl %0,%%fs\n"
+               "\tmovl %0,%%gs\n"
+               : : "a" (__KERNEL_DS) : "memory"
+               );
+}
+
+/*
+ * Prepare @image for kexec: build the identity-mapped 64bit page table in
+ * the first control code page.  Returns 0 on success or the error from
+ * init_pgtable().
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+       unsigned long start_pgtable;
+
+       /* Physical address of the control code page */
+       start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+
+       return init_pgtable(image, start_pgtable);
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+       /* Nothing to undo. */
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+       /* address table handed to relocate_kernel(), indexed by PA_ and VA_ constants */
+       unsigned long page_list[PAGES_NR];
+       void *control_page;
+
+       /* Interrupts aren't acceptable while we reboot */
+       local_irq_disable();
+
+       /*
+        * The relocation code is copied into the second page of the
+        * control code area; the first page holds the page table set up
+        * by machine_kexec_prepare().
+        */
+       control_page = page_address(image->control_code_page) + PAGE_SIZE;
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       /* physical (PA_*) and virtual (VA_*) address of each page involved */
+       page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+       page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+       page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
+       page_list[VA_PGD] = (unsigned long)kexec_pgd;
+       page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
+       page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
+       page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
+       page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+       page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
+       page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+       page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
+       page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
+       page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
+       page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+       page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
+       page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+       /* physical address of the first control code page */
+       page_list[PA_TABLE_PAGE] =
+         (unsigned long)__pa(page_address(image->control_code_page));
+
+       /* The segment registers are funny things, they have both a
+        * visible and an invisible part.  Whenever the visible part is
+        * set to a specific selector, the invisible part is loaded
+        * from a table in memory.  At no other time is the
+        * descriptor table in memory accessed.
+        *
+        * I take advantage of this here by force loading the
+        * segments, before I zap the gdt with an invalid value.
+        */
+       load_segments();
+       /* The gdt & idt are now invalid.
+        * If you want to load them you must set up your own idt & gdt.
+        */
+       set_gdt(phys_to_virt(0),0);
+       set_idt(phys_to_virt(0),0);
+
+       /* now call it */
+       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+                       image->start);
+}
+
+/* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel.  By reserving this memory we guarantee
+ * that linux never sets it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+/*
+ * Parse "crashkernel=size@addr".  Returns -EINVAL when the argument is
+ * missing or does not start with a size, 0 otherwise.  crashk_res is only
+ * filled in when an "@addr" part is present.
+ */
+static int __init setup_crashkernel(char *arg)
+{
+       unsigned long size, base;
+       char *end;
+
+       if (!arg)
+               return -EINVAL;
+
+       size = memparse(arg, &end);
+       if (end == arg)
+               return -EINVAL;
+
+       if (*end != '@')
+               return 0;
+
+       base = memparse(end + 1, &end);
+       /* FIXME: Do I want a sanity check to validate the
+        * memory range?  Yes you do, but it's too early for
+        * e820 -AK */
+       crashk_res.start = base;
+       crashk_res.end   = base + size - 1;
+       return 0;
+}
+early_param("crashkernel", setup_crashkernel);
+
diff --git a/arch/x86/kernel/mce_64.c b/arch/x86/kernel/mce_64.c
new file mode 100644 (file)
index 0000000..a66d607
--- /dev/null
@@ -0,0 +1,875 @@
+/*
+ * Machine check handler.
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s). 
+ * 2004 Andi Kleen. Rewrote most of it. 
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
+#include <linux/sysdev.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/capability.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
+#include <linux/poll.h>
+#include <linux/thread_info.h>
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/kdebug.h>
+#include <asm/processor.h> 
+#include <asm/msr.h>
+#include <asm/mce.h>
+#include <asm/uaccess.h>
+#include <asm/smp.h>
+#include <asm/idle.h>
+
+#define MISC_MCELOG_MINOR 227
+/* size of the bank[] control array below */
+#define NR_BANKS 6
+
+/* incremented on every entry to do_machine_check() */
+atomic_t mce_entry;
+
+static int mce_dont_init;
+
+/*
+ * Tolerant levels:
+ *   0: always panic on uncorrected errors, log corrected errors
+ *   1: panic or SIGBUS on uncorrected errors, log corrected errors
+ *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
+ *   3: never panic or SIGBUS, log all errors (for testing only)
+ */
+static int tolerant = 1;
+/* number of banks scanned by do_machine_check(); the scan is skipped when 0 */
+static int banks;
+/* per-bank word; do_machine_check() skips a bank whose entry is 0 */
+static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
+/* bit 0 is set by mce_log() once a record has been stored */
+static unsigned long notify_user;
+/* when non-zero, mce_get_rip() reads the RIP from this MSR */
+static int rip_msr;
+static int mce_bootlog = 1;
+/* incremented on every call to mce_log() */
+static atomic_t mce_events;
+
+/* NOTE(review): presumably a user-space helper command line - confirm at the users */
+static char trigger[128];
+static char *trigger_argv[2] = { trigger, NULL };
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+/*
+ * The global MCE record buffer filled by mce_log().  The two positional
+ * initialisers are the log signature and the log length; the remaining
+ * members start out zeroed.
+ */
+struct mce_log mcelog = { 
+       MCE_LOG_SIGNATURE,
+       MCE_LOG_LEN,
+}; 
+
+/*
+ * Append a copy of *mce to the global mcelog buffer without taking a lock.
+ *
+ * A slot is claimed by advancing mcelog.next with cmpxchg(); the loop is
+ * retried when the cmpxchg() does not see the expected value.  When no
+ * free slot is left, MCE_OVERFLOW is set in mcelog.flags and the record
+ * is dropped.  Note that mce->finished in the caller's record is cleared.
+ */
+void mce_log(struct mce *mce)
+{
+       unsigned next, entry;
+       atomic_inc(&mce_events);
+       /* the record is copied with finished == 0; the flag is set last, below */
+       mce->finished = 0;
+       wmb();
+       for (;;) {
+               entry = rcu_dereference(mcelog.next);
+               /* The rmb forces the compiler to reload next in each
+                   iteration */
+               rmb();
+               for (;;) {
+                       /* When the buffer fills up discard new entries. Assume
+                          that the earlier errors are the more interesting. */
+                       if (entry >= MCE_LOG_LEN) {
+                               set_bit(MCE_OVERFLOW, &mcelog.flags);
+                               return;
+                       }
+                       /* Old left over entry. Skip. */
+                       if (mcelog.entry[entry].finished) {
+                               entry++;
+                               continue;
+                       }
+                       break;
+               }
+               smp_rmb();
+               next = entry + 1;
+               /* claim the slot; start over if mcelog.next changed meanwhile */
+               if (cmpxchg(&mcelog.next, entry, next) == entry)
+                       break;
+       }
+       memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+       /* the record contents are written before it is marked finished */
+       wmb();
+       mcelog.entry[entry].finished = 1;
+       wmb();
+
+       /* flag that there is something new to report */
+       set_bit(0, &notify_user);
+}
+
+/*
+ * Print one MCE record to the kernel log at KERN_EMERG.
+ *
+ * The RIP line is printed only when m->rip is non-zero; it carries
+ * " !INEXACT!" when MCG_STATUS_EIPV is clear in m->mcgstatus, and the
+ * symbol name when m->cs is __KERNEL_CS.  ADDR and MISC are printed only
+ * when non-zero.
+ */
+static void print_mce(struct mce *m)
+{
+       printk(KERN_EMERG "\n"
+              KERN_EMERG "HARDWARE ERROR\n"
+              KERN_EMERG
+              "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+              m->cpu, m->mcgstatus, m->bank, m->status);
+       if (m->rip) {
+               printk(KERN_EMERG 
+                      "RIP%s %02x:<%016Lx> ",
+                      !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+                      m->cs, m->rip);
+               if (m->cs == __KERNEL_CS)
+                       print_symbol("{%s}", m->rip);
+               printk("\n");
+       }
+       /* TSC, ADDR and MISC share one output line, ended by the "\n" below */
+       printk(KERN_EMERG "TSC %Lx ", m->tsc); 
+       if (m->addr)
+               printk("ADDR %Lx ", m->addr);
+       if (m->misc)
+               printk("MISC %Lx ", m->misc);   
+       printk("\n");
+       printk(KERN_EMERG "This is not a software problem!\n");
+        printk(KERN_EMERG
+    "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+}
+
+/*
+ * Print the logged machine checks and panic with @msg.
+ *
+ * Every mcelog entry whose TSC is not before @start is printed.  @backup,
+ * when non-NULL, is printed as well unless an entry with the same TSC was
+ * already printed from the log.
+ */
+static void mce_panic(char *msg, struct mce *backup, unsigned long start)
+{
+       int i;
+
+       oops_begin();
+       for (i = 0; i < MCE_LOG_LEN; i++) {
+               unsigned long tsc = mcelog.entry[i].tsc;
+               if (time_before(tsc, start))
+                       continue;
+               print_mce(&mcelog.entry[i]);
+               /* the backup record made it into the log; don't print it twice */
+               if (backup && mcelog.entry[i].tsc == backup->tsc)
+                       backup = NULL;
+       }
+       if (backup)
+               print_mce(backup);
+       /* never hand a non-literal string to panic() as the format */
+       panic("%s", msg);
+}
+
+/* Returns 1 when @c has both the MCE and the MCA feature bits, else 0. */
+static int mce_available(struct cpuinfo_x86 *c)
+{
+       if (!cpu_has(c, X86_FEATURE_MCE))
+               return 0;
+       return cpu_has(c, X86_FEATURE_MCA) ? 1 : 0;
+}
+
+/*
+ * Fill in m->rip and m->cs.  They are taken from @regs when @regs is
+ * non-NULL and MCG_STATUS_RIPV is set in m->mcgstatus, and zeroed
+ * otherwise.  When rip_msr is configured, the RIP read from that MSR
+ * overrides the result, EIPV is forced on and cs is zeroed.
+ */
+static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
+{
+       int use_regs = regs && (m->mcgstatus & MCG_STATUS_RIPV);
+
+       m->rip = use_regs ? regs->rip : 0;
+       m->cs = use_regs ? regs->cs : 0;
+
+       if (!rip_msr)
+               return;
+
+       /* Assume the RIP in the MSR is exact. Is this true? */
+       m->mcgstatus |= MCG_STATUS_EIPV;
+       rdmsrl(rip_msr, m->rip);
+       m->cs = 0;
+}
+
+/* 
+ * The actual machine check handler
+ */
+
+void do_machine_check(struct pt_regs * regs, long error_code)
+{
+       struct mce m, panicm;
+       u64 mcestart = 0;
+       int i;
+       int panicm_found = 0;
+       /*
+        * If no_way_out gets set, there is no safe way to recover from this
+        * MCE.  If tolerant is cranked up, we'll try anyway.
+        */
+       int no_way_out = 0;
+       /*
+        * If kill_it gets set, there might be a way to recover from this
+        * error.
+        */
+       int kill_it = 0;
+
+       atomic_inc(&mce_entry);
+
+       if (regs)
+               notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
+       if (!banks)
+               goto out2;
+
+       memset(&m, 0, sizeof(struct mce));
+       m.cpu = smp_processor_id();
+       rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+       /* if the restart IP is not valid, we're done for */
+       if (!(m.mcgstatus & MCG_STATUS_RIPV))
+               no_way_out = 1;
+       
+       rdtscll(mcestart);
+       barrier();
+
+       for (i = 0; i < banks; i++) {
+               if (!bank[i])
+                       continue;
+               
+               m.misc = 0; 
+               m.addr = 0;
+               m.bank = i;
+               m.tsc = 0;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+               if ((m.status & MCI_STATUS_VAL) == 0)
+                       continue;
+
+               if (m.status & MCI_STATUS_EN) {
+                       /* if PCC was set, there's no way out */
+                       no_way_out |= !!(m.status & MCI_STATUS_PCC);
+                       /*
+                        * If this error was uncorrectable and there was
+                        * an overflow, we're in trouble.  If no overflow,
+                        * we might get away with just killing a task.
+                        */
+                       if (m.status & MCI_STATUS_UC) {
+                               if (tolerant < 1 || m.status & MCI_STATUS_OVER)
+                                       no_way_out = 1;
+                               kill_it = 1;
+                       }
+               }
+
+               if (m.status & MCI_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+               if (m.status & MCI_STATUS_ADDRV)
+                       rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+               mce_get_rip(&m, regs);
+               if (error_code >= 0)
+                       rdtscll(m.tsc);
+               if (error_code != -2)
+                       mce_log(&m);
+
+               /* Did this bank cause the exception? */
+               /* Assume that the bank with uncorrectable errors did it,
+                  and that there is only a single one. */
+               if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
+                       panicm = m;
+                       panicm_found = 1;
+               }
+
+               add_taint(TAINT_MACHINE_CHECK);
+       }
+
+       /* Never do anything final in the polling timer */
+       if (!regs)
+               goto out;
+
+       /* If we didn't find an uncorrectable error, pick
+          the last one (shouldn't happen, just being safe). */
+       if (!panicm_found)
+               panicm = m;
+
+       /*
+        * If we have decided that we just CAN'T continue, and the user
+        *  has not set tolerant to an insane level, give up and die.
+        */
+       if (no_way_out && tolerant < 3)
+               mce_panic("Machine check", &panicm, mcestart);
+
+       /*
+        * If the error seems to be unrecoverable, something should be
+        * done.  Try to kill as little as possible.  If we can kill just
+        * one task, do that.  If the user has set the tolerance very
+        * high, don't try to do anything at all.
+        */
+       if (kill_it && tolerant < 3) {
+               int user_space = 0;
+
+               /*
+                * If the EIPV bit is set, it means the saved IP is the
+                * instruction which caused the MCE.
+                */
+               if (m.mcgstatus & MCG_STATUS_EIPV)
+                       user_space = panicm.rip && (panicm.cs & 3);
+
+               /*
+                * If we know that the error was in user space, send a
+                * SIGBUS.  Otherwise, panic if tolerance is low.
+                *
+                * do_exit() takes an awful lot of locks and has a slight
+                * risk of deadlocking.
+                */
+               if (user_space) {
+                       do_exit(SIGBUS);
+               } else if (panic_on_oops || tolerant < 2) {
+                       mce_panic("Uncorrected machine check",
+                               &panicm, mcestart);
+               }
+       }
+
+       /* notify userspace ASAP */
+       set_thread_flag(TIF_MCE_NOTIFY);
+
+ out:
+       /* the last thing we do is clear state */
+       for (i = 0; i < banks; i++)
+               wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+       wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ out2:
+       atomic_dec(&mce_entry);
+}
+
+#ifdef CONFIG_X86_MCE_INTEL
+/**
+ * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
+ * @cpu: The CPU on which the event occurred.
+ * @status: Event status information
+ *
+ * This function should be called by the thermal interrupt after the
+ * event has been processed and the decision was made to log the event
+ * further.
+ *
+ * The status parameter will be saved to the 'status' field of 'struct mce'
+ * and historically has been the register value of the
+ * MSR_IA32_THERMAL_STATUS (Intel) msr.
+ */
+void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+{
+       struct mce m;
+
+       /* Start from an all-zero record; only the fields below are filled in. */
+       memset(&m, 0, sizeof(m));
+       m.cpu = cpu;
+       /* Tag the record with MCE_THERMAL_BANK rather than a scanned bank index. */
+       m.bank = MCE_THERMAL_BANK;
+       m.status = status;
+       /* Timestamp the record with the current TSC before logging it. */
+       rdtscll(m.tsc);
+       mce_log(&m);
+}
+#endif /* CONFIG_X86_MCE_INTEL */
+
+/*
+ * Periodic polling timer for "silent" machine check errors.  If the
+ * poller finds an MCE, poll 2x faster.  When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+
+/* Upper bound for the polling period, in seconds (multiplied by HZ on use). */
+static int check_interval = 5 * 60; /* 5 minutes */
+/* Current delay between two polls; halved/doubled by mcheck_timer(). */
+static int next_interval; /* in jiffies */
+/* Forward declaration: the work item below needs the handler's address. */
+static void mcheck_timer(struct work_struct *work);
+static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+
+/*
+ * Poll step run on each CPU by on_each_cpu(): scan the machine check
+ * state in polling mode (no register frame, error code 0) when
+ * mce_available() reports true for the current CPU.
+ */
+static void mcheck_check_cpu(void *info)
+{
+       if (!mce_available(&current_cpu_data))
+               return;
+
+       do_machine_check(NULL, 0);
+}
+
+/*
+ * Delayed-work handler for the periodic poll.  Scans every CPU, then
+ * adapts the polling period and re-arms itself.
+ */
+static void mcheck_timer(struct work_struct *work)
+{
+       int logged;
+
+       on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
+
+       /*
+        * Alert userspace if needed.  A logged MCE halves the polling
+        * interval (never below HZ/100); a quiet poll doubles it, capped
+        * at the rounded check_interval.
+        */
+       logged = mce_notify_user();
+       if (logged)
+               next_interval = max(next_interval/2, HZ/100);
+       else
+               next_interval = min(next_interval*2,
+                               (int)round_jiffies_relative(check_interval*HZ));
+
+       schedule_delayed_work(&mcheck_work, next_interval);
+}
+
+/*
+ * Process-context hook that tells userspace about newly logged MCEs.
+ * Reached directly from the poller and, through TIF_MCE_NOTIFY, from
+ * entry.S and the idle loop.
+ *
+ * Returns 1 if a notification was pending (and has now been delivered),
+ * 0 otherwise.
+ */
+int mce_notify_user(void)
+{
+       static unsigned long last_print;
+       unsigned long now;
+
+       clear_thread_flag(TIF_MCE_NOTIFY);
+
+       /* Nothing pending: bit 0 of notify_user was not set */
+       if (!test_and_clear_bit(0, &notify_user))
+               return 0;
+
+       now = jiffies;
+
+       /* Wake readers sleeping on the log device */
+       wake_up_interruptible(&mce_wait);
+
+       /* Fire the user-configured trigger program, if one is set */
+       if (trigger[0])
+               call_usermodehelper(trigger, trigger_argv, NULL,
+                                       UMH_NO_WAIT);
+
+       /* Rate-limit the console message to one per check_interval */
+       if (time_after_eq(now, last_print + (check_interval*HZ))) {
+               last_print = now;
+               printk(KERN_INFO "Machine check events logged\n");
+       }
+
+       return 1;
+}
+
+/* Idle notifier: deliver a pending MCE notification when idle ends */
+static int
+mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
+{
+       /* IDLE_END should be safe - interrupts are back on */
+       if (action != IDLE_END)
+               return NOTIFY_OK;
+
+       if (test_thread_flag(TIF_MCE_NOTIFY))
+               mce_notify_user();
+
+       return NOTIFY_OK;
+}
+
+/* Notifier block registered with idle_notifier_register() at init time. */
+static struct notifier_block mce_idle_notifier = {
+       .notifier_call = mce_idle_callback,
+};
+
+/*
+ * Boot-time setup of the poller: arm the first delayed poll (unless
+ * check_interval is 0) and hook into the idle notifier chain.
+ */
+static __init int periodic_mcheck_init(void)
+{
+       next_interval = check_interval * HZ;
+
+       if (next_interval != 0) {
+               unsigned long delay = round_jiffies_relative(next_interval);
+
+               schedule_delayed_work(&mcheck_work, delay);
+       }
+
+       idle_notifier_register(&mce_idle_notifier);
+       return 0;
+}
+__initcall(periodic_mcheck_init);
+
+
+/*
+ * Initialize Machine Checks for a CPU.
+ *
+ * Reads the bank count and capabilities from MSR_IA32_MCG_CAP, logs (or
+ * silently clears) whatever was left in the banks, enables CR4.MCE and
+ * programs the per-bank control registers from bank[].
+ *
+ * @dummy is unused; the signature matches what on_each_cpu() expects.
+ */
+static void mce_init(void *dummy)
+{
+       u64 cap;
+       int i;
+
+       rdmsrl(MSR_IA32_MCG_CAP, cap);
+       banks = cap & 0xff;
+       if (banks > NR_BANKS) {
+               /*
+                * Report the number of banks that will actually be used
+                * (NR_BANKS), not the larger count the hardware advertised.
+                */
+               printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
+               banks = NR_BANKS;
+       }
+       /* Use accurate RIP reporting if available. */
+       if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
+               rip_msr = MSR_IA32_MCG_EIP;
+
+       /* Log the machine checks left over from the previous reset.
+          This also clears all registers.  -1 logs the records, -2
+          suppresses logging (see the error_code checks in
+          do_machine_check()). */
+       do_machine_check(NULL, mce_bootlog ? -1 : -2);
+
+       set_in_cr4(X86_CR4_MCE);
+
+       /* Enable all global control bits when MCG_CTL is present */
+       if (cap & MCG_CTL_P)
+               wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+
+       /* Per bank: apply the configured control mask, clear stale status */
+       for (i = 0; i < banks; i++) {
+               wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+               wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+       }
+}
+
+/* Add per CPU specific workarounds here */
+static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+{
+       /* So far only AMD family 15 needs fixing up */
+       if (c->x86_vendor != X86_VENDOR_AMD || c->x86 != 15)
+               return;
+
+       /*
+        * This should be disabled by the BIOS, but isn't always:
+        * disable GART TBL walk error reporting, which trips off
+        * incorrectly with the IOMMU & 3ware & Cerberus.
+        */
+       clear_bit(10, &bank[4]);
+
+       /*
+        * Lots of broken BIOS around that don't clear them
+        * by default and leave crap in there. Don't log.
+        */
+       mce_bootlog = 0;
+}
+
+/* Hand off to the vendor-specific feature setup; other vendors get none */
+static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
+{
+       if (c->x86_vendor == X86_VENDOR_INTEL)
+               mce_intel_feature_init(c);
+       else if (c->x86_vendor == X86_VENDOR_AMD)
+               mce_amd_feature_init(c);
+}
+
+/*
+ * Per-CPU machine check bring-up, invoked for each booted CPU.
+ * The caller must have preemption disabled.
+ */
+void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
+{
+       /* CPUs that have already been through the init below */
+       static cpumask_t mce_cpus = CPU_MASK_NONE;
+
+       /* Quirks are applied unconditionally, before any bail-out */
+       mce_cpu_quirks(c);
+
+       if (mce_dont_init)
+               return;
+       /* Initialize each CPU only once */
+       if (cpu_test_and_set(smp_processor_id(), mce_cpus))
+               return;
+       if (!mce_available(c))
+               return;
+
+       mce_init(NULL);
+       mce_cpu_features(c);
+}
+
+/*
+ * Character device to read and clear the MCE log.
+ */
+
+/* Held by mce_open()/mce_release() while they update the two counters below */
+static DEFINE_SPINLOCK(mce_state_lock);
+static int open_count; /* #times opened */
+static int open_exclu; /* already open exclusive? */
+
+/*
+ * open() for the log device.  Fails with -EBUSY when the device is
+ * already held exclusively, or when O_EXCL is requested while other
+ * openers exist; otherwise accounts for the new opener.
+ */
+static int mce_open(struct inode *inode, struct file *file)
+{
+       int want_excl = file->f_flags & O_EXCL;
+       int busy;
+
+       spin_lock(&mce_state_lock);
+
+       busy = open_exclu || (open_count && want_excl);
+       if (!busy) {
+               if (want_excl)
+                       open_exclu = 1;
+               open_count++;
+       }
+
+       spin_unlock(&mce_state_lock);
+
+       if (busy)
+               return -EBUSY;
+
+       return nonseekable_open(inode, file);
+}
+
+/*
+ * release() for the log device: drop one opener and clear the exclusive
+ * flag, both under mce_state_lock.  Always returns 0.
+ */
+static int mce_release(struct inode *inode, struct file *file)
+{
+       spin_lock(&mce_state_lock);
+
+       open_count--;
+       /* Cleared unconditionally, whether or not this opener set it */
+       open_exclu = 0;
+
+       spin_unlock(&mce_state_lock);
+
+       return 0;
+}
+
+/*
+ * on_each_cpu() callback: store the executing CPU's TSC into the slot
+ * of the array passed in @data that is indexed by that CPU's number.
+ */
+static void collect_tscs(void *data)
+{
+       unsigned long *tsc_by_cpu = data;
+       int this_cpu = smp_processor_id();
+
+       rdtscll(tsc_by_cpu[this_cpu]);
+}
+
+/*
+ * read() for the log device: copy the logged records to userspace and
+ * clear them from the log.
+ *
+ * Only whole-log reads are accepted: *off must be 0 and usize must be
+ * large enough for MCE_LOG_LEN records, otherwise -EINVAL is returned.
+ * Returns the number of bytes copied, -ENOMEM if the scratch array
+ * cannot be allocated, or -EFAULT if any copy to userspace failed.
+ */
+static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
+{
+       unsigned long *cpu_tsc;
+       static DECLARE_MUTEX(mce_read_sem);
+       unsigned next;
+       char __user *buf = ubuf;
+       int i, err;
+
+       /* One TSC slot per possible CPU, filled in by collect_tscs() below */
+       cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
+       if (!cpu_tsc)
+               return -ENOMEM;
+
+       /* Only one reader at a time walks and clears the log */
+       down(&mce_read_sem); 
+       next = rcu_dereference(mcelog.next);
+
+       /* Only supports full reads right now */
+       if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 
+               up(&mce_read_sem);
+               kfree(cpu_tsc);
+               return -EINVAL;
+       }
+
+       err = 0;
+       /* First pass: entries below the 'next' index sampled above */
+       for (i = 0; i < next; i++) {            
+               unsigned long start = jiffies;
+               /*
+                * Wait for the entry's 'finished' flag; if it is still
+                * unset after 2 jiffies, wipe the entry and skip it.
+                */
+               while (!mcelog.entry[i].finished) {
+                       if (time_after_eq(jiffies, start + 2)) {
+                               memset(mcelog.entry + i,0, sizeof(struct mce));
+                               goto timeout;
+                       }
+                       cpu_relax();
+               }
+               /* Read barrier between seeing 'finished' and copying the entry */
+               smp_rmb();
+               err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
+               buf += sizeof(struct mce); 
+ timeout:
+               ;
+       } 
+
+       /* Clear what was just walked and restart the log at index 0 */
+       memset(mcelog.entry, 0, next * sizeof(struct mce));
+       mcelog.next = 0;
+
+       synchronize_sched();
+
+       /* Collect entries that were still getting written before the synchronize. */
+
+       on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
+       /*
+        * Second pass: take finished entries at or above the old 'next'
+        * whose timestamp is older than the TSC just sampled on the CPU
+        * recorded in the entry.
+        */
+       for (i = next; i < MCE_LOG_LEN; i++) { 
+               if (mcelog.entry[i].finished && 
+                   mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {  
+                       err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
+                       smp_rmb();
+                       buf += sizeof(struct mce);
+                       memset(&mcelog.entry[i], 0, sizeof(struct mce));
+               }
+       }       
+       up(&mce_read_sem);
+       kfree(cpu_tsc);
+       /* Any failed copy turns the whole read into -EFAULT */
+       return err ? -EFAULT : buf - ubuf; 
+}
+
+/* poll() for the log device: readable whenever the log index is non-zero */
+static unsigned int mce_poll(struct file *file, poll_table *wait)
+{
+       unsigned int mask = 0;
+
+       poll_wait(file, &mce_wait, wait);
+       if (rcu_dereference(mcelog.next))
+               mask = POLLIN | POLLRDNORM;
+
+       return mask;
+}
+
+/*
+ * ioctl() for the log device.  Requires CAP_SYS_ADMIN (-EPERM
+ * otherwise).  Each supported command writes one int through the user
+ * pointer in @arg; unknown commands return -ENOTTY.
+ */
+static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
+{
+       int __user *uptr = (int __user *)arg;
+       unsigned snapshot;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (cmd == MCE_GET_RECORD_LEN)
+               return put_user(sizeof(struct mce), uptr);
+
+       if (cmd == MCE_GET_LOG_LEN)
+               return put_user(MCE_LOG_LEN, uptr);
+
+       if (cmd != MCE_GETCLEAR_FLAGS)
+               return -ENOTTY;
+
+       /* Fetch the flags and zero them in one atomic step; retry on a race */
+       do {
+               snapshot = mcelog.flags;
+       } while (cmpxchg(&mcelog.flags, snapshot, 0) != snapshot);
+
+       return put_user(snapshot, uptr);
+}
+
+/* File operations of the log character device */
+static const struct file_operations mce_chrdev_ops = {
+       .open = mce_open,
+       .release = mce_release,
+       .read = mce_read,
+       .poll = mce_poll,
+       .ioctl = mce_ioctl,
+};
+
+/* Misc device for the log, initialized positionally: minor, name, fops */
+static struct miscdevice mce_log_device = {
+       MISC_MCELOG_MINOR,
+       "mcelog",
+       &mce_chrdev_ops,
+};
+
+/* CR4 value saved by stop_mce() so restart_mce() can tell if MCE was on */
+static unsigned long old_cr4 __initdata;
+
+/* Remember the current CR4 and turn off its MCE bit. */
+void __init stop_mce(void)
+{
+       old_cr4 = read_cr4();
+       clear_in_cr4(X86_CR4_MCE);
+}
+
+/* Turn CR4.MCE back on, but only if it was set when stop_mce() ran. */
+void __init restart_mce(void)
+{
+       if (!(old_cr4 & X86_CR4_MCE))
+               return;
+
+       set_in_cr4(X86_CR4_MCE);
+}
+
+/* 
+ * Old style boot options parsing. Only for compatibility. 
+ */
+
+/* "nomce" boot option: skip machine check initialization. @str is unused. */
+static int __init mcheck_disable(char *str)
+{
+       mce_dont_init = 1;
+       return 1;
+}
+
+/* mce=off disables machine check. Note you can reenable it later
+   using sysfs.
+   mce=TOLERANCELEVEL (number, see above)
+   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+   mce=nobootlog Don't log MCEs from before booting.
+   Any other argument is reported and ignored.  Always returns 1. */
+static int __init mcheck_enable(char *str)
+{
+       /* Accept both "mce=foo" and "mcefoo" spellings of the argument */
+       if (*str == '=')
+               str++;
+       if (!strcmp(str, "off"))
+               mce_dont_init = 1;
+       else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
+               /* "bootlog" starts with 'b', "nobootlog" with 'n' */
+               mce_bootlog = str[0] == 'b';
+       else if (isdigit(str[0]))
+               get_option(&str, &tolerant);
+       else
+               /* Terminate the line so the next message starts on its own */
+               printk("mce= argument %s ignored. Please use /sys\n", str);
+       return 1;
+}
+
+/* Boot parameters: "nomce" -> mcheck_disable(), "mce..." -> mcheck_enable() */
+__setup("nomce", mcheck_disable);
+__setup("mce", mcheck_enable);
+
+/* 
+ * Sysfs support
+ */ 
+
+/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+   Only one CPU is active at this time, the others get readded later using
+   CPU hotplug.
+   Re-runs mce_init() on the calling CPU and always returns 0. */
+static int mce_resume(struct sys_device *dev)
+{
+       mce_init(NULL);
+       return 0;
+}
+
+/* Reinit MCEs after user configuration changes.
+   Cancels a pending poll, re-runs mce_init() on every CPU, then re-arms
+   the poller from check_interval (0 leaves polling off). */
+static void mce_restart(void) 
+{ 
+       if (next_interval)
+               cancel_delayed_work(&mcheck_work);
+       /* Timer race is harmless here */
+       on_each_cpu(mce_init, NULL, 1, 1);       
+       /* Start again from the full interval, dropping any backoff state */
+       next_interval = check_interval * HZ;
+       if (next_interval)
+               schedule_delayed_work(&mcheck_work,
+                                     round_jiffies_relative(next_interval));
+}
+
+/* sysdev class named "machinecheck"; its resume hook re-runs mce_init() */
+static struct sysdev_class mce_sysclass = {
+       .resume = mce_resume,
+       set_kset_name("machinecheck"),
+};
+
+/* One sys_device per CPU, set up by mce_create_device() */
+DEFINE_PER_CPU(struct sys_device, device_mce);
+
+/* Why are there no generic functions for this? */
+/*
+ * ACCESSOR(name, var, start) generates a sysdev attribute attr_<name>
+ * (mode 0644) backed by the variable 'var':
+ *   show_<name>() prints var in hex followed by a newline;
+ *   set_<name>()  parses a number with simple_strtoul() (base auto-detected),
+ *                 returns -EINVAL if nothing was parsed, otherwise stores
+ *                 it in var, runs the 'start' statement and returns the
+ *                 number of characters consumed.
+ */
+#define ACCESSOR(name, var, start) \
+       static ssize_t show_ ## name(struct sys_device *s, char *buf) {                    \
+               return sprintf(buf, "%lx\n", (unsigned long)var);                  \
+       }                                                                          \
+       static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
+               char *end;                                                         \
+               unsigned long new = simple_strtoul(buf, &end, 0);                  \
+               if (end == buf) return -EINVAL;                                    \
+               var = new;                                                         \
+               start;                                                             \
+               return end-buf;                                                    \
+       }                                                                          \
+       static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
+
+/* TBD should generate these dynamically based on number of available banks */
+/* Writing any bankNctl value re-initializes MCE via mce_restart() */
+ACCESSOR(bank0ctl,bank[0],mce_restart())
+ACCESSOR(bank1ctl,bank[1],mce_restart())
+ACCESSOR(bank2ctl,bank[2],mce_restart())
+ACCESSOR(bank3ctl,bank[3],mce_restart())
+ACCESSOR(bank4ctl,bank[4],mce_restart())
+ACCESSOR(bank5ctl,bank[5],mce_restart())
+
+/*
+ * sysfs show: emit the trigger path followed by a newline and return
+ * the number of characters written.
+ */
+static ssize_t show_trigger(struct sys_device *s, char *buf)
+{
+       return sprintf(buf, "%s\n", trigger);
+}
+
+/*
+ * sysfs store: set the trigger program path.
+ *
+ * Copies at most sizeof(trigger)-1 characters from @buf, always
+ * NUL-terminates, and cuts the string at the first newline if there is
+ * one.  Returns the length of the stored string measured before the
+ * newline is stripped.
+ */
+static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+{
+       char *p;
+       int len;
+
+       strncpy(trigger, buf, sizeof(trigger));
+       /* strncpy() does not terminate when the source fills the buffer */
+       trigger[sizeof(trigger)-1] = 0;
+       len = strlen(trigger);
+
+       /*
+        * strchr() returns NULL when the input has no newline (e.g. written
+        * with "echo -n"), so test the pointer itself, not what it points at.
+        */
+       p = strchr(trigger, '\n');
+       if (p)
+               *p = 0;
+       return len;
+}
+
+static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
+/* Changing 'tolerant' needs no restart; changing check_interval re-arms the poller */
+ACCESSOR(tolerant,tolerant,)
+ACCESSOR(check_interval,check_interval,mce_restart())
+/* NULL-terminated list of the attribute files created for each CPU's device */
+static struct sysdev_attribute *mce_attributes[] = {
+       &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
+       &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
+       &attr_tolerant, &attr_check_interval, &attr_trigger,
+       NULL
+};
+
+/*
+ * Per cpu sysdev init.  All of the cpus still share the same ctl bank.
+ *
+ * Registers the CPU's sys_device and creates its attribute files.
+ * Returns -EIO when mce_available() is false for that CPU, otherwise
+ * the result of sysdev_register().  Failures of individual attribute
+ * files are not reported.
+ */
+static __cpuinit int mce_create_device(unsigned int cpu)
+{
+       struct sys_device *dev;
+       int err;
+       int i;
+
+       if (!mce_available(&cpu_data[cpu]))
+               return -EIO;
+
+       dev = &per_cpu(device_mce, cpu);
+       dev->id = cpu;
+       dev->cls = &mce_sysclass;
+
+       err = sysdev_register(dev);
+       if (err)
+               return err;
+
+       for (i = 0; mce_attributes[i]; i++)
+               sysdev_create_file(dev, mce_attributes[i]);
+
+       return 0;
+}
+
+/*
+ * Undo mce_create_device() for @cpu: remove the attribute files,
+ * unregister the sys_device and zero its kobject.
+ */
+static void mce_remove_device(unsigned int cpu)
+{
+       struct sys_device *dev = &per_cpu(device_mce, cpu);
+       int i;
+
+       for (i = 0; mce_attributes[i]; i++)
+               sysdev_remove_file(dev, mce_attributes[i]);
+
+       sysdev_unregister(dev);
+       memset(&dev->kobj, 0, sizeof(struct kobject));
+}
+
+/*
+ * CPU hotplug notifier: create the sysdev when a CPU comes online and
+ * remove it when the CPU is dead.  Other actions are ignored.
+ */
+static int
+mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+
+       if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
+               mce_create_device(cpu);
+       else if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+               mce_remove_device(cpu);
+
+       return NOTIFY_OK;
+}
+
+/* Notifier block registered with register_hotcpu_notifier() in mce_init_device() */
+static struct notifier_block mce_cpu_notifier = {
+       .notifier_call = mce_cpu_callback,
+};
+
+/*
+ * Device initcall: register the "machinecheck" sysdev class, create the
+ * per-CPU devices for all online CPUs, hook CPU hotplug and register
+ * the log misc device.
+ *
+ * Returns -EIO if mce_available() is false for the boot CPU, the error
+ * from sysdev_class_register() if the class cannot be registered, and 0
+ * otherwise.
+ */
+static __init int mce_init_device(void)
+{
+       int err;
+       int i = 0;
+
+       if (!mce_available(&boot_cpu_data))
+               return -EIO;
+
+       /*
+        * Without the class nothing below can work: the per-CPU devices
+        * point at mce_sysclass.  Stop here instead of registering
+        * devices against an unregistered class.
+        */
+       err = sysdev_class_register(&mce_sysclass);
+       if (err)
+               return err;
+
+       for_each_online_cpu(i) {
+               mce_create_device(i);
+       }
+
+       register_hotcpu_notifier(&mce_cpu_notifier);
+       misc_register(&mce_log_device);
+       return err;
+}
+
+device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/mce_amd_64.c b/arch/x86/kernel/mce_amd_64.c
new file mode 100644 (file)
index 0000000..2f8a7f1
--- /dev/null
@@ -0,0 +1,689 @@
+/*
+ *  (c) 2005, 2006 Advanced Micro Devices, Inc.
+ *  Your use of this code is subject to the terms and conditions of the
+ *  GNU general public license version 2. See "COPYING" or
+ *  http://www.gnu.org/licenses/gpl.html
+ *
+ *  Written by Jacob Shin - AMD, Inc.
+ *
+ *  Support : jacob.shin@amd.com
+ *
+ *  April 2006
+ *     - added support for AMD Family 0x10 processors
+ *
+ *  All MC4_MISCi registers are shared between multi-cores
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kobject.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/sysdev.h>
+#include <linux/sysfs.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/percpu.h>
+#include <asm/idle.h>
+
+#define PFX               "mce_threshold: "
+#define VERSION           "version 1.1.1"
+/* Loop bounds used when walking banks and the blocks within a bank */
+#define NR_BANKS          6
+#define NR_BLOCKS         9
+/* Largest threshold limit; also used as the mask for the error count field */
+#define THRESHOLD_MAX     0xFFF
+#define INT_TYPE_APIC     0x00020000
+/* *_HI masks apply to the high 32 bits read from a block's MISC MSR */
+#define MASK_VALID_HI     0x80000000
+#define MASK_CNTP_HI      0x40000000
+#define MASK_LOCKED_HI    0x20000000
+#define MASK_LVTOFF_HI    0x00F00000
+#define MASK_COUNT_EN_HI  0x00080000
+#define MASK_INT_TYPE_HI  0x00060000
+#define MASK_OVERFLOW_HI  0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+/* Applied to the low 32 bits; locates the MSRs of blocks 1 and up */
+#define MASK_BLKPTR_LO    0xFF000000
+/* Base added to the block pointer to form the MSR address of block 1 */
+#define MCG_XBLK_ADDR     0xC0000400
+
+/* State of one threshold counter block, exposed through sysfs via kobj */
+struct threshold_block {
+       unsigned int block;             /* block index within the bank */
+       unsigned int bank;              /* bank index */
+       unsigned int cpu;               /* CPU the MSR accesses are pinned to */
+       u32 address;                    /* MSR address of this block's MISC register */
+       u16 interrupt_enable;           /* non-zero: APIC interrupt type is programmed */
+       u16 threshold_limit;            /* limit, at most THRESHOLD_MAX */
+       struct kobject kobj;            /* sysfs object ("misc<block>") */
+       struct list_head miscj;         /* links the blocks of one bank */
+};
+
+/* defaults used early on boot: interrupt off, limit at its maximum.
+   mce_amd_feature_init() fills in .address before each use. */
+static struct threshold_block threshold_defaults = {
+       .interrupt_enable = 0,
+       .threshold_limit = THRESHOLD_MAX,
+};
+
+/* One bank's sysfs object plus the head of its list of blocks */
+struct threshold_bank {
+       struct kobject kobj;
+       struct threshold_block *blocks; /* first block; others hang off its miscj list */
+       cpumask_t cpus;
+};
+/* Per CPU: one threshold_bank pointer for each of the NR_BANKS banks */
+static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+
+#ifdef CONFIG_SMP
+/* Non-zero marks a bank as shared between cores; only bank 4 is marked */
+static unsigned char shared_bank[NR_BANKS] = {
+       0, 0, 0, 0, 1
+};
+#endif
+
+/* Per CPU bitmask, bit N set by mce_amd_feature_init() for a usable bank N */
+static DEFINE_PER_CPU(unsigned char, bank_map);        /* see which banks are on */
+
+/*
+ * CPU Initialization
+ */
+
+/*
+ * Reprogram the MISC MSR of one threshold block from the settings in @b.
+ * Must be called with correct cpu affinity.
+ *
+ * @reset:     non-zero to reset the error count and the overflow bit
+ * @old_limit: previous limit; when non-zero (and no reset happens) the
+ *             count is shifted to match the new limit
+ */
+static void threshold_restart_bank(struct threshold_block *b,
+                                  int reset, u16 old_limit)
+{
+       u32 hi, lo;
+
+       rdmsr(b->address, lo, hi);
+
+       /* limit cannot be lower than err count */
+       if ((hi & THRESHOLD_MAX) > b->threshold_limit)
+               reset = 1;
+
+       if (reset) {
+               /* reset err count and overflow bit */
+               hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI);
+               hi |= THRESHOLD_MAX - b->threshold_limit;
+       } else if (old_limit) {
+               /* change limit w/o reset */
+               int shifted = (hi & THRESHOLD_MAX) +
+                   (old_limit - b->threshold_limit);
+
+               hi &= ~MASK_ERR_COUNT_HI;
+               hi |= shifted & THRESHOLD_MAX;
+       }
+
+       /* Interrupt type: APIC when enabled, none otherwise */
+       hi &= ~MASK_INT_TYPE_HI;
+       if (b->interrupt_enable)
+               hi |= INT_TYPE_APIC;
+
+       hi |= MASK_COUNT_EN_HI;
+       wrmsr(b->address, lo, hi);
+}
+
+/* cpu init entry point, called from mce.c with preempt off */
+/*
+ * Walks every bank and every block within it, and for each usable
+ * threshold block programs the LVT offset, sets up the extended APIC
+ * LVT entry and applies threshold_defaults.
+ */
+void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+       unsigned int bank, block;
+       unsigned int cpu = smp_processor_id();
+       u32 low = 0, high = 0, address = 0;
+
+       for (bank = 0; bank < NR_BANKS; ++bank) {
+               for (block = 0; block < NR_BLOCKS; ++block) {
+                       /*
+                        * Block 0 is the bank's MISC MSR.  Block 1 is found
+                        * through the block pointer in the low word read
+                        * for block 0 (0 means no more blocks).  Later
+                        * blocks follow at consecutive MSR addresses.
+                        */
+                       if (block == 0)
+                               address = MSR_IA32_MC0_MISC + bank * 4;
+                       else if (block == 1) {
+                               address = (low & MASK_BLKPTR_LO) >> 21;
+                               if (!address)
+                                       break;
+                               address += MCG_XBLK_ADDR;
+                       }
+                       else
+                               ++address;
+
+                       /* An MSR that cannot be read ends this bank */
+                       if (rdmsr_safe(address, &low, &high))
+                               break;
+
+                       /* Invalid block 0 ends the bank; later ones are skipped */
+                       if (!(high & MASK_VALID_HI)) {
+                               if (block)
+                                       continue;
+                               else
+                                       break;
+                       }
+
+                       /* Skip blocks without the CNTP bit or with LOCKED set */
+                       if (!(high & MASK_CNTP_HI)  ||
+                            (high & MASK_LOCKED_HI))
+                               continue;
+
+                       /* Record the bank as usable once, on its first block */
+                       if (!block)
+                               per_cpu(bank_map, cpu) |= (1 << bank);
+#ifdef CONFIG_SMP
+                       /* Shared banks are programmed only where cpu_core_id is 0 */
+                       if (shared_bank[bank] && c->cpu_core_id)
+                               break;
+#endif
+                       /* Put the threshold LVT entry number into the LVT offset field */
+                       high &= ~MASK_LVTOFF_HI;
+                       high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
+                       wrmsr(address, low, high);
+
+                       setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
+                                               THRESHOLD_APIC_VECTOR,
+                                               K8_APIC_EXT_INT_MSG_FIX, 0);
+
+                       /* Apply the boot defaults to this block's MSR */
+                       threshold_defaults.address = address;
+                       threshold_restart_bank(&threshold_defaults, 0, 0);
+               }
+       }
+}
+
+/*
+ * APIC Interrupt Handler
+ */
+
+/*
+ * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
+ * the interrupt goes off when error_count reaches threshold_limit.
+ * the handler will simply log mcelog w/ software defined bank number.
+ */
+/*
+ * Walks the banks recorded in this CPU's bank_map, block by block, with
+ * the same addressing scheme as mce_amd_feature_init().  The first block
+ * found with its overflow bit set is logged and the walk stops.
+ */
+asmlinkage void mce_threshold_interrupt(void)
+{
+       unsigned int bank, block;
+       struct mce m;
+       u32 low = 0, high = 0, address = 0;
+
+       ack_APIC_irq();
+       exit_idle();
+       irq_enter();
+
+       /* Prepare a zeroed record stamped with the current TSC and CPU */
+       memset(&m, 0, sizeof(m));
+       rdtscll(m.tsc);
+       m.cpu = smp_processor_id();
+
+       /* assume first bank caused it */
+       for (bank = 0; bank < NR_BANKS; ++bank) {
+               if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
+                       continue;
+               for (block = 0; block < NR_BLOCKS; ++block) {
+                       /* Same block addressing as in mce_amd_feature_init() */
+                       if (block == 0)
+                               address = MSR_IA32_MC0_MISC + bank * 4;
+                       else if (block == 1) {
+                               address = (low & MASK_BLKPTR_LO) >> 21;
+                               if (!address)
+                                       break;
+                               address += MCG_XBLK_ADDR;
+                       }
+                       else
+                               ++address;
+
+                       if (rdmsr_safe(address, &low, &high))
+                               break;
+
+                       if (!(high & MASK_VALID_HI)) {
+                               if (block)
+                                       continue;
+                               else
+                                       break;
+                       }
+
+                       if (!(high & MASK_CNTP_HI)  ||
+                            (high & MASK_LOCKED_HI))
+                               continue;
+
+                       /* Log the machine check that caused the threshold
+                          event. */
+                       do_machine_check(NULL, 0);
+
+                       /* Overflow set: this block's counter hit its limit */
+                       if (high & MASK_OVERFLOW_HI) {
+                               rdmsrl(address, m.misc);
+                               rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
+                                      m.status);
+                               /* Software-defined bank number encoding bank and block */
+                               m.bank = K8_MCE_THRESHOLD_BASE
+                                      + bank * NR_BLOCKS
+                                      + block;
+                               mce_log(&m);
+                               goto out;
+                       }
+               }
+       }
+out:
+       irq_exit();
+}
+
+/*
+ * Sysfs Interface
+ */
+
+/* A sysfs attribute whose handlers receive the threshold_block directly */
+struct threshold_attr {
+       struct attribute attr;
+       ssize_t(*show) (struct threshold_block *, char *);
+       ssize_t(*store) (struct threshold_block *, const char *, size_t count);
+};
+
+/*
+ * Restrict the current task to @cpu and return the mask it was allowed
+ * to run on before, for use with affinity_restore().
+ */
+static cpumask_t affinity_set(unsigned int cpu)
+{
+       cpumask_t saved = current->cpus_allowed;
+       cpumask_t only_cpu = CPU_MASK_NONE;
+
+       cpu_set(cpu, only_cpu);
+       set_cpus_allowed(current, only_cpu);
+
+       return saved;
+}
+
+/* Give the current task back the CPU mask saved by affinity_set(). */
+static void affinity_restore(cpumask_t oldmask)
+{
+       set_cpus_allowed(current, oldmask);
+}
+
+/* Generates show_<name>(), which prints the block's <name> field in hex */
+#define SHOW_FIELDS(name)                                           \
+static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
+{                                                                   \
+        return sprintf(buf, "%lx\n", (unsigned long) b->name);      \
+}
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+/*
+ * sysfs store for interrupt_enable: any non-zero number enables, zero
+ * disables.  Returns the number of characters parsed, or -EINVAL if
+ * the input does not start with a number.
+ */
+static ssize_t store_interrupt_enable(struct threshold_block *b,
+                                     const char *buf, size_t count)
+{
+       cpumask_t saved;
+       char *endp;
+       unsigned long val;
+
+       val = simple_strtoul(buf, &endp, 0);
+       if (endp == buf)
+               return -EINVAL;
+
+       b->interrupt_enable = val ? 1 : 0;
+
+       /* The MSR must be written on the CPU that owns the block */
+       saved = affinity_set(b->cpu);
+       threshold_restart_bank(b, 0, 0);
+       affinity_restore(saved);
+
+       return endp - buf;
+}
+
+/*
+ * sysfs store for threshold_limit.  The value is clamped to the range
+ * 1..THRESHOLD_MAX and the hardware counter is adjusted without a reset.
+ * Returns the number of characters parsed, or -EINVAL if the input does
+ * not start with a number.
+ */
+static ssize_t store_threshold_limit(struct threshold_block *b,
+                                    const char *buf, size_t count)
+{
+       cpumask_t saved;
+       char *endp;
+       unsigned long val;
+       u16 prev_limit;
+
+       val = simple_strtoul(buf, &endp, 0);
+       if (endp == buf)
+               return -EINVAL;
+
+       if (val < 1)
+               val = 1;
+       else if (val > THRESHOLD_MAX)
+               val = THRESHOLD_MAX;
+
+       prev_limit = b->threshold_limit;
+       b->threshold_limit = val;
+
+       /* The MSR must be written on the CPU that owns the block */
+       saved = affinity_set(b->cpu);
+       threshold_restart_bank(b, 0, prev_limit);
+       affinity_restore(saved);
+
+       return endp - buf;
+}
+
+/*
+ * sysfs show for error_count: read the block's MSR on its own CPU and
+ * print, in hex, the raw counter minus the (THRESHOLD_MAX - limit)
+ * bias.
+ */
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+       cpumask_t saved;
+       u32 lo, hi;
+       u32 errors;
+
+       saved = affinity_set(b->cpu);
+       rdmsr(b->address, lo, hi);
+       affinity_restore(saved);
+
+       errors = (hi & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+
+       return sprintf(buf, "%x\n", errors);
+}
+
+/*
+ * sysfs store for error_count: any write resets the counter; the
+ * written value itself is never looked at.
+ * NOTE(review): returns 1 rather than @count, i.e. it reports only one
+ * byte as consumed - confirm this is intended.
+ */
+static ssize_t store_error_count(struct threshold_block *b,
+                                const char *buf, size_t count)
+{
+       cpumask_t oldmask;
+       /* The MSR must be written on the CPU that owns the block */
+       oldmask = affinity_set(b->cpu);
+       threshold_restart_bank(b, 1, 0);
+       affinity_restore(oldmask);
+       return 1;
+}
+
+/*
+ * Initializer for a struct threshold_attr.
+ * NOTE(review): the expansion already ends in ';', so RW_ATTR(x); below
+ * yields a doubled semicolon at file scope.
+ */
+#define THRESHOLD_ATTR(_name,_mode,_show,_store) {            \
+        .attr = {.name = __stringify(_name), .mode = _mode }, \
+        .show = _show,                                        \
+        .store = _store,                                      \
+};
+
+/* Defines attribute 'name' (mode 0644) wired to show_<name>/store_<name> */
+#define RW_ATTR(name)                                           \
+static struct threshold_attr name =                             \
+        THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
+
+RW_ATTR(interrupt_enable);
+RW_ATTR(threshold_limit);
+RW_ATTR(error_count);
+
+/* NULL-terminated attribute list referenced by threshold_ktype */
+static struct attribute *default_attrs[] = {
+       &interrupt_enable.attr,
+       &threshold_limit.attr,
+       &error_count.attr,
+       NULL
+};
+
+/* Recover the enclosing threshold_block / threshold_attr from its member */
+#define to_block(k) container_of(k, struct threshold_block, kobj)
+#define to_attr(a) container_of(a, struct threshold_attr, attr)
+
+/*
+ * sysfs_ops show: forward to the attribute's own show handler, or
+ * return -EIO when the attribute has none.
+ */
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+       struct threshold_block *blk = to_block(kobj);
+       struct threshold_attr *tattr = to_attr(attr);
+
+       if (!tattr->show)
+               return -EIO;
+
+       return tattr->show(blk, buf);
+}
+
+/*
+ * sysfs_ops store: forward to the attribute's own store handler, or
+ * return -EIO when the attribute has none.
+ */
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+                    const char *buf, size_t count)
+{
+       struct threshold_block *blk = to_block(kobj);
+       struct threshold_attr *tattr = to_attr(attr);
+
+       if (!tattr->store)
+               return -EIO;
+
+       return tattr->store(blk, buf, count);
+}
+
+/* Routes sysfs reads/writes on threshold blocks to show()/store() above */
+static struct sysfs_ops threshold_ops = {
+       .show = show,
+       .store = store,
+};
+
+/* kobject type assigned to every threshold block's kobj */
+static struct kobj_type threshold_ktype = {
+       .sysfs_ops = &threshold_ops,
+       .default_attrs = default_attrs,
+};
+
+/*
+ * Create the threshold block found at MSR "address" for (cpu, bank) and then
+ * recurse to the next block of the same bank.
+ *
+ * @cpu:     CPU owning the bank
+ * @bank:    bank index; nothing is done when >= NR_BANKS
+ * @block:   block index within the bank; nothing is done when >= NR_BLOCKS
+ * @address: MSR to read for this block
+ *
+ * A block is only instantiated when the high word has MASK_VALID_HI and
+ * MASK_CNTP_HI set and MASK_LOCKED_HI clear.  An invalid block 0 ends the
+ * walk; any other unusable block is skipped and the walk continues.
+ *
+ * Returns 0 on success (including "nothing to allocate"), -ENOMEM when the
+ * block cannot be allocated, or the error from kobject_register() or from a
+ * deeper recursion level.  On error each level releases the block it
+ * created itself.
+ */
+static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
+                                              unsigned int bank,
+                                              unsigned int block,
+                                              u32 address)
+{
+       int err;
+       u32 low, high;
+       struct threshold_block *b = NULL;
+
+       if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
+               return 0;
+
+       if (rdmsr_safe(address, &low, &high))
+               return 0;
+
+       if (!(high & MASK_VALID_HI)) {
+               if (block)
+                       goto recurse;
+               else
+                       return 0;
+       }
+
+       if (!(high & MASK_CNTP_HI)  ||
+            (high & MASK_LOCKED_HI))
+               goto recurse;
+
+       b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
+       if (!b)
+               return -ENOMEM;
+
+       b->block = block;
+       b->bank = bank;
+       b->cpu = cpu;
+       b->address = address;
+       b->interrupt_enable = 0;
+       b->threshold_limit = THRESHOLD_MAX;
+
+       INIT_LIST_HEAD(&b->miscj);
+
+       /* the first block created becomes the list head of the bank */
+       if (per_cpu(threshold_banks, cpu)[bank]->blocks)
+               list_add(&b->miscj,
+                        &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
+       else
+               per_cpu(threshold_banks, cpu)[bank]->blocks = b;
+
+       kobject_set_name(&b->kobj, "misc%i", block);
+       b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj;
+       b->kobj.ktype = &threshold_ktype;
+       err = kobject_register(&b->kobj);
+       if (err)
+               goto out_free;
+recurse:
+       if (!block) {
+               /*
+                * Block 0 carries a pointer to the extended blocks in its
+                * low word; a zero pointer means there are none.
+                */
+               address = (low & MASK_BLKPTR_LO) >> 21;
+               if (!address)
+                       return 0;
+               address += MCG_XBLK_ADDR;
+       } else
+               ++address;
+
+       err = allocate_threshold_blocks(cpu, bank, ++block, address);
+       if (err)
+               goto out_free;
+
+       return err;
+
+out_free:
+       if (b) {
+               struct threshold_bank *tb = per_cpu(threshold_banks, cpu)[bank];
+
+               kobject_unregister(&b->kobj);
+
+               /*
+                * Unlink b before freeing it so that neither the bank's
+                * head pointer nor a neighbour's list links keep pointing
+                * at freed memory.  Deeper recursion levels have already
+                * unlinked their own blocks by the time we get here, so a
+                * head block has no remaining list members.
+                */
+               if (tb->blocks == b)
+                       tb->blocks = NULL;
+               else
+                       list_del(&b->miscj);
+
+               kfree(b);
+       }
+       return err;
+}
+
+/* symlinks sibling shared banks to first core.  first core owns dir/files. */
+/*
+ * Create the sysfs representation of threshold bank "bank" for "cpu".
+ *
+ * On SMP, a non-first core of a shared bank only receives a symlink to the
+ * bank owned by the first core of its core map (and silently does nothing if
+ * that core has not set the bank up yet, or if the link already exists).
+ * Otherwise a new threshold_bank is allocated and registered, its blocks are
+ * created with the affinity set to "cpu", and every other CPU in b->cpus is
+ * given a symlink to it.
+ *
+ * Returns 0 on success, -ENOMEM when the bank cannot be allocated, or the
+ * error from kobject_register(), allocate_threshold_blocks() or
+ * sysfs_create_link().
+ */
+static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
+{
+       int i, err = 0;
+       struct threshold_bank *b = NULL;
+       cpumask_t oldmask = CPU_MASK_NONE;
+       char name[32];
+
+       sprintf(name, "threshold_bank%i", bank);
+
+#ifdef CONFIG_SMP
+       if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) {   /* symlink */
+               i = first_cpu(cpu_core_map[cpu]);
+
+               /* first core not up yet */
+               if (cpu_data[i].cpu_core_id)
+                       goto out;
+
+               /* already linked */
+               if (per_cpu(threshold_banks, cpu)[bank])
+                       goto out;
+
+               b = per_cpu(threshold_banks, i)[bank];
+
+               if (!b)
+                       goto out;
+
+               err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
+                                       &b->kobj, name);
+               if (err)
+                       goto out;
+
+               b->cpus = cpu_core_map[cpu];
+               per_cpu(threshold_banks, cpu)[bank] = b;
+               goto out;
+       }
+#endif
+
+       b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+       if (!b) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       kobject_set_name(&b->kobj, "threshold_bank%i", bank);
+       b->kobj.parent = &per_cpu(device_mce, cpu).kobj;
+#ifndef CONFIG_SMP
+       b->cpus = CPU_MASK_ALL;
+#else
+       b->cpus = cpu_core_map[cpu];
+#endif
+       err = kobject_register(&b->kobj);
+       if (err)
+               goto out_free;
+
+       per_cpu(threshold_banks, cpu)[bank] = b;
+
+       oldmask = affinity_set(cpu);
+       err = allocate_threshold_blocks(cpu, bank, 0,
+                                       MSR_IA32_MC0_MISC + bank * 4);
+       affinity_restore(oldmask);
+
+       /*
+        * The bank kobject is registered at this point, so it has to be
+        * unregistered before its memory is released (same sequence as in
+        * threshold_remove_bank()).
+        */
+       if (err)
+               goto out_unregister;
+
+       for_each_cpu_mask(i, b->cpus) {
+               if (i == cpu)
+                       continue;
+
+               err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
+                                       &b->kobj, name);
+               if (err)
+                       goto out;
+
+               per_cpu(threshold_banks, i)[bank] = b;
+       }
+
+       goto out;
+
+out_unregister:
+       kobject_unregister(&b->kobj);
+out_free:
+       per_cpu(threshold_banks, cpu)[bank] = NULL;
+       kfree(b);
+out:
+       return err;
+}
+
+/* create dir/files for all valid threshold banks */
+/*
+ * Walk all NR_BANKS banks and create every bank whose bit is set in this
+ * CPU's bank_map.  Stops at the first failure and returns its error code;
+ * returns 0 when all selected banks were created (or none was selected).
+ */
+static __cpuinit int threshold_create_device(unsigned int cpu)
+{
+       unsigned int bank;
+       int ret = 0;
+
+       for (bank = 0; !ret && bank < NR_BANKS; ++bank)
+               if (per_cpu(bank_map, cpu) & (1 << bank))
+                       ret = threshold_create_bank(cpu, bank);
+
+       return ret;
+}
+
+/*
+ * let's be hotplug friendly.
+ * in case of multiple core processors, the first core always takes ownership
+ *   of shared sysfs dir/files, and rest of the cores will be symlinked to it.
+ */
+
+/*
+ * Release all threshold blocks of (cpu, bank).
+ *
+ * Every block linked behind the bank's head block is unregistered, unlinked
+ * and freed; the head block itself is then freed and the bank's blocks
+ * pointer cleared.  Does nothing when the bank does not exist.  The caller
+ * must make sure the bank actually has a head block.
+ */
+static void deallocate_threshold_block(unsigned int cpu,
+                                                unsigned int bank)
+{
+       struct threshold_bank *tb = per_cpu(threshold_banks, cpu)[bank];
+       struct threshold_block *blk, *next;
+
+       if (tb == NULL)
+               return;
+
+       list_for_each_entry_safe(blk, next, &tb->blocks->miscj, miscj) {
+               kobject_unregister(&blk->kobj);
+               list_del(&blk->miscj);
+               kfree(blk);
+       }
+
+       kfree(tb->blocks);
+       tb->blocks = NULL;
+}
+
+/*
+ * Tear down threshold bank "bank" as seen from "cpu".
+ *
+ * - No bank recorded for this CPU: nothing to do.
+ * - Bank without blocks: only the bank kobject is unregistered and freed.
+ * - SMP, shared bank whose first block belongs to another CPU: this CPU only
+ *   holds a symlink, so just the link and the per-CPU pointer are removed.
+ * - Otherwise: remove the symlinks of all other CPUs in b->cpus, free the
+ *   blocks, then unregister and free the bank itself.
+ */
+static void threshold_remove_bank(unsigned int cpu, int bank)
+{
+       int i = 0;
+       struct threshold_bank *b;
+       char name[32];
+
+       b = per_cpu(threshold_banks, cpu)[bank];
+
+       if (!b)
+               return;
+
+       if (!b->blocks)
+               goto free_out;
+
+       /* name of the sysfs links created in threshold_create_bank() */
+       sprintf(name, "threshold_bank%i", bank);
+
+#ifdef CONFIG_SMP
+       /* sibling symlink */
+       if (shared_bank[bank] && b->blocks->cpu != cpu) {
+               sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
+               per_cpu(threshold_banks, cpu)[bank] = NULL;
+               return;
+       }
+#endif
+
+       /* remove all sibling symlinks before unregistering */
+       for_each_cpu_mask(i, b->cpus) {
+               if (i == cpu)
+                       continue;
+
+               sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
+               per_cpu(threshold_banks, i)[bank] = NULL;
+       }
+
+       deallocate_threshold_block(cpu, bank);
+
+free_out:
+       kobject_unregister(&b->kobj);
+       kfree(b);
+       per_cpu(threshold_banks, cpu)[bank] = NULL;
+}
+
+/* Remove every bank whose bit is set in this CPU's bank_map. */
+static void threshold_remove_device(unsigned int cpu)
+{
+       unsigned int bank;
+
+       for (bank = 0; bank < NR_BANKS; ++bank)
+               if (per_cpu(bank_map, cpu) & (1 << bank))
+                       threshold_remove_bank(cpu, bank);
+}
+
+/* get notified when a cpu comes on/off */
+/*
+ * CPU hotplug notifier: create the threshold devices when a CPU comes
+ * online and remove them when it is dead.  All other actions, and CPU
+ * numbers >= NR_CPUS, are ignored.  Always returns NOTIFY_OK; the result of
+ * threshold_create_device() is not propagated.
+ */
+static int threshold_cpu_callback(struct notifier_block *nfb,
+                                           unsigned long action, void *hcpu)
+{
+       /* cpu was unsigned int to begin with */
+       unsigned int cpu = (unsigned long)hcpu;
+
+       if (cpu >= NR_CPUS)
+               return NOTIFY_OK;
+
+       if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
+               threshold_create_device(cpu);
+       else if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+               threshold_remove_device(cpu);
+
+       return NOTIFY_OK;
+}
+
+/* Hotplug notifier block; registered by threshold_init_device(). */
+static struct notifier_block threshold_cpu_notifier = {
+       .notifier_call = threshold_cpu_callback,
+};
+
+/*
+ * Initcall: create the threshold devices for all CPUs that are already
+ * online, then register the hotplug notifier for later changes.
+ *
+ * Returns 0 on success.  If creation fails for a CPU its error code is
+ * returned immediately and the notifier is not registered.
+ */
+static __init int threshold_init_device(void)
+{
+       unsigned cpu_idx = 0;
+       int ret;
+
+       /* to hit CPUs online before the notifier is up */
+       for_each_online_cpu(cpu_idx) {
+               ret = threshold_create_device(cpu_idx);
+               if (ret != 0)
+                       return ret;
+       }
+
+       register_hotcpu_notifier(&threshold_cpu_notifier);
+
+       return 0;
+}
+
+device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/mce_intel_64.c b/arch/x86/kernel/mce_intel_64.c
new file mode 100644 (file)
index 0000000..6551505
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * Intel specific MCE features.
+ * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+#include <asm/hw_irq.h>
+#include <asm/idle.h>
+#include <asm/therm_throt.h>
+
+/*
+ * Thermal interrupt handler.
+ *
+ * Acknowledges the APIC interrupt, reads MSR_IA32_THERM_STATUS and hands
+ * bit 0 of it to therm_throt_process().  When that returns non-zero the
+ * event is logged through mce_log_therm_throt_event() together with the
+ * current CPU number and the full MSR value.
+ */
+asmlinkage void smp_thermal_interrupt(void)
+{
+       __u64 msr_val;
+
+       ack_APIC_irq();
+
+       exit_idle();
+       irq_enter();
+
+       rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+       /* only bit 0 of the status word is passed on as the current state */
+       if (therm_throt_process(msr_val & 1))
+               mce_log_therm_throt_event(smp_processor_id(), msr_val);
+
+       irq_exit();
+}
+
+/*
+ * Set up thermal monitoring on the current CPU.
+ *
+ * Bails out when the CPU lacks X86_FEATURE_ACPI or X86_FEATURE_ACC, when
+ * thermal monitoring is already routed through SMI, or when the thermal LVT
+ * entry already has a vector installed.  Otherwise installs
+ * THERMAL_APIC_VECTOR in the thermal LVT entry, sets the enable bits in
+ * MSR_IA32_THERM_INTERRUPT and MSR_IA32_MISC_ENABLE, unmasks the LVT entry
+ * and switches on thermal throttle processing.
+ */
+static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
+{
+       u32 l, h;
+       int tm2 = 0;
+       unsigned int cpu = smp_processor_id();
+
+       if (!cpu_has(c, X86_FEATURE_ACPI))
+               return;
+
+       if (!cpu_has(c, X86_FEATURE_ACC))
+               return;
+
+       /* first check if TM1 is already enabled by the BIOS, in which
+        * case there might be some SMM goo which handles it, so we can't even
+        * put a handler since it might be delivered via SMI already.
+        */
+       rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+       /* h is reused: from here on it holds the thermal LVT entry */
+       h = apic_read(APIC_LVTTHMR);
+       if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
+               printk(KERN_DEBUG
+                      "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+               return;
+       }
+
+       /* tm2 only selects the name printed in the message below */
+       if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
+               tm2 = 1;
+
+       if (h & APIC_VECTOR_MASK) {
+               printk(KERN_DEBUG
+                      "CPU%d: Thermal LVT vector (%#x) already "
+                      "installed\n", cpu, (h & APIC_VECTOR_MASK));
+               return;
+       }
+
+       /* install our vector, but keep the entry masked while configuring */
+       h = THERMAL_APIC_VECTOR;
+       h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
+       apic_write(APIC_LVTTHMR, h);
+
+       /*
+        * Set the two lowest bits of the thermal interrupt MSR.
+        * NOTE(review): presumably the high/low temperature interrupt
+        * enables -- confirm against the Intel SDM.
+        */
+       rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+       wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
+
+       /* set bit 3 of MISC_ENABLE (the bit tested as "TM1 enabled" above) */
+       rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+       wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
+
+       /* everything is configured: unmask the LVT entry */
+       l = apic_read(APIC_LVTTHMR);
+       apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+       printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
+               cpu, tm2 ? "TM2" : "TM1");
+
+       /* enable thermal throttle processing */
+       atomic_set(&therm_throt_en, 1);
+       return;
+}
+
+/*
+ * Entry point for the Intel specific MCE features; currently this only sets
+ * up thermal monitoring for the CPU described by c.
+ */
+void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+       intel_init_thermal(c);
+}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
new file mode 100644 (file)
index 0000000..a888e67
--- /dev/null
@@ -0,0 +1,185 @@
+/*  Kernel module help for x86-64
+    Copyright (C) 2001 Rusty Russell.
+    Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <linux/moduleloader.h>
+#include <linux/elf.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/bug.h>
+
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+/* Debug output is compiled out: DEBUGP() expands to nothing. */
+#define DEBUGP(fmt...) 
+
+#ifndef CONFIG_UML
+/*
+ * Release a region obtained from module_alloc().  mod is not used here;
+ * the region is simply handed to vfree().
+ */
+void module_free(struct module *mod, void *module_region)
+{
+       vfree(module_region);
+       /* FIXME: If module_region == mod->init_region, trim exception
+           table entries. */
+}
+
+/*
+ * Allocate executable memory for a module inside the
+ * MODULES_VADDR..MODULES_END window.
+ *
+ * The request is rounded up to whole pages.  Returns NULL for a zero-sized
+ * request, for a rounded size larger than MODULES_LEN, or when no virtual
+ * area can be reserved; otherwise returns the result of __vmalloc_area().
+ */
+void *module_alloc(unsigned long size)
+{
+       struct vm_struct *vm;
+       unsigned long aligned;
+
+       if (size == 0)
+               return NULL;
+
+       aligned = PAGE_ALIGN(size);
+       if (aligned > MODULES_LEN)
+               return NULL;
+
+       vm = __get_vm_area(aligned, VM_ALLOC, MODULES_VADDR, MODULES_END);
+
+       return vm ? __vmalloc_area(vm, GFP_KERNEL, PAGE_KERNEL_EXEC) : NULL;
+}
+#endif
+
+/*
+ * Architecture hook for adjusting module sections.
+ * We don't need anything special: all arguments are ignored and 0 is
+ * returned.
+ */
+int module_frob_arch_sections(Elf_Ehdr *hdr,
+                             Elf_Shdr *sechdrs,
+                             char *secstrings,
+                             struct module *mod)
+{
+       return 0;
+}
+
+/*
+ * Apply all RELA relocations of section "relsec" to the section it refers
+ * to (sechdrs[relsec].sh_info).
+ *
+ * @sechdrs:  section header table of the module
+ * @strtab:   string table (unused here)
+ * @symindex: index of the symbol table section
+ * @relsec:   index of the relocation section to process
+ * @me:       module being loaded (used for diagnostics)
+ *
+ * Handles R_X86_64_NONE, _64, _32, _32S and _PC32.  Returns 0 on success and
+ * -ENOEXEC for an unknown relocation type or when a 32-bit relocation does
+ * not fit (the overflow check for _PC32 is compiled out).
+ */
+int apply_relocate_add(Elf64_Shdr *sechdrs,
+                  const char *strtab,
+                  unsigned int symindex,
+                  unsigned int relsec,
+                  struct module *me)
+{
+       unsigned int i;
+       Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
+       Elf64_Sym *sym;
+       void *loc;
+       u64 val; 
+
+       DEBUGP("Applying relocate section %u to %u\n", relsec,
+              sechdrs[relsec].sh_info);
+       for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+               /* This is where to make the change */
+               loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+                       + rel[i].r_offset;
+
+               /* This is the symbol it is referring to.  Note that all
+                  undefined symbols have been resolved.  */
+               sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+                       + ELF64_R_SYM(rel[i].r_info);
+
+               DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
+                      (int)ELF64_R_TYPE(rel[i].r_info), 
+                      sym->st_value, rel[i].r_addend, (u64)loc);
+
+               /* value to store: symbol value plus addend */
+               val = sym->st_value + rel[i].r_addend; 
+
+               switch (ELF64_R_TYPE(rel[i].r_info)) {
+               case R_X86_64_NONE:
+                       break;
+               case R_X86_64_64:
+                       *(u64 *)loc = val;
+                       break;
+               case R_X86_64_32:
+                       /* must survive zero-extension back to 64 bits */
+                       *(u32 *)loc = val;
+                       if (val != *(u32 *)loc)
+                               goto overflow;
+                       break;
+               case R_X86_64_32S:
+                       /* must survive sign-extension back to 64 bits */
+                       *(s32 *)loc = val;
+                       if ((s64)val != *(s32 *)loc)
+                               goto overflow;
+                       break;
+               case R_X86_64_PC32: 
+                       /* stored value is relative to the patched location */
+                       val -= (u64)loc;
+                       *(u32 *)loc = val;
+#if 0
+                       if ((s64)val != *(s32 *)loc)
+                               goto overflow; 
+#endif
+                       break;
+               default:
+                       printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
+                              me->name, ELF64_R_TYPE(rel[i].r_info));
+                       return -ENOEXEC;
+               }
+       }
+       return 0;
+
+overflow:
+       printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 
+              (int)ELF64_R_TYPE(rel[i].r_info), val);
+       printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
+              me->name);
+       return -ENOEXEC;
+}
+
+/*
+ * REL-style (non-addend) relocations are not supported here: all arguments
+ * are ignored, an error is logged and -ENOSYS is returned.
+ */
+int apply_relocate(Elf_Shdr *sechdrs,
+                  const char *strtab,
+                  unsigned int symindex,
+                  unsigned int relsec,
+                  struct module *me)
+{
+       /* log with an explicit level, like the other errors in this file */
+       printk(KERN_ERR "non add relocation not supported\n");
+       return -ENOSYS;
+}
+
+/*
+ * Final architecture specific fixups for a freshly loaded module.
+ *
+ * Looks up the ".text", ".altinstructions" and ".smp_locks" sections by
+ * name.  Alternatives are applied when ".altinstructions" exists; the SMP
+ * lock table is registered when both ".smp_locks" and ".text" exist.
+ * Returns the result of module_bug_finalize().
+ */
+int module_finalize(const Elf_Ehdr *hdr,
+                    const Elf_Shdr *sechdrs,
+                    struct module *me)
+{
+       const Elf_Shdr *sec;
+       const Elf_Shdr *text_sec = NULL, *alt_sec = NULL, *locks_sec = NULL;
+       char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+       for (sec = sechdrs; sec < sechdrs + hdr->e_shnum; sec++) {
+               const char *sec_name = secstrings + sec->sh_name;
+
+               if (strcmp(".text", sec_name) == 0)
+                       text_sec = sec;
+               if (strcmp(".altinstructions", sec_name) == 0)
+                       alt_sec = sec;
+               if (strcmp(".smp_locks", sec_name) == 0)
+                       locks_sec = sec;
+       }
+
+       if (alt_sec != NULL) {
+               /* patch .altinstructions */
+               void *alt_start = (void *)alt_sec->sh_addr;
+
+               apply_alternatives(alt_start, alt_start + alt_sec->sh_size);
+       }
+
+       if (locks_sec != NULL && text_sec != NULL) {
+               void *locks_start = (void *)locks_sec->sh_addr;
+               void *text_start = (void *)text_sec->sh_addr;
+
+               alternatives_smp_module_add(me, me->name,
+                                           locks_start,
+                                           locks_start + locks_sec->sh_size,
+                                           text_start,
+                                           text_start + text_sec->sh_size);
+       }
+
+       return module_bug_finalize(hdr, sechdrs, me);
+}
+
+/*
+ * Undo what module_finalize() registered: drop the module's SMP
+ * alternatives entry and its bug table.
+ */
+void module_arch_cleanup(struct module *mod)
+{
+       alternatives_smp_module_del(mod);
+       module_bug_cleanup(mod);
+}
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c
new file mode 100644 (file)
index 0000000..8bf0ca0
--- /dev/null
@@ -0,0 +1,852 @@
+/*
+ *     Intel Multiprocessor Specification 1.1 and 1.4
+ *     compliant MP-table parsing routines.
+ *
+ *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *     Fixes
+ *             Erich Boleyn    :       MP v1.4 and additional changes.
+ *             Alan Cox        :       Added EBDA scanning
+ *             Ingo Molnar     :       various cleanups and rewrites
+ *             Maciej W. Rozycki:      Bits for default MP configurations
+ *             Paul Diefenbaugh:       Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+
+/* Have we found an MP table */
+int smp_found_config;
+
+/*
+ * Various Linux-internal data structures created from the
+ * MP-table.
+ */
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+
+/* next PCI bus number handed out by MP_bus_info() */
+static int mp_current_pci_id = 0;
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+
+/* MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+/* # of I/O APIC entries stored in mp_ioapics[] */
+int nr_ioapics;
+unsigned long mp_lapic_addr = 0;
+
+
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_id = -1U;
+/* Internal processor count */
+unsigned int num_processors __cpuinitdata = 0;
+
+/* # of processor entries seen without CPU_ENABLED set */
+unsigned disabled_cpus __cpuinitdata;
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
+
+/* APIC id per logical CPU as filled in by MP_processor_info() */
+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+
+/*
+ * Intel MP BIOS table parsing routines:
+ */
+
+/*
+ * Checksum an MP configuration block: add up the len bytes starting at mp
+ * and return the low eight bits of the sum.  A valid block sums to zero.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+       int total = 0;
+
+       for (; len != 0; len--)
+               total += *mp++;
+
+       return total & 0xFF;
+}
+
+/*
+ * Record one processor entry of the MP table.
+ *
+ * Disabled processors are only counted in disabled_cpus.  An enabled
+ * processor is announced, marked in phys_cpu_present_map and assigned a
+ * logical CPU number: 0 for the boot processor, otherwise the first CPU
+ * not yet set in cpu_present_map.  Processors beyond NR_CPUS are ignored
+ * with a warning.
+ */
+static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
+{
+       int cpu;
+       cpumask_t tmp_map;
+       char *bootup_cpu = "";
+
+       if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+               disabled_cpus++;
+               return;
+       }
+       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+               bootup_cpu = " (Bootup-CPU)";
+               boot_cpu_id = m->mpc_apicid;
+       }
+
+       printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
+
+       if (num_processors >= NR_CPUS) {
+               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+                       " Processor ignored.\n", NR_CPUS);
+               return;
+       }
+
+       num_processors++;
+       /* pick the first logical CPU that is not present yet */
+       cpus_complement(tmp_map, cpu_present_map);
+       cpu = first_cpu(tmp_map);
+
+       physid_set(m->mpc_apicid, phys_cpu_present_map);
+       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+               /*
+                * bios_cpu_apicid is required to have processors listed
+                * in same order as logical cpu numbers. Hence the first
+                * entry is BSP, and so on.
+                */
+               cpu = 0;
+       }
+       bios_cpu_apicid[cpu] = m->mpc_apicid;
+       x86_cpu_to_apicid[cpu] = m->mpc_apicid;
+
+       cpu_set(cpu, cpu_possible_map);
+       cpu_set(cpu, cpu_present_map);
+}
+
+/*
+ * Record one bus entry of the MP table.
+ *
+ * An "ISA" bus is flagged in mp_bus_not_pci.  A "PCI" bus has that flag
+ * cleared and receives the next PCI bus number.  Any other bus type is
+ * reported as unknown and otherwise ignored.
+ */
+static void __init MP_bus_info (struct mpc_config_bus *m)
+{
+       char bustype[7];
+
+       /* the table field is not NUL terminated, so make a terminated copy */
+       memcpy(bustype, m->mpc_bustype, 6);
+       bustype[6] = 0;
+       Dprintk("Bus #%d is %s\n", m->mpc_busid, bustype);
+
+       if (!strncmp(bustype, "ISA", 3)) {
+               set_bit(m->mpc_busid, mp_bus_not_pci);
+               return;
+       }
+
+       if (!strncmp(bustype, "PCI", 3)) {
+               clear_bit(m->mpc_busid, mp_bus_not_pci);
+               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id++;
+               return;
+       }
+
+       printk(KERN_ERR "Unknown bustype %s\n", bustype);
+}
+
+/*
+ * Sanity check before another I/O APIC is recorded.
+ *
+ * Panics when the mp_ioapics[] table is already full.  Returns 1 (and logs
+ * a message) for a zero address, 0 when the address is acceptable.
+ */
+static int bad_ioapic(unsigned long address)
+{
+       if (nr_ioapics >= MAX_IO_APICS) {
+               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+       }
+
+       if (address)
+               return 0;
+
+       printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+               " found in table, skipping!\n");
+       return 1;
+}
+
+/*
+ * Record one I/O APIC entry of the MP table.  Entries without
+ * MPC_APIC_USABLE are skipped silently; entries rejected by bad_ioapic()
+ * are announced but not stored.
+ */
+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
+{
+       if (!(m->mpc_flags & MPC_APIC_USABLE))
+               return;
+
+       printk("I/O APIC #%d at 0x%X.\n",
+               m->mpc_apicid, m->mpc_apicaddr);
+
+       if (!bad_ioapic(m->mpc_apicaddr))
+               mp_ioapics[nr_ioapics++] = *m;
+}
+
+/*
+ * Append one interrupt source entry to mp_irqs[].  Panics as soon as the
+ * table becomes full, i.e. when the entry just stored was the last one that
+ * fits.
+ */
+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
+{
+       mp_irqs [mp_irq_entries] = *m;
+       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+                       m->mpc_irqtype, m->mpc_irqflag & 3,
+                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+       if (++mp_irq_entries >= MAX_IRQ_SOURCES)
+               panic("Max # of irq sources exceeded!!\n");
+}
+
+/*
+ * Local interrupt source entries are only printed through Dprintk();
+ * nothing is stored.
+ */
+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
+{
+       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+                       m->mpc_irqtype, m->mpc_irqflag & 3,
+                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
+                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+}
+
+/*
+ * Read/parse the MPC
+ */
+
+/*
+ * Validate and parse the MP configuration table at mpc.
+ *
+ * The table is rejected (return 0) when its signature or checksum is wrong,
+ * when the spec revision is neither 1 nor 4, or when the local APIC address
+ * is zero.  Otherwise every entry following the header is dispatched to the
+ * matching MP_*_info() helper; processor entries and the local APIC address
+ * are skipped when acpi_lapic is set.
+ *
+ * An entry of unknown type ends the walk: its length is unknown, so the
+ * parser cannot step over it.
+ *
+ * Returns the number of processors registered so far (num_processors).
+ */
+static int __init smp_read_mpc(struct mp_config_table *mpc)
+{
+       char str[16];
+       int count=sizeof(*mpc);
+       unsigned char *mpt=((unsigned char *)mpc)+count;
+
+       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
+               printk("MPTABLE: bad signature [%c%c%c%c]!\n",
+                       mpc->mpc_signature[0],
+                       mpc->mpc_signature[1],
+                       mpc->mpc_signature[2],
+                       mpc->mpc_signature[3]);
+               return 0;
+       }
+       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
+               printk("MPTABLE: checksum error!\n");
+               return 0;
+       }
+       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
+               printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
+                       mpc->mpc_spec);
+               return 0;
+       }
+       if (!mpc->mpc_lapic) {
+               printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+               return 0;
+       }
+       /* the id fields are not NUL terminated: copy and terminate them */
+       memcpy(str,mpc->mpc_oem,8);
+       str[8] = 0;
+       printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
+
+       memcpy(str,mpc->mpc_productid,12);
+       str[12] = 0;
+       printk("MPTABLE: Product ID: %s ",str);
+
+       printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
+
+       /* save the local APIC address, it might be non-default */
+       if (!acpi_lapic)
+               mp_lapic_addr = mpc->mpc_lapic;
+
+       /*
+        *      Now process the configuration blocks.
+        */
+       while (count < mpc->mpc_length) {
+               switch(*mpt) {
+                       case MP_PROCESSOR:
+                       {
+                               struct mpc_config_processor *m=
+                                       (struct mpc_config_processor *)mpt;
+                               if (!acpi_lapic)
+                                       MP_processor_info(m);
+                               mpt += sizeof(*m);
+                               count += sizeof(*m);
+                               break;
+                       }
+                       case MP_BUS:
+                       {
+                               struct mpc_config_bus *m=
+                                       (struct mpc_config_bus *)mpt;
+                               MP_bus_info(m);
+                               mpt += sizeof(*m);
+                               count += sizeof(*m);
+                               break;
+                       }
+                       case MP_IOAPIC:
+                       {
+                               struct mpc_config_ioapic *m=
+                                       (struct mpc_config_ioapic *)mpt;
+                               MP_ioapic_info(m);
+                               mpt += sizeof(*m);
+                               count += sizeof(*m);
+                               break;
+                       }
+                       case MP_INTSRC:
+                       {
+                               struct mpc_config_intsrc *m=
+                                       (struct mpc_config_intsrc *)mpt;
+
+                               MP_intsrc_info(m);
+                               mpt += sizeof(*m);
+                               count += sizeof(*m);
+                               break;
+                       }
+                       case MP_LINTSRC:
+                       {
+                               struct mpc_config_lintsrc *m=
+                                       (struct mpc_config_lintsrc *)mpt;
+                               MP_lintsrc_info(m);
+                               mpt += sizeof(*m);
+                               count += sizeof(*m);
+                               break;
+                       }
+                       default:
+                       {
+                               /*
+                                * Unknown entry type: its size is unknown, so
+                                * the cursor cannot be advanced.  Stop parsing
+                                * instead of looping on the same byte forever.
+                                */
+                               printk(KERN_ERR "MPTABLE: unknown entry type "
+                                       "%d, ignoring rest of table\n", *mpt);
+                               count = mpc->mpc_length;
+                               break;
+                       }
+               }
+       }
+       setup_apic_routing();
+       if (!num_processors)
+               printk(KERN_ERR "MPTABLE: no processors registered!\n");
+       return num_processors;
+}
+
+/*
+ * Return the ELCR bit for irq: the bit (irq & 7) of the byte read from
+ * I/O port 0x4d0 + (irq >> 3).  The caller treats 1 as level triggered.
+ */
+static int __init ELCR_trigger(unsigned int irq)
+{
+       unsigned int elcr_byte = inb(0x4d0 + (irq >> 3));
+
+       return (elcr_byte >> (irq & 7)) & 1;
+}
+
+/*
+ * Synthesize the interrupt source entries for a default MP configuration.
+ *
+ * One mp_INT entry is generated for each of the 16 legacy IRQs except those
+ * skipped below, all routed to mp_ioapics[0], followed by a single
+ * mp_ExtINT entry on pin 0.  For default type 5 the trigger mode is taken
+ * from the ELCR when its contents pass a plausibility check.
+ */
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+       struct mpc_config_intsrc intsrc;
+       int i;
+       int ELCR_fallback = 0;
+
+       intsrc.mpc_type = MP_INTSRC;
+       intsrc.mpc_irqflag = 0;                 /* conforming */
+       intsrc.mpc_srcbus = 0;
+       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+
+       intsrc.mpc_irqtype = mp_INT;
+
+       /*
+        *  If true, we have an ISA/PCI system with no IRQ entries
+        *  in the MP table. To prevent the PCI interrupts from being set up
+        *  incorrectly, we try to use the ELCR. The sanity check to see if
+        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+        *  never be level sensitive, so we simply see if the ELCR agrees.
+        *  If it does, we assume it's valid.
+        */
+       if (mpc_default_type == 5) {
+               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
+
+               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
+                       printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
+               else {
+                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
+                       ELCR_fallback = 1;
+               }
+       }
+
+       for (i = 0; i < 16; i++) {
+               switch (mpc_default_type) {
+               case 2:
+                       if (i == 0 || i == 13)
+                               continue;       /* IRQ0 & IRQ13 not connected */
+                       /* fall through */
+               default:
+                       if (i == 2)
+                               continue;       /* IRQ2 is never connected */
+               }
+
+               if (ELCR_fallback) {
+                       /*
+                        *  If the ELCR indicates a level-sensitive interrupt, we
+                        *  copy that information over to the MP table in the
+                        *  irqflag field (level sensitive, active high polarity).
+                        */
+                       if (ELCR_trigger(i))
+                               intsrc.mpc_irqflag = 13;
+                       else
+                               intsrc.mpc_irqflag = 0;
+               }
+
+               intsrc.mpc_srcbusirq = i;
+               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
+               MP_intsrc_info(&intsrc);
+       }
+
+       /* finally the ExtINT entry, reusing the fields set up above */
+       intsrc.mpc_irqtype = mp_ExtINT;
+       intsrc.mpc_srcbusirq = 0;
+       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
+       MP_intsrc_info(&intsrc);
+}
+
+/*
+ * Build the tables for one of the default MP configurations instead of
+ * parsing a real MP table: two processors (APIC ids 0 and 1), an ISA bus,
+ * an additional PCI bus for default types above 4, one I/O APIC with id 2
+ * at 0xFEC00000, the default interrupt routing, and two local interrupt
+ * sources (ExtINT on LINT0, NMI on LINT1).
+ */
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+       struct mpc_config_processor processor;
+       struct mpc_config_bus bus;
+       struct mpc_config_ioapic ioapic;
+       struct mpc_config_lintsrc lintsrc;
+       int linttypes[2] = { mp_ExtINT, mp_NMI };
+       int i;
+
+       /*
+        * local APIC has default address
+        */
+       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+       /*
+        * 2 CPUs, numbered 0 & 1.
+        */
+       processor.mpc_type = MP_PROCESSOR;
+       processor.mpc_apicver = 0;
+       processor.mpc_cpuflag = CPU_ENABLED;
+       processor.mpc_cpufeature = 0;
+       processor.mpc_featureflag = 0;
+       processor.mpc_reserved[0] = 0;
+       processor.mpc_reserved[1] = 0;
+       for (i = 0; i < 2; i++) {
+               processor.mpc_apicid = i;
+               MP_processor_info(&processor);
+       }
+
+       bus.mpc_type = MP_BUS;
+       bus.mpc_busid = 0;
+       /* every configuration, known or not, gets an ISA bus as bus 0 */
+       switch (mpc_default_type) {
+               default:
+                       printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+                               mpc_default_type);
+                       /* fall through */
+               case 1:
+               case 5:
+                       memcpy(bus.mpc_bustype, "ISA   ", 6);
+                       break;
+       }
+       MP_bus_info(&bus);
+       if (mpc_default_type > 4) {
+               bus.mpc_busid = 1;
+               memcpy(bus.mpc_bustype, "PCI   ", 6);
+               MP_bus_info(&bus);
+       }
+
+       ioapic.mpc_type = MP_IOAPIC;
+       ioapic.mpc_apicid = 2;
+       ioapic.mpc_apicver = 0;
+       ioapic.mpc_flags = MPC_APIC_USABLE;
+       ioapic.mpc_apicaddr = 0xFEC00000;
+       MP_ioapic_info(&ioapic);
+
+       /*
+        * We set up most of the low 16 IO-APIC pins according to MPS rules.
+        */
+       construct_default_ioirq_mptable(mpc_default_type);
+
+       lintsrc.mpc_type = MP_LINTSRC;
+       lintsrc.mpc_irqflag = 0;                /* conforming */
+       lintsrc.mpc_srcbusid = 0;
+       lintsrc.mpc_srcbusirq = 0;
+       lintsrc.mpc_destapic = MP_APIC_ALL;
+       for (i = 0; i < 2; i++) {
+               lintsrc.mpc_irqtype = linttypes[i];
+               lintsrc.mpc_destapiclint = i;
+               MP_lintsrc_info(&lintsrc);
+       }
+}
+
+/* Floating pointer structure recorded by smp_scan_config(), if any. */
+static struct intel_mp_floating *mpf_found;
+
+/*
+ * Build the SMP configuration from the MP floating pointer structure
+ * recorded in mpf_found.
+ *
+ * Nothing is parsed when ACPI already supplies both the local APIC and
+ * the I/O APIC information, or when no floating pointer structure was
+ * recorded.  Otherwise either a default configuration is constructed
+ * (mpf_feature1 is non-zero) or the MP configuration table at mpf_physptr
+ * is read.  If that table fails to parse, smp_found_config is cleared and
+ * the function returns early.
+ */
+void __init get_smp_config (void)
+{
+       struct intel_mp_floating *mpf = mpf_found;
+
+       /*
+        * ACPI supports both logical (e.g. Hyper-Threading) and physical
+        * processors, where MPS only supports physical.
+        */
+       if (acpi_lapic && acpi_ioapic) {
+               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
+               return;
+       }
+       else if (acpi_lapic)
+               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
+
+       /*
+        * Without a floating pointer structure there is nothing to parse;
+        * bail out instead of dereferencing a NULL pointer below.
+        */
+       if (!mpf)
+               return;
+
+       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
+
+       /*
+        * Now see if we need to read further.
+        */
+       if (mpf->mpf_feature1 != 0) {
+
+               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
+               construct_default_ISA_mptable(mpf->mpf_feature1);
+
+       } else if (mpf->mpf_physptr) {
+
+               /*
+                * Read the physical hardware table.  Anything here will
+                * override the defaults.
+                */
+               if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
+                       smp_found_config = 0;
+                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
+                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
+                       return;
+               }
+               /*
+                * If there are no explicit MP IRQ entries, then we are
+                * broken.  We set up most of the low 16 IO-APIC pins to
+                * ISA defaults and hope it will work.
+                */
+               if (!mp_irq_entries) {
+                       struct mpc_config_bus bus;
+
+                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+
+                       bus.mpc_type = MP_BUS;
+                       bus.mpc_busid = 0;
+                       memcpy(bus.mpc_bustype, "ISA   ", 6);
+                       MP_bus_info(&bus);
+
+                       construct_default_ioirq_mptable(0);
+               }
+
+       } else
+               BUG();  /* neither a default type nor a table pointer */
+
+       printk(KERN_INFO "Processors: %d\n", num_processors);
+}
+
+/*
+ * Scan @length bytes starting at physical address @base for an MP
+ * floating pointer structure, in 16-byte steps.
+ *
+ * A candidate is accepted when it starts with SMP_MAGIC_IDENT, has a
+ * length field of 1, mpf_checksum() over its 16 bytes returns 0 and its
+ * specification field is 1 or 4.  On a match smp_found_config is set,
+ * PAGE_SIZE bytes starting at the structure (and, if mpf_physptr is set,
+ * PAGE_SIZE bytes starting there) are reserved, and the structure is
+ * recorded in mpf_found.
+ *
+ * Returns 1 if a structure was found, 0 otherwise.
+ */
+static int __init smp_scan_config (unsigned long base, unsigned long length)
+{
+       /* NOTE(review): presumably left undefined so a size mismatch fails
+        * the link -- confirm. */
+       extern void __bad_mpf_size(void);
+       unsigned int *bp = phys_to_virt(base);
+       struct intel_mp_floating *mpf;
+
+       Dprintk("Scan SMP from %p for %lu bytes.\n", bp, length);
+       if (sizeof(*mpf) != 16)
+               __bad_mpf_size();
+
+       /*
+        * Only look at complete 16-byte candidates.  'length' is unsigned,
+        * so testing "> 0" would wrap around and keep scanning if it were
+        * ever not a multiple of 16.
+        */
+       while (length >= 16) {
+               mpf = (struct intel_mp_floating *)bp;
+               if ((*bp == SMP_MAGIC_IDENT) &&
+                       (mpf->mpf_length == 1) &&
+                       !mpf_checksum((unsigned char *)bp, 16) &&
+                       ((mpf->mpf_specification == 1)
+                               || (mpf->mpf_specification == 4)) ) {
+
+                       smp_found_config = 1;
+                       reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+                       if (mpf->mpf_physptr)
+                               reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
+                       mpf_found = mpf;
+                       return 1;
+               }
+               bp += 4;        /* 4 * sizeof(unsigned int) == 16 bytes */
+               length -= 16;
+       }
+       return 0;
+}
+
+/*
+ * Look for an MP floating pointer structure in the usual places and stop
+ * at the first region in which smp_scan_config() reports a hit.  A
+ * message is logged when none of the regions contains one.
+ */
+void __init find_smp_config(void)
+{
+       unsigned int ebda;
+
+       /*
+        * FIXME: like the rest of Linux this assumes 640K of base RAM.
+        */
+
+       /* 1) the bottom 1K */
+       if (smp_scan_config(0x0, 0x400))
+               return;
+
+       /* 2) the top 1K of base RAM */
+       if (smp_scan_config(639 * 0x400, 0x400))
+               return;
+
+       /* 3) the 64K of BIOS */
+       if (smp_scan_config(0xF0000, 0x10000))
+               return;
+
+       /*
+        * Still nothing: the word at 0x40E is a real-mode segment that
+        * points at the EBDA, so turn it into a linear address and scan
+        * 4K from there.
+        *
+        * NOTE! Some Linux loaders corrupt the EBDA.  An MP table found
+        * there is therefore less trustworthy, because it may have been
+        * stomped on during early boot.  Those loaders are buggy and
+        * should be fixed.
+        */
+       ebda = (unsigned int)*(unsigned short *)phys_to_virt(0x40E) << 4;
+
+       /* Every location has been tried once this scan fails as well. */
+       if (!smp_scan_config(ebda, 0x1000))
+               printk(KERN_INFO "No mptable found.\n");
+}
+
+/* --------------------------------------------------------------------------
+                            ACPI-based MP Configuration
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI
+
+/*
+ * Record the local APIC address and install it at the FIX_APIC_BASE
+ * fixmap slot with set_fixmap_nocache().  If boot_cpu_id is still -1U it
+ * is then filled in from the APIC ID register.
+ *
+ * @address: local APIC address; stored after a cast to unsigned long.
+ */
+void __init mp_register_lapic_address(u64 address)
+{
+       mp_lapic_addr = (unsigned long) address;
+       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+       /*
+        * NOTE(review): apic_read() presumably goes through the mapping
+        * installed above, so the order of these statements matters --
+        * confirm.
+        */
+       if (boot_cpu_id == -1U)
+               boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
+}
+
+/*
+ * Hand one local APIC to MP_processor_info() as a synthetic MP processor
+ * entry.
+ *
+ * @id:      local APIC ID stored in the entry.
+ * @enabled: non-zero marks the entry CPU_ENABLED.
+ *
+ * The entry is additionally flagged CPU_BOOTPROCESSOR when @id equals
+ * boot_cpu_id.  Version, feature and reserved fields are zeroed.
+ */
+void __cpuinit mp_register_lapic (u8 id, u8 enabled)
+{
+       struct mpc_config_processor entry;
+       int flags = 0;
+
+       if (enabled)
+               flags |= CPU_ENABLED;
+       if (id == boot_cpu_id)
+               flags |= CPU_BOOTPROCESSOR;
+
+       entry.mpc_type = MP_PROCESSOR;
+       entry.mpc_apicid = id;
+       entry.mpc_apicver = 0;
+       entry.mpc_cpuflag = flags;
+       entry.mpc_cpufeature = 0;
+       entry.mpc_featureflag = 0;
+       entry.mpc_reserved[0] = 0;
+       entry.mpc_reserved[1] = 0;
+
+       MP_processor_info(&entry);
+}
+
+/* Bus ID used as the source bus of the ISA interrupt entries built below. */
+#define MP_ISA_BUS             0
+/* 127 is the last pin that fits the 4 x 32-bit pin_programmed[] bitmap. */
+#define MP_MAX_IOAPIC_PIN      127
+
+/*
+ * Per-I/O-APIC routing data, filled in by mp_register_ioapic() and
+ * indexed like mp_ioapics[]: the APIC ID, the inclusive GSI range served
+ * by the I/O APIC and a bitmap of the pins mp_register_gsi() has already
+ * programmed.
+ */
+static struct mp_ioapic_routing {
+       int                     apic_id;
+       int                     gsi_start;      /* first GSI */
+       int                     gsi_end;        /* last GSI, inclusive */
+       u32                     pin_programmed[4];      /* one bit per pin */
+} mp_ioapic_routing[MAX_IO_APICS];
+
+/*
+ * Map a GSI to the index of the registered I/O APIC whose
+ * [gsi_start, gsi_end] range contains it.
+ *
+ * Returns the index, or -1 (after logging an error) when no registered
+ * I/O APIC covers @gsi.
+ */
+static int mp_find_ioapic(int gsi)
+{
+       int apic;
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+               const struct mp_ioapic_routing *route = &mp_ioapic_routing[apic];
+
+               /* Not within this I/O APIC's range: try the next one. */
+               if (gsi < route->gsi_start || gsi > route->gsi_end)
+                       continue;
+
+               return apic;
+       }
+
+       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+       return -1;
+}
+
+/*
+ * Pick an APIC ID for a new I/O APIC.
+ *
+ * Returns @id if no already registered I/O APIC uses it, otherwise the
+ * lowest ID (out of 256) that none of them uses.
+ */
+static u8 uniq_ioapic_id(u8 id)
+{
+       DECLARE_BITMAP(taken, 256);
+       int n;
+
+       /* Collect the IDs of all I/O APICs registered so far. */
+       bitmap_zero(taken, 256);
+       for (n = 0; n < nr_ioapics; n++)
+               __set_bit(mp_ioapics[n].mpc_apicid, taken);
+
+       if (test_bit(id, taken))
+               return find_first_zero_bit(taken, 256);
+       return id;
+}
+
+/*
+ * Register one I/O APIC in slot nr_ioapics of mp_ioapics[] and
+ * mp_ioapic_routing[], then bump nr_ioapics.
+ *
+ * @id:       requested APIC ID; uniq_ioapic_id() substitutes another one
+ *            if a previously registered I/O APIC already uses it.
+ * @address:  address of the I/O APIC, installed at fixmap slot
+ *            FIX_IO_APIC_BASE_0 + index.
+ * @gsi_base: first GSI served by this I/O APIC.
+ *
+ * Does nothing if bad_ioapic() rejects @address.
+ */
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
+{
+       int idx = 0;
+
+       if (bad_ioapic(address))
+               return;
+
+       idx = nr_ioapics;
+
+       mp_ioapics[idx].mpc_type = MP_IOAPIC;
+       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+       mp_ioapics[idx].mpc_apicaddr = address;
+
+       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+       /*
+        * nr_ioapics is only incremented at the end, so uniq_ioapic_id()
+        * compares against the previously registered I/O APICs only.
+        */
+       mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
+       mp_ioapics[idx].mpc_apicver = 0;
+       
+       /*
+        * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
+        * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
+        */
+       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+       mp_ioapic_routing[idx].gsi_start = gsi_base;
+       /*
+        * gsi_end is treated as inclusive by mp_find_ioapic().
+        * NOTE(review): this relies on io_apic_get_redir_entries()
+        * returning the highest entry index rather than a count -- confirm.
+        */
+       mp_ioapic_routing[idx].gsi_end = gsi_base + 
+               io_apic_get_redir_entries(idx);
+
+       printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
+               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
+               mp_ioapics[idx].mpc_apicaddr,
+               mp_ioapic_routing[idx].gsi_start,
+               mp_ioapic_routing[idx].gsi_end);
+
+       nr_ioapics++;
+}
+
+/*
+ * Append an MP interrupt source entry that routes ISA IRQ @bus_irq to
+ * the I/O APIC pin serving @gsi.
+ *
+ * @bus_irq:  ISA IRQ number (source bus IRQ of the entry).
+ * @polarity: stored in the low two bits of the entry's irqflag.
+ * @trigger:  stored in bits 2-3 of the entry's irqflag.
+ * @gsi:      global system interrupt the IRQ is routed to.
+ *
+ * Returns without adding anything if no I/O APIC covers @gsi.  Panics
+ * once the table holds MAX_IRQ_SOURCES entries.
+ */
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+       struct mpc_config_intsrc entry;
+       int apic;
+       int apic_pin;
+
+       /* Translate the GSI into an (I/O APIC, pin) pair. */
+       apic = mp_find_ioapic(gsi);
+       if (apic < 0)
+               return;
+       apic_pin = gsi - mp_ioapic_routing[apic].gsi_start;
+
+       /*
+        * TBD: faulty timer entries erroneously set the trigger to level
+        *      (3), which results in a HUGE increase of timer interrupts.
+        *      Force IRQ0 back to trigger value 1 in that case.
+        */
+       if (!bus_irq && trigger == 3)
+               trigger = 1;
+
+       entry.mpc_type = MP_INTSRC;
+       entry.mpc_irqtype = mp_INT;
+       entry.mpc_irqflag = (trigger << 2) | polarity;
+       entry.mpc_srcbus = MP_ISA_BUS;
+       entry.mpc_srcbusirq = bus_irq;                          /* IRQ */
+       entry.mpc_dstapic = mp_ioapics[apic].mpc_apicid;        /* APIC ID */
+       entry.mpc_dstirq = apic_pin;                            /* INTIN# */
+
+       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
+               entry.mpc_irqtype, entry.mpc_irqflag & 3,
+               (entry.mpc_irqflag >> 2) & 3, entry.mpc_srcbus,
+               entry.mpc_srcbusirq, entry.mpc_dstapic, entry.mpc_dstirq);
+
+       mp_irqs[mp_irq_entries++] = entry;
+       if (mp_irq_entries == MAX_IRQ_SOURCES)
+               panic("Max # of irq sources exceeded!\n");
+}
+
+/*
+ * Add identity-mapped MP interrupt source entries for the ISA IRQs 0-15
+ * that are not already covered by an existing entry.
+ *
+ * An IRQ is skipped when mp_irqs[] already holds an entry for that ISA
+ * IRQ, or an entry whose destination is the same pin number on the I/O
+ * APIC that serves GSI 0.  Returns early if no I/O APIC serves GSI 0.
+ * Panics once the table holds MAX_IRQ_SOURCES entries.
+ */
+void __init mp_config_acpi_legacy_irqs(void)
+{
+       struct mpc_config_intsrc intsrc;
+       int i = 0;
+       int ioapic = -1;
+
+       /* 
+        * Fabricate the legacy ISA bus (bus #MP_ISA_BUS, i.e. bus 0).
+        */
+       set_bit(MP_ISA_BUS, mp_bus_not_pci);
+
+       /* 
+        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
+        */
+       ioapic = mp_find_ioapic(0);
+       if (ioapic < 0)
+               return;
+
+       /* Fields shared by every entry created in the loop below. */
+       intsrc.mpc_type = MP_INTSRC;
+       intsrc.mpc_irqflag = 0;                                 /* Conforming */
+       intsrc.mpc_srcbus = MP_ISA_BUS;
+       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+
+       /* 
+        * Use the default configuration for the IRQs 0-15.  Unless
+        * overridden by (MADT) interrupt source override entries.
+        */
+       for (i = 0; i < 16; i++) {
+               int idx;
+
+               /* idx stops short of mp_irq_entries if a conflict is found. */
+               for (idx = 0; idx < mp_irq_entries; idx++) {
+                       struct mpc_config_intsrc *irq = mp_irqs + idx;
+
+                       /* Do we already have a mapping for this ISA IRQ? */
+                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
+                               break;
+
+                       /* Do we already have a mapping for this IOAPIC pin */
+                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
+                               (irq->mpc_dstirq == i))
+                               break;
+               }
+
+               if (idx != mp_irq_entries) {
+                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+                       continue;                       /* IRQ already used */
+               }
+
+               intsrc.mpc_irqtype = mp_INT;
+               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
+               intsrc.mpc_dstirq = i;
+
+               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
+                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
+                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
+                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
+                       intsrc.mpc_dstirq);
+
+               mp_irqs[mp_irq_entries] = intsrc;
+               if (++mp_irq_entries == MAX_IRQ_SOURCES)
+                       panic("Max # of irq sources exceeded!\n");
+       }
+}
+
+/*
+ * Program the I/O APIC pin that serves @gsi, at most once per pin.
+ *
+ * @gsi:        global system interrupt number.
+ * @triggering: passed to io_apic_set_pci_routing() as 0 when it equals
+ *              ACPI_EDGE_SENSITIVE and as 1 otherwise.
+ * @polarity:   passed to io_apic_set_pci_routing() as 0 when it equals
+ *              ACPI_ACTIVE_HIGH and as 1 otherwise.
+ *
+ * Returns @gsi in every case.  Nothing is programmed when the IRQ model
+ * is not ACPI_IRQ_MODEL_IOAPIC, when @gsi is the ACPI SCI, when no I/O
+ * APIC covers @gsi, when the pin number is out of range or when the pin
+ * has been programmed before.
+ */
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+       int ioapic;
+       int ioapic_pin;
+       int idx;
+       u32 mask;
+
+       if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+               return gsi;
+
+       /* Don't set up the ACPI SCI because it's already set up */
+       if (acpi_gbl_FADT.sci_interrupt == gsi)
+               return gsi;
+
+       ioapic = mp_find_ioapic(gsi);
+       if (ioapic < 0) {
+               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+               return gsi;
+       }
+
+       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
+
+       /* pin_programmed[] only has room for pins 0..MP_MAX_IOAPIC_PIN. */
+       if (ioapic_pin < 0 || ioapic_pin > MP_MAX_IOAPIC_PIN) {
+               printk(KERN_ERR "Invalid reference to IOAPIC pin "
+                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+                       ioapic_pin);
+               return gsi;
+       }
+
+       /*
+        * Avoid pin reprogramming.  PRTs typically include entries
+        * with redundant pin->gsi mappings (but unique PCI devices);
+        * we only program the IOAPIC on the first.
+        *
+        * The mask is built from an unsigned 1 so that bit 31 does not
+        * shift into the sign bit of an int.
+        */
+       idx = ioapic_pin / 32;
+       mask = 1U << (ioapic_pin % 32);
+       if (mp_ioapic_routing[ioapic].pin_programmed[idx] & mask) {
+               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+               return gsi;
+       }
+
+       mp_ioapic_routing[ioapic].pin_programmed[idx] |= mask;
+
+       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+               triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+               polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+       return gsi;
+}
+#endif /*CONFIG_ACPI*/
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
new file mode 100644 (file)
index 0000000..0ec6d2d
--- /dev/null
@@ -0,0 +1,483 @@
+/*
+ *  linux/arch/x86_64/nmi.c
+ *
+ *  NMI watchdog support on APIC systems
+ *
+ *  Started by Ingo Molnar <mingo@redhat.com>
+ *
+ *  Fixes:
+ *  Mikael Pettersson  : AMD K7 support for local APIC NMI watchdog.
+ *  Mikael Pettersson  : Power Management for local APIC NMI watchdog.
+ *  Pavel Machek and
+ *  Mikael Pettersson  : PM converted to driver model. Disable/enable API.
+ */
+
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/sysctl.h>
+#include <linux/kprobes.h>
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+
+#include <asm/smp.h>
+#include <asm/nmi.h>
+#include <asm/proto.h>
+#include <asm/mce.h>
+
+/* Non-zero: do_nmi_callback() hands NMIs to unknown_nmi_panic_callback(). */
+int unknown_nmi_panic;
+/* Refreshed from nmi_active at the start of every proc_nmi_enabled() call. */
+int nmi_watchdog_enabled;
+int panic_on_unrecovered_nmi;
+
+/* CPUs that still have to dump their stack from nmi_watchdog_tick(). */
+static cpumask_t backtrace_mask = CPU_MASK_NONE;
+
+/* nmi_active:
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
+ *     be enabled
+ *  0: the lapic NMI watchdog is disabled, but can be enabled
+ */
+atomic_t nmi_active = ATOMIC_INIT(0);          /* oprofile uses this */
+/* Set by "nmi_watchdog=panic"; passed to die_nmi() when a lockup is found. */
+int panic_on_timeout;
+
+/* Selected watchdog mode (one of the NMI_* values). */
+unsigned int nmi_watchdog = NMI_DEFAULT;
+/* Watchdog ticks per second, as used by the delay in check_nmi_watchdog(). */
+static unsigned int nmi_hz = HZ;
+
+/* Per-CPU flag: set by setup_apic_nmi_watchdog(), cleared when stopped. */
+static DEFINE_PER_CPU(short, wd_enabled);
+
+/* local prototypes */
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
+
+/*
+ * Resolve the NMI_DEFAULT placeholder: if no watchdog mode has been
+ * selected yet, fall back to NMI_NONE.  An explicitly chosen mode is
+ * left alone.
+ *
+ * Run after command line and cpu_init init, but before all other checks.
+ */
+void nmi_watchdog_default(void)
+{
+       if (nmi_watchdog == NMI_DEFAULT)
+               nmi_watchdog = NMI_NONE;
+}
+
+static int endflag __initdata = 0;
+
+#ifdef CONFIG_SMP
+/* The performance counters used by NMI_LOCAL_APIC don't trigger when
+ * the CPU is idle. To make sure the NMI watchdog really ticks on all
+ * CPUs during the test make them busy.
+ *
+ * Started on the other CPUs by check_nmi_watchdog(); spins with
+ * interrupts enabled until endflag becomes non-zero.  @data is not used.
+ */
+static __init void nmi_cpu_busy(void *data)
+{
+       local_irq_enable_in_hardirq();
+       /* Intentionally don't use cpu_relax here. This is
+          to make sure that the performance counter really ticks,
+          even if there is a simulator or similar that catches the
+          pause instruction. On a real HT machine this is fine because
+          all other CPUs are busy with "useless" delay loops and don't
+          care if they get somewhat less cycles. */
+       while (endflag == 0)
+               mb();
+}
+#endif
+
+/*
+ * Boot-time self test of the NMI watchdog.
+ *
+ * Snapshots every CPU's __nmi_count, waits 20 watchdog ticks with
+ * interrupts enabled and then treats each online CPU that has wd_enabled
+ * set and whose count advanced by 5 or less as stuck: its wd_enabled
+ * flag is cleared and nmi_active is decremented.
+ *
+ * Returns 0 when the watchdog is not in use (mode NMI_NONE or
+ * NMI_DISABLED, or nmi_active == 0) or when nmi_active is still non-zero
+ * after the test.  Returns -1 when the counter buffer cannot be
+ * allocated, or when nmi_active dropped to zero, in which case it is set
+ * to -1.
+ */
+int __init check_nmi_watchdog (void)
+{
+       int *counts;
+       int cpu;
+
+       if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) 
+               return 0;
+
+       if (!atomic_read(&nmi_active))
+               return 0;
+
+       counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
+       if (!counts)
+               return -1;
+
+       printk(KERN_INFO "testing NMI watchdog ... ");
+
+#ifdef CONFIG_SMP
+       /* Keep the other CPUs busy; see the comment above nmi_cpu_busy(). */
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
+#endif
+
+       /* Baseline NMI counts, taken before the wait. */
+       for (cpu = 0; cpu < NR_CPUS; cpu++)
+               counts[cpu] = cpu_pda(cpu)->__nmi_count;
+       local_irq_enable();
+       mdelay((20*1000)/nmi_hz); // wait 20 ticks
+
+       for_each_online_cpu(cpu) {
+               if (!per_cpu(wd_enabled, cpu))
+                       continue;
+               if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
+                       printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
+                              cpu,
+                              counts[cpu],
+                              cpu_pda(cpu)->__nmi_count);
+                       per_cpu(wd_enabled, cpu) = 0;
+                       atomic_dec(&nmi_active);
+               }
+       }
+       /* No working watchdog left: mark it as permanently unavailable. */
+       if (!atomic_read(&nmi_active)) {
+               kfree(counts);
+               atomic_set(&nmi_active, -1);
+               endflag = 1;    /* releases the CPUs spinning in nmi_cpu_busy() */
+               return -1;
+       }
+       endflag = 1;            /* releases the CPUs spinning in nmi_cpu_busy() */
+       printk("OK.\n");
+
+       /* now that we know it works we can reduce NMI frequency to
+          something more reasonable; makes a difference in some configs */
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               nmi_hz = lapic_adjust_nmi_hz(1);
+
+       kfree(counts);
+       return 0;
+}
+
+/*
+ * Parse the "nmi_watchdog=" boot option, of the form [panic,]<mode>.
+ *
+ * A leading "panic" sets panic_on_timeout; if nothing follows it, the
+ * watchdog mode is left unchanged.  <mode> must be a number in the range
+ * NMI_NONE <= mode < NMI_INVALID and is stored in nmi_watchdog.
+ *
+ * Returns 1 when the option was accepted, 0 when the mode is missing or
+ * out of range.
+ */
+int __init setup_nmi_watchdog(char *str)
+{
+       int nmi;
+
+       if (!strncmp(str,"panic",5)) {
+               panic_on_timeout = 1;
+               str = strchr(str, ',');
+               if (!str)
+                       return 1;
+               ++str;
+       }
+
+       /*
+        * get_option() returns 0 when it finds no integer.  'nmi' would
+        * then be tested below without ever having been written, so
+        * reject the option here.
+        */
+       if (!get_option(&str, &nmi))
+               return 0;
+
+       if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
+               return 0;
+
+       nmi_watchdog = nmi;
+       return 1;
+}
+
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+
+/*
+ * Per-CPU worker for acpi_nmi_disable(): writes NMI delivery mode with
+ * the mask bit set to the APIC_LVT0 register.  The argument is unused.
+ */
+static void __acpi_nmi_disable(void *__unused)
+{
+       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+}
+
+/*
+ * Mask the timer based NMIs on every CPU.  This only acts while the
+ * watchdog is active (nmi_active non-zero) and runs in NMI_IO_APIC mode.
+ */
+void acpi_nmi_disable(void)
+{
+       if (!atomic_read(&nmi_active))
+               return;
+       if (nmi_watchdog != NMI_IO_APIC)
+               return;
+
+       on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
+}
+
+/*
+ * Per-CPU worker for acpi_nmi_enable(): writes NMI delivery mode without
+ * the mask bit to the APIC_LVT0 register.  The argument is unused.
+ */
+static void __acpi_nmi_enable(void *__unused)
+{
+       apic_write(APIC_LVT0, APIC_DM_NMI);
+}
+
+/*
+ * Unmask the timer based NMIs on every CPU.  This only acts while the
+ * watchdog is active (nmi_active non-zero) and runs in NMI_IO_APIC mode.
+ */
+void acpi_nmi_enable(void)
+{
+       if (!atomic_read(&nmi_active))
+               return;
+       if (nmi_watchdog != NMI_IO_APIC)
+               return;
+
+       on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
+}
+#ifdef CONFIG_PM
+
+static int nmi_pm_active; /* nmi_active before suspend */
+
+/*
+ * sysdev suspend hook: remember nmi_active in nmi_pm_active for
+ * lapic_nmi_resume() and stop the watchdog on this CPU.  Always
+ * returns 0; @dev and @state are not used.
+ */
+static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
+{
+       /* only CPU0 goes here, other CPUs should be offline */
+       nmi_pm_active = atomic_read(&nmi_active);
+       stop_apic_nmi_watchdog(NULL);
+       /* With every other CPU offline no watchdog may be left active. */
+       BUG_ON(atomic_read(&nmi_active) != 0);
+       return 0;
+}
+
+/*
+ * sysdev resume hook: restart the watchdog if it was active when
+ * lapic_nmi_suspend() ran.  Always returns 0; @dev is not used.
+ */
+static int lapic_nmi_resume(struct sys_device *dev)
+{
+       /* only CPU0 goes here, other CPUs should be offline */
+       if (nmi_pm_active <= 0)
+               return 0;
+
+       setup_apic_nmi_watchdog(NULL);
+       touch_nmi_watchdog();
+       return 0;
+}
+
+/* sysdev class "lapic_nmi": carries the suspend/resume hooks above. */
+static struct sysdev_class nmi_sysclass = {
+       set_kset_name("lapic_nmi"),
+       .resume         = lapic_nmi_resume,
+       .suspend        = lapic_nmi_suspend,
+};
+
+/* The single device of that class, registered by init_lapic_nmi_sysfs(). */
+static struct sys_device device_lapic_nmi = {
+       .id             = 0,
+       .cls    = &nmi_sysclass,
+};
+
+/*
+ * Register the lapic_nmi sysdev class and its device so that the
+ * suspend/resume hooks get called.
+ *
+ * Nothing is registered (and 0 is returned) unless the watchdog runs in
+ * NMI_LOCAL_APIC mode and nmi_active is not negative.  Otherwise the
+ * result of the first failing registration, or 0 on success, is returned.
+ */
+static int __init init_lapic_nmi_sysfs(void)
+{
+       int rc;
+
+       /* should really be a BUG_ON but b/c this is an
+        * init call, it just doesn't work.  -dcz
+        */
+       if (nmi_watchdog != NMI_LOCAL_APIC || atomic_read(&nmi_active) < 0)
+               return 0;
+
+       rc = sysdev_class_register(&nmi_sysclass);
+       if (rc)
+               return rc;
+
+       return sysdev_register(&device_lapic_nmi);
+}
+/* must come after the local APIC's device_initcall() */
+late_initcall(init_lapic_nmi_sysfs);
+
+#endif /* CONFIG_PM */
+
+/*
+ * Enable the NMI watchdog on the current CPU.
+ *
+ * Does nothing if this CPU's wd_enabled flag is already 1, or if this is
+ * not CPU 0 and nmi_active is not positive.  In NMI_LOCAL_APIC mode
+ * lapic_watchdog_init() must succeed first.  On success wd_enabled is
+ * set and nmi_active is incremented.  Other modes are ignored.  The
+ * argument is unused.
+ */
+void setup_apic_nmi_watchdog(void *unused)
+{
+       if (__get_cpu_var(wd_enabled) == 1)
+               return;
+
+       /* cheap hack to support suspend/resume */
+       /* if cpu0 is not active neither should the other cpus */
+       if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+               return;
+
+       switch (nmi_watchdog) {
+       case NMI_LOCAL_APIC:
+               /* Set before the init call and rolled back if it fails. */
+               __get_cpu_var(wd_enabled) = 1;
+               if (lapic_watchdog_init(nmi_hz) < 0) {
+                       __get_cpu_var(wd_enabled) = 0;
+                       return;
+               }
+               /* FALL THROUGH */
+       case NMI_IO_APIC:
+               __get_cpu_var(wd_enabled) = 1;
+               atomic_inc(&nmi_active);
+       }
+}
+
+/*
+ * Disable the NMI watchdog on the current CPU: clear its wd_enabled flag
+ * and decrement nmi_active, stopping the local APIC watchdog first when
+ * that is the active mode.  Does nothing for other watchdog modes or
+ * when the flag is already clear.  The argument is unused.
+ */
+void stop_apic_nmi_watchdog(void *unused)
+{
+       /* only support LOCAL and IO APICs for now */
+       switch (nmi_watchdog) {
+       case NMI_LOCAL_APIC:
+       case NMI_IO_APIC:
+               break;
+       default:
+               return;
+       }
+
+       if (!__get_cpu_var(wd_enabled))
+               return;
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               lapic_watchdog_stop();
+
+       __get_cpu_var(wd_enabled) = 0;
+       atomic_dec(&nmi_active);
+}
+
+/*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check its local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ */
+
+static DEFINE_PER_CPU(unsigned, last_irq_sum);
+static DEFINE_PER_CPU(local_t, alert_counter);
+static DEFINE_PER_CPU(int, nmi_touch);
+
+/*
+ * Ask every present CPU to restart its lockup countdown by setting its
+ * nmi_touch flag (consumed by nmi_watchdog_tick()), then touch the
+ * softlockup watchdog as well.  The flags are only written when
+ * nmi_watchdog is greater than 0.
+ */
+void touch_nmi_watchdog(void)
+{
+       if (nmi_watchdog > 0) {
+               unsigned cpu;
+
+               /*
+                * Tell other CPUs to reset their alert counters. We cannot
+                * do it ourselves because the alert count increase is not
+                * atomic.
+                */
+               for_each_present_cpu(cpu) {
+                       /* Write only if the flag is not already set. */
+                       if (per_cpu(nmi_touch, cpu) != 1)
+                               per_cpu(nmi_touch, cpu) = 1;
+               }
+       }
+
+       touch_softlockup_watchdog();
+}
+
+/*
+ * Watchdog work done for an NMI on the current CPU.
+ *
+ * @regs:   register state, passed on to the DIE_NMI notifiers and to
+ *          die_nmi().
+ * @reason: passed on to the DIE_NMI notifiers.
+ *
+ * Dumps the stack if this CPU is set in backtrace_mask, and calls
+ * die_nmi() once the local APIC timer count has not changed for
+ * 5 * nmi_hz consecutive untouched ticks.
+ *
+ * Returns 1 if a DIE_NMI notifier returned NOTIFY_STOP.  When this CPU's
+ * watchdog is enabled the result additionally includes lapic_wd_event()
+ * in NMI_LOCAL_APIC mode and is always 1 in NMI_IO_APIC mode.  Otherwise
+ * 0 is returned.
+ */
+int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+{
+       int sum;
+       int touched = 0;
+       int cpu = smp_processor_id();
+       int rc = 0;
+
+       /* check for other users first */
+       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+                       == NOTIFY_STOP) {
+               rc = 1;
+               touched = 1;
+       }
+
+       sum = read_pda(apic_timer_irqs);
+       /* Consume a pending touch_nmi_watchdog() request for this CPU. */
+       if (__get_cpu_var(nmi_touch)) {
+               __get_cpu_var(nmi_touch) = 0;
+               touched = 1;
+       }
+
+       /* Backtrace requested by __trigger_all_cpu_backtrace(). */
+       if (cpu_isset(cpu, backtrace_mask)) {
+               static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
+
+               spin_lock(&lock);
+               printk("NMI backtrace for cpu %d\n", cpu);
+               dump_stack();
+               spin_unlock(&lock);
+               cpu_clear(cpu, backtrace_mask);
+       }
+
+#ifdef CONFIG_X86_MCE
+       /* Could check oops_in_progress here too, but it's safer
+          not to */
+       if (atomic_read(&mce_entry) > 0)
+               touched = 1;
+#endif
+       /* if the apic timer isn't firing, this cpu isn't doing much */
+       if (!touched && __get_cpu_var(last_irq_sum) == sum) {
+               /*
+                * Ayiee, looks like this CPU is stuck ...
+                * wait a few IRQs (5 seconds) before doing the oops ...
+                */
+               local_inc(&__get_cpu_var(alert_counter));
+               if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
+                       die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
+                               panic_on_timeout);
+       } else {
+               /* Progress was made (or we were touched): restart the count. */
+               __get_cpu_var(last_irq_sum) = sum;
+               local_set(&__get_cpu_var(alert_counter), 0);
+       }
+
+       /* see if the nmi watchdog went off */
+       if (!__get_cpu_var(wd_enabled))
+               return rc;
+       switch (nmi_watchdog) {
+       case NMI_LOCAL_APIC:
+               rc |= lapic_wd_event(nmi_hz);
+               break;
+       case NMI_IO_APIC:
+               /* don't know how to accurately check for this.
+                * just assume it was a watchdog timer interrupt
+                * This matches the old behaviour.
+                */
+               rc = 1;
+               break;
+       }
+       return rc;
+}
+
+/* Non-zero while NMIs are to be ignored; see stop_nmi()/restart_nmi(). */
+static unsigned ignore_nmis;
+
+/*
+ * NMI entry point: counts the NMI in this CPU's __nmi_count and, unless
+ * ignore_nmis is set, passes it to default_do_nmi().  @error_code is
+ * not used.
+ */
+asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
+{
+       nmi_enter();
+       /* Counted even while ignored. */
+       add_pda(__nmi_count,1);
+       if (!ignore_nmis)
+               default_do_nmi(regs);
+       nmi_exit();
+}
+
+/*
+ * With CONFIG_SYSCTL and unknown_nmi_panic set, hand the NMI to
+ * unknown_nmi_panic_callback() and return its result.  Returns 0 in
+ * every other case.
+ */
+int do_nmi_callback(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_SYSCTL
+       if (unknown_nmi_panic)
+               return unknown_nmi_panic_callback(regs, cpu);
+#endif
+       return 0;
+}
+
+/*
+ * Mask the timer based NMIs (acpi_nmi_disable()) and make do_nmi()
+ * ignore any NMI that still arrives.  Calls nest: each stop_nmi() needs
+ * a matching restart_nmi().
+ */
+void stop_nmi(void)
+{
+       acpi_nmi_disable();
+       ignore_nmis++;
+}
+
+/*
+ * Undo one stop_nmi(): decrement ignore_nmis and unmask the timer based
+ * NMIs again (acpi_nmi_enable()).
+ */
+void restart_nmi(void)
+{
+       ignore_nmis--;
+       acpi_nmi_enable();
+}
+
+#ifdef CONFIG_SYSCTL
+
+/*
+ * Report an NMI of unknown origin: format the reason byte returned by
+ * get_nmi_reason() into a message and pass it to die_nmi() with the
+ * panic argument set to 1.  Returns 0 if die_nmi() returns.  @cpu is
+ * not used.
+ */
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
+{
+       unsigned char reason = get_nmi_reason();
+       char buf[64];
+
+       /* Bounded formatting: never write past the end of buf. */
+       snprintf(buf, sizeof(buf), "NMI received for unknown reason %02x\n",
+                reason);
+       die_nmi(buf, regs, 1);  /* Always panic here */
+       return 0;
+}
+
+/*
+ * proc handler for /proc/sys/kernel/nmi
+ *
+ * nmi_watchdog_enabled is refreshed from nmi_active, the request is
+ * handed to proc_dointvec(), and if that changed the enabled state the
+ * local APIC watchdog is switched on or off accordingly.
+ * NOTE(review): this assumes table->data points at nmi_watchdog_enabled
+ * -- confirm against the sysctl table.
+ *
+ * Returns 0 on success or when the state did not change, the error from
+ * proc_dointvec() if it fails, and -EIO when the watchdog is permanently
+ * disabled or the mode is not NMI_LOCAL_APIC.
+ */
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+                       void __user *buffer, size_t *length, loff_t *ppos)
+{
+       int old_state;
+       int rc;
+
+       nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+       old_state = nmi_watchdog_enabled;
+
+       /* Report a failed read/write instead of silently returning 0. */
+       rc = proc_dointvec(table, write, file, buffer, length, ppos);
+       if (rc)
+               return rc;
+
+       if (!!old_state == !!nmi_watchdog_enabled)
+               return 0;
+
+       if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
+               printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+               return -EIO;
+       }
+
+       /* if nmi_watchdog is not set yet, then set it */
+       nmi_watchdog_default();
+
+       if (nmi_watchdog == NMI_LOCAL_APIC) {
+               if (nmi_watchdog_enabled)
+                       enable_lapic_nmi_watchdog();
+               else
+                       disable_lapic_nmi_watchdog();
+       } else {
+               printk( KERN_WARNING
+                       "NMI watchdog doesn't know what hardware to touch\n");
+               return -EIO;
+       }
+       return 0;
+}
+
+#endif
+
+/*
+ * Request a stack dump from every online CPU by marking it in
+ * backtrace_mask; nmi_watchdog_tick() prints the dump and clears the
+ * CPU's bit.  Waits in 1 ms steps, for at most 10 seconds, until the
+ * mask is empty.
+ */
+void __trigger_all_cpu_backtrace(void)
+{
+       int ms_left;
+
+       backtrace_mask = cpu_online_map;
+
+       /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+       for (ms_left = 10 * 1000;
+            ms_left > 0 && !cpus_empty(backtrace_mask);
+            ms_left--)
+               mdelay(1);
+}
+
+EXPORT_SYMBOL(nmi_active);
+EXPORT_SYMBOL(nmi_watchdog);
+EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
new file mode 100644 (file)
index 0000000..71da01e
--- /dev/null
@@ -0,0 +1,1578 @@
+/*
+ * Derived from arch/powerpc/kernel/iommu.c
+ *
+ * Copyright IBM Corporation, 2006-2007
+ * Copyright (C) 2006  Jon Mason <jdmason@kudzu.us>
+ *
+ * Author: Jon Mason <jdmason@kudzu.us>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <asm/iommu.h>
+#include <asm/calgary.h>
+#include <asm/tce.h>
+#include <asm/pci-direct.h>
+#include <asm/system.h>
+#include <asm/dma.h>
+#include <asm/rio.h>
+
+#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
+int use_calgary __read_mostly = 1;
+#else
+int use_calgary __read_mostly = 0;
+#endif /* CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT */
+
+#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
+#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
+
+/* register offsets inside the host bridge space */
+#define CALGARY_CONFIG_REG     0x0108
+#define PHB_CSR_OFFSET         0x0110 /* Channel Status */
+#define PHB_PLSSR_OFFSET       0x0120
+#define PHB_CONFIG_RW_OFFSET   0x0160
+#define PHB_IOBASE_BAR_LOW     0x0170
+#define PHB_IOBASE_BAR_HIGH    0x0180
+#define PHB_MEM_1_LOW          0x0190
+#define PHB_MEM_1_HIGH         0x01A0
+#define PHB_IO_ADDR_SIZE       0x01B0
+#define PHB_MEM_1_SIZE         0x01C0
+#define PHB_MEM_ST_OFFSET      0x01D0
+#define PHB_AER_OFFSET         0x0200
+#define PHB_CONFIG_0_HIGH      0x0220
+#define PHB_CONFIG_0_LOW       0x0230
+#define PHB_CONFIG_0_END       0x0240
+#define PHB_MEM_2_LOW          0x02B0
+#define PHB_MEM_2_HIGH         0x02C0
+#define PHB_MEM_2_SIZE_HIGH    0x02D0
+#define PHB_MEM_2_SIZE_LOW     0x02E0
+#define PHB_DOSHOLE_OFFSET     0x08E0
+
+/* CalIOC2 specific */
+#define PHB_SAVIOR_L2          0x0DB0
+#define PHB_PAGE_MIG_CTRL      0x0DA8
+#define PHB_PAGE_MIG_DEBUG     0x0DA0
+#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
+
+/* PHB_CONFIG_RW */
+#define PHB_TCE_ENABLE         0x20000000
+#define PHB_SLOT_DISABLE       0x1C000000
+#define PHB_DAC_DISABLE                0x01000000
+#define PHB_MEM2_ENABLE                0x00400000
+#define PHB_MCSR_ENABLE                0x00100000
+/* TAR (Table Address Register) */
+#define TAR_SW_BITS            0x0000ffffffff800fUL
+#define TAR_VALID              0x0000000000000008UL
+/* CSR (Channel/DMA Status Register) */
+#define CSR_AGENT_MASK         0xffe0ffff
+/* CCR (Calgary Configuration Register) */
+#define CCR_2SEC_TIMEOUT       0x000000000000000EUL
+/* PMCR/PMDR (Page Migration Control/Debug Registers) */
+#define PMR_SOFTSTOP           0x80000000
+#define PMR_SOFTSTOPFAULT      0x40000000
+#define PMR_HARDSTOP           0x20000000
+
+#define MAX_NUM_OF_PHBS                8 /* how many PHBs in total? */
+#define MAX_NUM_CHASSIS                8 /* max number of chassis */
+/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
+#define MAX_PHB_BUS_NUM                (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
+#define PHBS_PER_CALGARY       4
+
+/* register offsets in Calgary's internal register space */
+static const unsigned long tar_offsets[] = { /* indexed by PHB id */
+       0x0580 /* TAR0 */,
+       0x0588 /* TAR1 */,
+       0x0590 /* TAR2 */,
+       0x0598 /* TAR3 */
+};
+
+static const unsigned long split_queue_offsets[] = { /* indexed by PHB id */
+       0x4870 /* SPLIT QUEUE 0 */,
+       0x5870 /* SPLIT QUEUE 1 */,
+       0x6870 /* SPLIT QUEUE 2 */,
+       0x7870 /* SPLIT QUEUE 3 */
+};
+
+static const unsigned long phb_offsets[] = { /* indexed by PHB id */
+       0x8000 /* PHB0 */,
+       0x9000 /* PHB1 */,
+       0xA000 /* PHB2 */,
+       0xB000 /* PHB3 */
+};
+
+/* PHB debug registers */
+
+static const unsigned long phb_debug_offsets[] = {
+       0x4000  /* PHB 0 DEBUG */,
+       0x5000  /* PHB 1 DEBUG */,
+       0x6000  /* PHB 2 DEBUG */,
+       0x7000  /* PHB 3 DEBUG */
+};
+
+/*
+ * STUFF register for each debug PHB,
+ * byte 1 = start bus number, byte 2 = end bus number
+ */
+
+#define PHB_DEBUG_STUFF_OFFSET 0x0020
+
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
+unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
+static int translate_empty_slots __read_mostly = 0;
+static int calgary_detected __read_mostly = 0;
+
+static struct rio_table_hdr    *rio_table_hdr __initdata;
+static struct scal_detail      *scal_devs[MAX_NUMNODES] __initdata;
+static struct rio_detail       *rio_devs[MAX_NUMNODES * 4] __initdata;
+
+struct calgary_bus_info { /* per-bus state, indexed by PCI bus number */
+       void *tce_space; /* memory that becomes the table's it_base */
+       unsigned char translation_disabled; /* non-zero: bypass the IOMMU */
+       signed char phbid; /* index into the tar/split-queue/phb offset tables */
+       void __iomem *bbar; /* register base returned by busno_to_bbar() */
+};
+
+static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
+static void calgary_tce_cache_blast(struct iommu_table *tbl);
+static void calgary_dump_error_regs(struct iommu_table *tbl);
+static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
+static void calioc2_tce_cache_blast(struct iommu_table *tbl);
+static void calioc2_dump_error_regs(struct iommu_table *tbl);
+
+static struct cal_chipset_ops calgary_chip_ops = { /* used when is_calgary() */
+       .handle_quirks = calgary_handle_quirks,
+       .tce_cache_blast = calgary_tce_cache_blast,
+       .dump_error_regs = calgary_dump_error_regs
+};
+
+static struct cal_chipset_ops calioc2_chip_ops = { /* used when is_calioc2() */
+       .handle_quirks = calioc2_handle_quirks,
+       .tce_cache_blast = calioc2_tce_cache_blast,
+       .dump_error_regs = calioc2_dump_error_regs
+};
+
+static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
+
+/* enable this to stress test the chip's TCE cache */
+#ifdef CONFIG_IOMMU_DEBUG
+int debugging __read_mostly = 1;
+
+/*
+ * Debug check: confirm that every bit in [start, end) of bitmap has the
+ * value expected (0 or 1).  Returns the index of the first bit that
+ * differs, or ~0UL when the whole range matches.  An empty or inverted
+ * range is a BUG.
+ */
+static inline unsigned long verify_bit_range(unsigned long* bitmap,
+       int expected, unsigned long start, unsigned long end)
+{
+       unsigned long bit;
+
+       BUG_ON(start >= end);
+
+       for (bit = start; bit < end; bit++) {
+               if (!!test_bit(bit, bitmap) != expected)
+                       return bit;
+       }
+
+       /* no mismatch anywhere in the range */
+       return ~0UL;
+}
+#else /* debugging is disabled */
+int debugging __read_mostly = 0;
+
+/*
+ * Non-debug stub: the bitmap is not inspected and the range is always
+ * reported as matching (~0UL).
+ */
+static inline unsigned long verify_bit_range(unsigned long* bitmap,
+       int expected, unsigned long start, unsigned long end)
+{
+       const unsigned long no_bad_bit = ~0UL;
+
+       return no_bad_bit;
+}
+
+#endif /* CONFIG_IOMMU_DEBUG */
+
+/*
+ * Number of pages touched by the byte range [dma, dma + dmalen): the
+ * distance from the page containing dma to the page-aligned end of the
+ * range, expressed in pages.
+ */
+static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
+{
+       unsigned int span = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
+
+       return span >> PAGE_SHIFT;
+}
+
+/*
+ * Returns non-zero when DMA for dev goes through IOMMU translation, i.e.
+ * when translation_disabled is clear in the bus_info entry of its bus.
+ */
+static inline int translate_phb(struct pci_dev* dev)
+{
+       return !bus_info[dev->bus->number].translation_disabled;
+}
+
+/*
+ * Mark the IOMMU table entries that cover npages pages starting at
+ * start_addr as in use, so the allocator never hands them out.
+ *
+ * @tbl:        table whose allocation bitmap (it_map) is updated
+ * @start_addr: address of the first page to reserve
+ * @npages:     number of pages to reserve
+ *
+ * A range that starts beyond the table is ignored.  A range that runs
+ * past the end of the table is clipped to it_size.  Takes it_lock.
+ */
+static void iommu_range_reserve(struct iommu_table *tbl,
+       unsigned long start_addr, unsigned int npages)
+{
+       unsigned long index;
+       unsigned long end;
+       unsigned long badbit;
+       unsigned long flags;
+
+       index = start_addr >> PAGE_SHIFT;
+
+       /* bail out if we're asked to reserve a region we don't cover */
+       if (index >= tbl->it_size)
+               return;
+
+       end = index + npages;
+       if (end > tbl->it_size) /* don't go off the table */
+               end = tbl->it_size;
+
+       spin_lock_irqsave(&tbl->it_lock, flags);
+
+       /* with CONFIG_IOMMU_DEBUG, complain if part of it is already taken */
+       badbit = verify_bit_range(tbl->it_map, 0, index, end);
+       if (badbit != ~0UL) {
+               if (printk_ratelimit())
+                       printk(KERN_ERR "Calgary: entry already allocated at "
+                              "0x%lx tbl %p dma 0x%lx npages %u\n",
+                              badbit, tbl, start_addr, npages);
+       }
+
+       /*
+        * Set only the bits inside the table: using npages here would
+        * write past the end of it_map whenever the range was clipped.
+        */
+       set_bit_string(tbl->it_map, index, end - index);
+
+       spin_unlock_irqrestore(&tbl->it_lock, flags);
+}
+
+/*
+ * Find npages consecutive free entries in tbl, mark them used and return
+ * the index of the first one.
+ *
+ * The search starts at it_hint.  When it fails, the chip's TCE cache is
+ * blasted and the search is repeated from entry 0.  When that fails too,
+ * the function panics if panic_on_overflow is set and otherwise returns
+ * bad_dma_address.  npages must not be 0.
+ */
+static unsigned long iommu_range_alloc(struct iommu_table *tbl,
+       unsigned int npages)
+{
+       unsigned long first;
+       unsigned long flags;
+
+       BUG_ON(npages == 0);
+
+       spin_lock_irqsave(&tbl->it_lock, flags);
+
+       first = find_next_zero_string(tbl->it_map, tbl->it_hint,
+                                     tbl->it_size, npages);
+       if (first == ~0UL) {
+               /* second chance: flush the TCE cache, rescan from the start */
+               tbl->chip_ops->tce_cache_blast(tbl);
+               first = find_next_zero_string(tbl->it_map, 0,
+                                             tbl->it_size, npages);
+       }
+
+       if (first == ~0UL) {
+               printk(KERN_WARNING "Calgary: IOMMU full.\n");
+               spin_unlock_irqrestore(&tbl->it_lock, flags);
+               if (panic_on_overflow)
+                       panic("Calgary: fix the allocator.\n");
+               return bad_dma_address;
+       }
+
+       set_bit_string(tbl->it_map, first, npages);
+       tbl->it_hint = first + npages;
+       BUG_ON(tbl->it_hint > tbl->it_size);
+
+       spin_unlock_irqrestore(&tbl->it_lock, flags);
+
+       return first;
+}
+
+/*
+ * Allocate npages table entries and build TCEs that map the pages starting
+ * at the page containing vaddr.  Returns the DMA address matching vaddr
+ * (table entry plus the offset of vaddr inside its page), or
+ * bad_dma_address when no entries could be allocated.
+ */
+static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
+       unsigned int npages, int direction)
+{
+       unsigned long first = iommu_range_alloc(tbl, npages);
+       dma_addr_t handle;
+
+       if (unlikely(first == bad_dma_address)) {
+               printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
+                      "iommu %p\n", npages, tbl);
+               return bad_dma_address;
+       }
+
+       /* DMA address = table slot plus the offset within the page */
+       handle = (first << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
+
+       /* put the TCEs in the HW table */
+       tce_build(tbl, first, npages, (unsigned long)vaddr & PAGE_MASK,
+                 direction);
+
+       return handle;
+}
+
+/*
+ * Release the npages table entries that back dma_addr: tear down their
+ * TCEs and clear the matching bits in the allocation bitmap.
+ *
+ * An address inside the emergency window that starts at bad_dma_address
+ * is rejected with a warning.  Freeing beyond the end of the table is a
+ * BUG.
+ */
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+       unsigned int npages)
+{
+       unsigned long emergency_end;
+       unsigned long first;
+       unsigned long mismatch;
+       unsigned long flags;
+
+       /* were we called with bad_dma_address? */
+       emergency_end = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
+       if (unlikely((dma_addr >= bad_dma_address) &&
+                    (dma_addr < emergency_end))) {
+               printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
+                      "address 0x%Lx\n", dma_addr);
+               WARN_ON(1);
+               return;
+       }
+
+       first = dma_addr >> PAGE_SHIFT;
+
+       BUG_ON(first + npages > tbl->it_size);
+
+       tce_free(tbl, first, npages);
+
+       spin_lock_irqsave(&tbl->it_lock, flags);
+
+       /* with CONFIG_IOMMU_DEBUG, complain about bits that were not set */
+       mismatch = verify_bit_range(tbl->it_map, 1, first, first + npages);
+       if (mismatch != ~0UL && printk_ratelimit())
+               printk(KERN_ERR "Calgary: bit is off at 0x%lx "
+                      "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
+                      mismatch, tbl, dma_addr, first, npages);
+
+       __clear_bit_string(tbl->it_map, first, npages);
+
+       spin_unlock_irqrestore(&tbl->it_lock, flags);
+}
+
+/*
+ * Return the IOMMU table attached to the root bus above dev (which must be
+ * a PCI device), or whatever pci_iommu() yields for that bus.  A table
+ * whose it_busno does not match the root bus number is a BUG.
+ */
+static inline struct iommu_table *find_iommu_table(struct device *dev)
+{
+       struct pci_bus *root;
+       struct iommu_table *tbl;
+
+       /* climb through any bridges until the root bus is reached */
+       for (root = to_pci_dev(dev)->bus; root->parent; root = root->parent)
+               ;
+
+       tbl = pci_iommu(root);
+
+       BUG_ON(tbl && (tbl->it_busno != root->number));
+
+       return tbl;
+}
+
+/*
+ * Undo calgary_map_sg(): free the table entries behind each scatterlist
+ * element.  Stops at the first element whose dma_length is 0.  Does
+ * nothing for devices on a bus with translation disabled.
+ */
+static void calgary_unmap_sg(struct device *dev,
+       struct scatterlist *sglist, int nelems, int direction)
+{
+       struct iommu_table *tbl = find_iommu_table(dev);
+       struct scatterlist *s;
+       int left;
+
+       if (!translate_phb(to_pci_dev(dev)))
+               return;
+
+       for (s = sglist, left = nelems; left != 0; left--, s++) {
+               dma_addr_t dma = s->dma_address;
+               unsigned int dmalen = s->dma_length;
+
+               /* a zero length marks the end of the mapped elements */
+               if (dmalen == 0)
+                       break;
+
+               iommu_free(tbl, dma, num_dma_pages(dma, dmalen));
+       }
+}
+
+/*
+ * map_sg for buses without translation: each element's DMA address is the
+ * bus address of its buffer and its DMA length equals its length.
+ * Returns nelems.
+ */
+static int calgary_nontranslate_map_sg(struct device* dev,
+       struct scatterlist *sg, int nelems, int direction)
+{
+       int idx = 0;
+
+       while (idx < nelems) {
+               struct scatterlist *s = &sg[idx];
+
+               BUG_ON(!s->page);
+               s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
+               s->dma_length = s->length;
+               idx++;
+       }
+
+       return nelems;
+}
+
+/*
+ * Map a scatterlist for DMA.  For every element, table entries are
+ * allocated, TCEs are built and dma_address/dma_length are filled in.
+ *
+ * Returns nelems on success.  When an allocation fails, everything mapped
+ * so far is unmapped, every element is set to bad_dma_address with length
+ * 0, and 0 is returned.  Buses with translation disabled are handed to
+ * calgary_nontranslate_map_sg().
+ */
+static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+       int nelems, int direction)
+{
+       struct iommu_table *tbl = find_iommu_table(dev);
+       int idx;
+
+       if (!translate_phb(to_pci_dev(dev)))
+               return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
+
+       for (idx = 0; idx < nelems; idx++) {
+               struct scatterlist *s = &sg[idx];
+               unsigned long vaddr;
+               unsigned long entry;
+               unsigned int npages;
+
+               BUG_ON(!s->page);
+
+               vaddr = (unsigned long)page_address(s->page) + s->offset;
+               npages = num_dma_pages(vaddr, s->length);
+
+               entry = iommu_range_alloc(tbl, npages);
+               if (entry == bad_dma_address) {
+                       /* makes sure unmap knows to stop */
+                       s->dma_length = 0;
+
+                       /* roll back, then poison the whole list */
+                       calgary_unmap_sg(dev, sg, nelems, direction);
+                       for (idx = 0; idx < nelems; idx++) {
+                               sg[idx].dma_address = bad_dma_address;
+                               sg[idx].dma_length = 0;
+                       }
+                       return 0;
+               }
+
+               s->dma_address = (entry << PAGE_SHIFT) | s->offset;
+
+               /* insert into HW table */
+               tce_build(tbl, entry, npages, vaddr & PAGE_MASK,
+                         direction);
+
+               s->dma_length = s->length;
+       }
+
+       return nelems;
+}
+
+/*
+ * Map the buffer [vaddr, vaddr + size) for DMA.  On a translated bus the
+ * result comes from iommu_alloc() (bad_dma_address on failure); otherwise
+ * it is simply the bus address of vaddr.
+ */
+static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
+       size_t size, int direction)
+{
+       struct iommu_table *tbl = find_iommu_table(dev);
+       unsigned int npages = num_dma_pages((unsigned long)vaddr, size);
+
+       if (!translate_phb(to_pci_dev(dev)))
+               return virt_to_bus(vaddr);
+
+       return iommu_alloc(tbl, vaddr, npages, direction);
+}
+
+/*
+ * Undo calgary_map_single(): free the table entries covering
+ * [dma_handle, dma_handle + size).  Nothing to do on a bus with
+ * translation disabled.
+ */
+static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
+       size_t size, int direction)
+{
+       struct iommu_table *tbl = find_iommu_table(dev);
+
+       if (translate_phb(to_pci_dev(dev)))
+               iommu_free(tbl, dma_handle, num_dma_pages(dma_handle, size));
+}
+
+/*
+ * Allocate a zeroed, page-aligned buffer of at least size bytes and make
+ * it reachable by the device.  The DMA address is stored in *dma_handle.
+ *
+ * Returns the kernel virtual address of the buffer, or NULL when either
+ * the page allocation or the IOMMU mapping fails (the pages are freed
+ * again in the latter case).
+ */
+static void* calgary_alloc_coherent(struct device *dev, size_t size,
+       dma_addr_t *dma_handle, gfp_t flag)
+{
+       struct iommu_table *tbl = find_iommu_table(dev);
+       unsigned int npages, order;
+       dma_addr_t mapping;
+       void *buf;
+
+       size = PAGE_ALIGN(size); /* size rounded up to full pages */
+       npages = size >> PAGE_SHIFT;
+       order = get_order(size);
+
+       /* alloc enough pages (and possibly more) */
+       buf = (void *)__get_free_pages(flag, order);
+       if (!buf)
+               return NULL;
+       memset(buf, 0, size);
+
+       /* non translated slot: the bus address is used directly */
+       if (!translate_phb(to_pci_dev(dev))) {
+               *dma_handle = virt_to_bus(buf);
+               return buf;
+       }
+
+       /* set up tces to cover the allocated range */
+       mapping = iommu_alloc(tbl, buf, npages, DMA_BIDIRECTIONAL);
+       if (mapping == bad_dma_address) {
+               free_pages((unsigned long)buf, get_order(size));
+               return NULL;
+       }
+
+       *dma_handle = mapping;
+       return buf;
+}
+
+static const struct dma_mapping_ops calgary_dma_ops = { /* Calgary DMA hooks */
+       .alloc_coherent = calgary_alloc_coherent, /* zeroed pages + mapping */
+       .map_single = calgary_map_single,
+       .unmap_single = calgary_unmap_single,
+       .map_sg = calgary_map_sg,
+       .unmap_sg = calgary_unmap_sg, /* stops at first zero dma_length */
+};
+
+/* Register base (bbar) recorded in bus_info for PCI bus number num. */
+static inline void __iomem * busno_to_bbar(unsigned char num)
+{
+       const struct calgary_bus_info *info = &bus_info[num];
+
+       return info->bbar;
+}
+
+/* PHB id recorded in bus_info for PCI bus number num. */
+static inline int busno_to_phbid(unsigned char num)
+{
+       const struct calgary_bus_info *info = &bus_info[num];
+
+       return info->phbid;
+}
+
+/* Split queue register offset for the PHB that serves bus number num. */
+static inline unsigned long split_queue_offset(unsigned char num)
+{
+       return split_queue_offsets[(size_t)busno_to_phbid(num)];
+}
+
+/* TAR register offset for the PHB that serves bus number num. */
+static inline unsigned long tar_offset(unsigned char num)
+{
+       return tar_offsets[(size_t)busno_to_phbid(num)];
+}
+
+/* Base offset of the register block of the PHB serving bus number num. */
+static inline unsigned long phb_offset(unsigned char num)
+{
+       return phb_offsets[(size_t)busno_to_phbid(num)];
+}
+
+/*
+ * Address of the register at offset inside the space that starts at bar.
+ * The offset is OR-ed (not added) into the base address.
+ */
+static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
+{
+       return (void __iomem*)(((unsigned long)bar) | offset);
+}
+
+/* Non-zero when device is the CalIOC2 PCI device id. */
+static inline int is_calioc2(unsigned short device)
+{
+       return device == PCI_DEVICE_ID_IBM_CALIOC2 ? 1 : 0;
+}
+
+/* Non-zero when device is the Calgary PCI device id. */
+static inline int is_calgary(unsigned short device)
+{
+       return device == PCI_DEVICE_ID_IBM_CALGARY ? 1 : 0;
+}
+
+/* Non-zero when device is either a Calgary or a CalIOC2 device id. */
+static inline int is_cal_pci_dev(unsigned short device)
+{
+       if (is_calgary(device))
+               return 1;
+
+       return is_calioc2(device);
+}
+
+/*
+ * Flush the TCE cache of the Calgary PHB behind tbl.
+ *
+ * Sequence: save and clear the AER register (stops arbitration), read
+ * PLSSR so the write is known to have landed, poll the split queue up to
+ * 100 times waiting for its low byte to read 0xff, rewrite the TAR with
+ * tar_val, then restore the saved AER value.
+ */
+static void calgary_tce_cache_blast(struct iommu_table *tbl)
+{
+       void __iomem *bbar = tbl->bbar;
+       unsigned long phboff = phb_offset(tbl->it_busno);
+       void __iomem *aer_reg = calgary_reg(bbar, phboff | PHB_AER_OFFSET);
+       void __iomem *reg;
+       u64 queue;
+       u32 saved_aer;
+       int polls;
+
+       /* disable arbitration on the bus */
+       saved_aer = readl(aer_reg);
+       writel(0, aer_reg);
+
+       /* read plssr to ensure it got there */
+       (void)readl(calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET));
+
+       /* poll split queues until all DMA activity is done */
+       reg = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
+       for (polls = 1; ; polls++) {
+               queue = readq(reg);
+               if ((queue & 0xff) == 0xff || polls >= 100)
+                       break;
+       }
+       if (polls == 100)
+               printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
+                      "continuing anyway\n");
+
+       /* invalidate TCE cache */
+       reg = calgary_reg(bbar, tar_offset(tbl->it_busno));
+       writeq(tbl->tar_val, reg);
+
+       /* enable arbitration */
+       writel(saved_aer, aer_reg);
+       (void)readl(aer_reg); /* flush */
+}
+
+/*
+ * Flush the TCE cache of the CalIOC2 PHB behind tbl.
+ *
+ * The numbered steps below set SoftStop in the Page Migration Control
+ * register, wait for the split queue to drain, and restart from step 1
+ * whenever the Page Migration Debug register reports SoftStopFault.
+ * After too many faults the flush is abandoned.  Otherwise the TAR is
+ * rewritten with tar_val and the control register is cleared again.
+ *
+ * Every attempt gets its own budget of 100 split queue polls.
+ */
+static void calioc2_tce_cache_blast(struct iommu_table *tbl)
+{
+       void __iomem *bbar = tbl->bbar;
+       void __iomem *target;
+       u64 val64;
+       u32 val;
+       int i;
+       int count = 1;
+       unsigned char bus = tbl->it_busno;
+
+begin:
+       /*
+        * Restart the poll counter on every attempt.  Carrying it over from
+        * a previous pass would shorten (or skip) the polling below and
+        * suppress the "not quiesced" warning.
+        */
+       i = 0;
+
+       printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
+              "sequence - count %d\n", bus, count);
+
+       /* 1. using the Page Migration Control reg set SoftStop */
+       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+       val = be32_to_cpu(readl(target));
+       printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
+       val |= PMR_SOFTSTOP;
+       printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
+       writel(cpu_to_be32(val), target);
+
+       /* 2. poll split queues until all DMA activity is done */
+       printk(KERN_DEBUG "2a. starting to poll split queues\n");
+       target = calgary_reg(bbar, split_queue_offset(bus));
+       do {
+               val64 = readq(target);
+               i++;
+       } while ((val64 & 0xff) != 0xff && i < 100);
+       if (i == 100)
+               printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
+                      "continuing anyway\n");
+
+       /* 3. poll Page Migration DEBUG for SoftStopFault */
+       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
+       val = be32_to_cpu(readl(target));
+       printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
+
+       /* 4. if SoftStopFault - goto (1) */
+       if (val & PMR_SOFTSTOPFAULT) {
+               if (++count < 100)
+                       goto begin;
+               else {
+                       printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
+                              "aborting TCE cache flush sequence!\n");
+                       return; /* pray for the best */
+               }
+       }
+
+       /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
+       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+       printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
+       val = be32_to_cpu(readl(target));
+       printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
+       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
+       val = be32_to_cpu(readl(target));
+       printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
+
+       /* 6. invalidate TCE cache */
+       printk(KERN_DEBUG "6. invalidating TCE cache\n");
+       target = calgary_reg(bbar, tar_offset(bus));
+       writeq(tbl->tar_val, target);
+
+       /* 7. Re-read PMCR */
+       printk(KERN_DEBUG "7a. Re-reading PMCR\n");
+       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+       val = be32_to_cpu(readl(target));
+       printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
+
+       /* 8. Remove HardStop */
+       printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
+       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+       val = 0;
+       printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
+       writel(cpu_to_be32(val), target);
+       val = be32_to_cpu(readl(target));
+       printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
+}
+
+/*
+ * Reserve the IOMMU entries from start up to limit in the table of dev's
+ * bus.  limit is first rounded up to the next 1MB boundary (its low 20
+ * bits are set, then 1 is added).
+ */
+static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
+       u64 limit)
+{
+       u64 end = (limit | 0xfffff) + 1;
+       unsigned int numpages = ((end - start) >> PAGE_SHIFT);
+
+       iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
+}
+
+/*
+ * Read the MEM_1 low/high/size registers of the PHB serving dev's bus and
+ * reserve the region they describe in the IOMMU table.
+ */
+static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
+{
+       struct iommu_table *tbl = pci_iommu(dev->bus);
+       void __iomem *bbar = tbl->bbar;
+       unsigned long phboff = phb_offset(dev->bus->number);
+       void __iomem *reg;
+       u64 low, high, sizelow;
+
+       /* peripheral MEM_1 region */
+       reg = calgary_reg(bbar, phboff | PHB_MEM_1_LOW);
+       low = be32_to_cpu(readl(reg));
+       reg = calgary_reg(bbar, phboff | PHB_MEM_1_HIGH);
+       high = be32_to_cpu(readl(reg));
+       reg = calgary_reg(bbar, phboff | PHB_MEM_1_SIZE);
+       sizelow = be32_to_cpu(readl(reg));
+
+       calgary_reserve_mem_region(dev, (high << 32) | low, sizelow);
+}
+
+/*
+ * Same as the MEM_1 variant, but for the MEM_2 window, which is only
+ * reserved when PHB_MEM2_ENABLE is set in the PHB's CONFIG_RW register.
+ */
+static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
+{
+       struct iommu_table *tbl = pci_iommu(dev->bus);
+       void __iomem *bbar = tbl->bbar;
+       unsigned long phboff = phb_offset(dev->bus->number);
+       void __iomem *reg;
+       u64 low, high, sizelow, sizehigh;
+       u32 config;
+
+       /* is it enabled? */
+       reg = calgary_reg(bbar, phboff | PHB_CONFIG_RW_OFFSET);
+       config = be32_to_cpu(readl(reg));
+       if ((config & PHB_MEM2_ENABLE) == 0)
+               return;
+
+       reg = calgary_reg(bbar, phboff | PHB_MEM_2_LOW);
+       low = be32_to_cpu(readl(reg));
+       reg = calgary_reg(bbar, phboff | PHB_MEM_2_HIGH);
+       high = be32_to_cpu(readl(reg));
+       reg = calgary_reg(bbar, phboff | PHB_MEM_2_SIZE_LOW);
+       sizelow = be32_to_cpu(readl(reg));
+       reg = calgary_reg(bbar, phboff | PHB_MEM_2_SIZE_HIGH);
+       sizehigh = be32_to_cpu(readl(reg));
+
+       calgary_reserve_mem_region(dev, (high << 32) | low,
+                                  (sizehigh << 32) | sizelow);
+}
+
+/*
+ * Some regions of the IO address space are not translated, so devices
+ * must never be handed IO addresses inside them: the 640KB-1MB hole
+ * (the whole first MB on CalIOC2) and the two PCI peripheral memory
+ * regions.  Mark them, together with the EMERGENCY_PAGES that start at
+ * bad_dma_address, as reserved in the IOMMU bitmap.
+ */
+static void __init calgary_reserve_regions(struct pci_dev *dev)
+{
+       struct iommu_table *tbl = pci_iommu(dev->bus);
+       /* default (CalIOC2): avoid the entire first MB */
+       unsigned int npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
+       u64 start = 0;
+
+       /* reserve EMERGENCY_PAGES from bad_dma_address and up */
+       iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
+
+       /* Calgary only needs the BIOS/VGA 640KB-1MB region kept free */
+       if (is_calgary(dev->device)) {
+               start = (640 * 1024);
+               npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
+       }
+       iommu_range_reserve(tbl, start, npages);
+
+       /* reserve the two PCI peripheral memory regions in IO space */
+       calgary_reserve_peripheral_mem_1(dev);
+       calgary_reserve_peripheral_mem_2(dev);
+}
+
+/*
+ * Build the TCE table for dev's bus and program the PHB's Table Address
+ * Register (TAR) to point at it.
+ *
+ * The table is cleared, the chip-specific ops are selected from the
+ * device id (any other id is a BUG), untranslated regions are reserved,
+ * and the TAR's software-controlled bits are replaced by the table's
+ * physical address and specified_table_size.  The value written is kept
+ * in tbl->tar_val.
+ *
+ * Returns 0 on success or the error from build_tce_table().
+ */
+static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
+{
+       struct iommu_table *tbl;
+       void __iomem *tar_reg;
+       u64 tar;
+       int ret;
+
+       /* build TCE tables for each PHB */
+       ret = build_tce_table(dev, bbar);
+       if (ret)
+               return ret;
+
+       tbl = pci_iommu(dev->bus);
+       tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
+       tce_free(tbl, 0, tbl->it_size);
+
+       if (is_calgary(dev->device))
+               tbl->chip_ops = &calgary_chip_ops;
+       else if (is_calioc2(dev->device))
+               tbl->chip_ops = &calioc2_chip_ops;
+       else
+               BUG();
+
+       calgary_reserve_regions(dev);
+
+       /* set TARs for each PHB */
+       tar_reg = calgary_reg(bbar, tar_offset(dev->bus->number));
+       tar = be64_to_cpu(readq(tar_reg));
+
+       /* zero out all TAR bits under sw control, then add the table */
+       tar &= ~TAR_SW_BITS;
+       tar |= (u64)__pa(tbl->it_base);
+
+       BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
+       tar |= (u64) specified_table_size;
+
+       tbl->tar_val = cpu_to_be64(tar);
+
+       writeq(tbl->tar_val, tar_reg);
+       readq(tar_reg); /* flush */
+
+       return 0;
+}
+
+/*
+ * Tear down the IOMMU state of dev's bus: clear the software-controlled
+ * TAR bits, free the allocation bitmap and the table structure, detach
+ * the table from the bus and forget the TCE space pointer.
+ */
+static void __init calgary_free_bus(struct pci_dev *dev)
+{
+       struct iommu_table *tbl = pci_iommu(dev->bus);
+       unsigned int bitmapsz = tbl->it_size / BITS_PER_BYTE;
+       void __iomem *tar_reg;
+       u64 tar;
+
+       tar_reg = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
+       tar = be64_to_cpu(readq(tar_reg));
+       tar &= ~TAR_SW_BITS;
+       writeq(cpu_to_be64(tar), tar_reg);
+       readq(tar_reg); /* flush */
+
+       free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
+       tbl->it_map = NULL;
+
+       kfree(tbl);
+
+       set_pci_iommu(dev->bus, NULL);
+
+       /* Can't free bootmem allocated memory after system is up :-( */
+       bus_info[dev->bus->number].tce_space = NULL;
+}
+
+/*
+ * Log the CSR and PLSSR registers of the Calgary PHB behind tbl at
+ * KERN_EMERG level.
+ */
+static void calgary_dump_error_regs(struct iommu_table *tbl)
+{
+       void __iomem *bbar = tbl->bbar;
+       unsigned long phboff = phb_offset(tbl->it_busno);
+       u32 csr, plssr;
+
+       csr = be32_to_cpu(readl(calgary_reg(bbar, phboff | PHB_CSR_OFFSET)));
+       plssr = be32_to_cpu(readl(calgary_reg(bbar,
+                                             phboff | PHB_PLSSR_OFFSET)));
+
+       /* If no error, the agent ID in the CSR is not valid */
+       printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
+              "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
+}
+
+/*
+ * Log the error state of the CalIOC2 PHB behind tbl at KERN_EMERG level:
+ * CSR, PLSSR, CSMR (offset 0x290), MCK (offset 0x800), the seven error
+ * registers at 0x810 - 0x870 and the root complex status register.
+ */
+static void calioc2_dump_error_regs(struct iommu_table *tbl)
+{
+       void __iomem *bbar = tbl->bbar;
+       void __iomem *reg;
+       unsigned long phboff = phb_offset(tbl->it_busno);
+       unsigned long erroff;
+       u32 csr, csmr, plssr, mck, rcstat;
+       u32 errregs[7];
+       int idx;
+
+       /* dump CSR */
+       reg = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
+       csr = be32_to_cpu(readl(reg));
+       /* dump PLSSR */
+       reg = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
+       plssr = be32_to_cpu(readl(reg));
+       /* dump CSMR */
+       reg = calgary_reg(bbar, phboff | 0x290);
+       csmr = be32_to_cpu(readl(reg));
+       /* dump mck */
+       reg = calgary_reg(bbar, phboff | 0x800);
+       mck = be32_to_cpu(readl(reg));
+
+       printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
+              tbl->it_busno);
+
+       printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
+              csr, plssr, csmr, mck);
+
+       /* dump rest of error regs: one every 0x10 bytes from 0x810 */
+       printk(KERN_EMERG "Calgary: ");
+       idx = 0;
+       while (idx < ARRAY_SIZE(errregs)) {
+               erroff = (0x810 + (idx * 0x10));
+               reg = calgary_reg(bbar, phboff | erroff);
+               errregs[idx] = be32_to_cpu(readl(reg));
+               printk("0x%08x@0x%lx ", errregs[idx], erroff);
+               idx++;
+       }
+       printk("\n");
+
+       /* root complex status */
+       reg = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
+       rcstat = be32_to_cpu(readl(reg));
+       printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
+              PHB_ROOT_COMPLEX_STATUS);
+}
+
+/*
+ * Timer callback; data is the struct pci_dev of the watched PHB.
+ *
+ * Reads the PHB's CSR.  When none of the CSR_AGENT_MASK bits are set, the
+ * timer is simply re-armed for two seconds later.  Otherwise the chip's
+ * error registers are dumped, the CSR is cleared and PHB_SLOT_DISABLE is
+ * set in the PHB's CONFIG_RW register; the timer is not re-armed.
+ */
+static void calgary_watchdog(unsigned long data)
+{
+       struct pci_dev *dev = (struct pci_dev *)data;
+       struct iommu_table *tbl = pci_iommu(dev->bus);
+       void __iomem *bbar = tbl->bbar;
+       void __iomem *reg;
+       u32 val32;
+
+       reg = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
+       val32 = be32_to_cpu(readl(reg));
+
+       /* If no error, the agent ID in the CSR is not valid */
+       if (!(val32 & CSR_AGENT_MASK)) {
+               /* Reset the timer */
+               mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
+               return;
+       }
+
+       tbl->chip_ops->dump_error_regs(tbl);
+
+       /* reset error */
+       writel(0, reg);
+
+       /* Disable bus that caused the error */
+       reg = calgary_reg(bbar, phb_offset(tbl->it_busno) |
+                         PHB_CONFIG_RW_OFFSET);
+       val32 = be32_to_cpu(readl(reg));
+       val32 |= PHB_SLOT_DISABLE;
+       writel(cpu_to_be32(val32), reg);
+       readl(reg); /* flush */
+}
+
+/*
+ * Program the split completion timeout of the PHB that serves @busnum.
+ *
+ * @bbar:    mapped register space of the chip
+ * @busnum:  PCI bus number; translated to a PHB id with busno_to_phbid()
+ * @timeout: value placed in the PHB's timer field; it is OR-ed in unmasked,
+ *           so it is expected to fit in 4 bits
+ *
+ * Each of PHB 0-3 owns a 4-bit field in the 64-bit CALGARY_CONFIG_REG; that
+ * field is cleared and replaced with @timeout, then the write is flushed
+ * with a read-back.
+ */
+static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
+       unsigned char busnum, unsigned long timeout)
+{
+       u64 val64;
+       void __iomem *target;
+       unsigned int phb_shift = ~0; /* silence gcc */
+       u64 mask;
+
+       /* pick the bit position of this PHB's timer field */
+       switch (busno_to_phbid(busnum)) {
+       case 0: phb_shift = (63 - 19);
+               break;
+       case 1: phb_shift = (63 - 23);
+               break;
+       case 2: phb_shift = (63 - 27);
+               break;
+       case 3: phb_shift = (63 - 35);
+               break;
+       default:
+               /* only reached for a PHB id outside 0-3, i.e. always fires */
+               BUG_ON(busno_to_phbid(busnum));
+       }
+
+       /* the register is big-endian; work on it in CPU byte order */
+       target = calgary_reg(bbar, CALGARY_CONFIG_REG);
+       val64 = be64_to_cpu(readq(target));
+
+       /* zero out this PHB's timer bits */
+       mask = ~(0xFUL << phb_shift);
+       val64 &= mask;
+       val64 |= (timeout << phb_shift);
+       writeq(cpu_to_be64(val64), target);
+       readq(target); /* flush */
+}
+
+/*
+ * Apply the CalIOC2-specific hardware workaround for the PHB that @dev
+ * sits on.
+ *
+ * @tbl: IOMMU table of the PHB; supplies the mapped register base (bbar)
+ * @dev: PCI device representing the PHB; its bus number selects the PHB
+ *       register window
+ *
+ * Performs a read-modify-write of the PHB_SAVIOR_L2 register.
+ */
+static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
+{
+       unsigned char busnum = dev->bus->number;
+       void __iomem *bbar = tbl->bbar;
+       void __iomem *target;
+       u32 val;
+
+       /*
+        * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
+        */
+       target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
+       /*
+        * The register is big-endian: convert what we read to CPU order
+        * (be32_to_cpu), modify it, and convert back (cpu_to_be32) on write,
+        * like every other register access in this file.
+        */
+       val = be32_to_cpu(readl(target));
+       val |= 0x00800000;
+       writel(cpu_to_be32(val), target);
+}
+
+/*
+ * Apply Calgary-specific workarounds for the PHB that @dev sits on.
+ *
+ * @tbl: IOMMU table of the PHB; supplies the mapped register base (bbar)
+ * @dev: PCI device representing the PHB
+ *
+ * Currently the only quirk is a longer split completion timeout on bus 1.
+ */
+static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
+{
+       unsigned char busnum = dev->bus->number;
+
+       /*
+        * Give split completion a longer timeout on bus 1 for aic94xx
+        * http://bugzilla.kernel.org/show_bug.cgi?id=7180
+        */
+       if (is_calgary(dev->device) && (busnum == 1))
+               calgary_set_split_completion_timeout(tbl->bbar, busnum,
+                                                    CCR_2SEC_TIMEOUT);
+}
+
+/*
+ * Turn on TCE translation for the PHB behind @dev and start its watchdog.
+ *
+ * Sets PHB_TCE_ENABLE, PHB_DAC_DISABLE and PHB_MCSR_ENABLE in the PHB config
+ * register, then arms tbl->watchdog_timer with calgary_watchdog() as the
+ * callback and @dev as its argument.
+ */
+static void __init calgary_enable_translation(struct pci_dev *dev)
+{
+       u32 val32;
+       unsigned char busnum;
+       void __iomem *target;
+       void __iomem *bbar;
+       struct iommu_table *tbl;
+
+       busnum = dev->bus->number;
+       tbl = pci_iommu(dev->bus);
+       bbar = tbl->bbar;
+
+       /* enable TCE in PHB Config Register */
+       target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+       val32 = be32_to_cpu(readl(target));
+       val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
+
+       printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
+              (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
+              "Calgary" : "CalIOC2", busnum);
+       printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
+              "bus.\n");
+
+       /* commit the new config; the read-back flushes the posted write */
+       writel(cpu_to_be32(val32), target);
+       readl(target); /* flush */
+
+       /* expiry of "jiffies" makes the first watchdog run happen right away */
+       init_timer(&tbl->watchdog_timer);
+       tbl->watchdog_timer.function = &calgary_watchdog;
+       tbl->watchdog_timer.data = (unsigned long)dev;
+       mod_timer(&tbl->watchdog_timer, jiffies);
+}
+
+/*
+ * Undo calgary_enable_translation() for the PHB behind @dev: clear
+ * PHB_TCE_ENABLE, PHB_DAC_DISABLE and PHB_MCSR_ENABLE in the PHB config
+ * register and stop the watchdog timer.
+ */
+static void __init calgary_disable_translation(struct pci_dev *dev)
+{
+       u32 val32;
+       unsigned char busnum;
+       void __iomem *target;
+       void __iomem *bbar;
+       struct iommu_table *tbl;
+
+       busnum = dev->bus->number;
+       tbl = pci_iommu(dev->bus);
+       bbar = tbl->bbar;
+
+       /* disable TCE in PHB Config Register */
+       target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+       val32 = be32_to_cpu(readl(target));
+       val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
+
+       printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
+       writel(cpu_to_be32(val32), target);
+       readl(target); /* flush */
+
+       /* wait for a possibly running watchdog callback before returning */
+       del_timer_sync(&tbl->watchdog_timer);
+}
+
+/*
+ * Set up a PHB that will not use translation: take a reference on @dev,
+ * record that its bus has no IOMMU table (NULL) and register @dev as the
+ * "self" device of the bus (or of the parent bus when there is one).
+ */
+static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
+{
+       /* reference is dropped again on calgary_init()'s error path */
+       pci_dev_get(dev);
+       set_pci_iommu(dev->bus, NULL);
+
+       /* is the device behind a bridge? */
+       if (dev->bus->parent)
+               dev->bus->parent->self = dev;
+       else
+               dev->bus->self = dev;
+}
+
+/*
+ * Bring up translation for a single PHB.
+ *
+ * Sets up the TAR for @dev, takes a reference on it, registers it as the
+ * "self" device of its bus (or parent bus), runs the chip-specific quirks
+ * and finally enables translation.
+ *
+ * Returns 0 on success or the error reported by calgary_setup_tar(); in the
+ * error case no reference on @dev has been taken.
+ */
+static int __init calgary_init_one(struct pci_dev *dev)
+{
+       struct pci_bus *bus = dev->bus;
+       struct pci_bus *parent;
+       struct iommu_table *tbl;
+       int err;
+
+       BUG_ON(bus->number >= MAX_PHB_BUS_NUM);
+
+       err = calgary_setup_tar(dev, busno_to_bbar(bus->number));
+       if (err)
+               return err;
+
+       pci_dev_get(dev);
+
+       parent = bus->parent;
+       if (!parent) {
+               bus->self = dev;
+       } else {
+               /* somebody already claimed the parent - complain, then take over */
+               if (parent->self)
+                       printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
+                              "bus->parent->self!\n", dev);
+               parent->self = dev;
+       }
+
+       tbl = pci_iommu(bus);
+       tbl->chip_ops->handle_quirks(tbl, dev);
+
+       calgary_enable_translation(dev);
+
+       return 0;
+}
+
+/*
+ * Map the register space (BBAR) of every Calgary-type Rio device and record,
+ * for each PCI bus number the chip reports, which mapping and which PHB
+ * serve that bus (bus_info[bus].bbar / .phbid).
+ *
+ * Returns 0 on success.  If an ioremap fails, all mappings created so far
+ * are torn down, the bus_info[].bbar pointers that referenced them are
+ * cleared, and -ENODATA is returned.
+ */
+static int __init calgary_locate_bbars(void)
+{
+       int ret;
+       int rioidx, phb, bus, other;
+       void __iomem *bbar;
+       void __iomem *target;
+       unsigned long offset;
+       u8 start_bus, end_bus;
+       u32 val;
+
+       ret = -ENODATA;
+       for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
+               struct rio_detail *rio = rio_devs[rioidx];
+
+               if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
+                       continue;
+
+               /* map entire 1MB of Calgary config space */
+               bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
+               if (!bbar)
+                       goto error;
+
+               for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
+                       offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
+                       target = calgary_reg(bbar, offset);
+
+                       val = be32_to_cpu(readl(target));
+
+                       /* bits 23-16: first bus, bits 15-8: last bus */
+                       start_bus = (u8)((val & 0x00FF0000) >> 16);
+                       end_bus = (u8)((val & 0x0000FF00) >> 8);
+
+                       if (end_bus) {
+                               for (bus = start_bus; bus <= end_bus; bus++) {
+                                       bus_info[bus].bbar = bbar;
+                                       bus_info[bus].phbid = phb;
+                               }
+                       } else {
+                               /* no end bus reported: only start_bus is served */
+                               bus_info[start_bus].bbar = bbar;
+                               bus_info[start_bus].phbid = phb;
+                       }
+               }
+       }
+
+       return 0;
+
+error:
+       /*
+        * Scan bus_info and iounmap any bbars we previously ioremap'd.
+        * One mapping is shared by every bus of a chip, so unmap each
+        * distinct mapping exactly once and clear all entries that point
+        * at it; otherwise the same address would be iounmap'd repeatedly
+        * and stale pointers would be left behind.
+        */
+       for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) {
+               bbar = bus_info[bus].bbar;
+               if (!bbar)
+                       continue;
+
+               iounmap(bbar);
+               for (other = bus; other < ARRAY_SIZE(bus_info); other++)
+                       if (bus_info[other].bbar == bbar)
+                               bus_info[other].bbar = NULL;
+       }
+
+       return ret;
+}
+
+/*
+ * Locate the chip register spaces and initialize every Calgary/CalIOC2 PHB
+ * found on the PCI bus.
+ *
+ * PHBs with translation disabled get the non-translated setup; PHBs without
+ * TCE space are skipped unless translate_empty_slots is set.  If setting up
+ * one PHB fails, the PHBs handled before it are walked in reverse and torn
+ * down again.
+ *
+ * Returns 0 on success, or the error from calgary_locate_bbars() or
+ * calgary_init_one().
+ */
+static int __init calgary_init(void)
+{
+       int ret;
+       struct pci_dev *dev = NULL;
+       void *tce_space;
+
+       ret = calgary_locate_bbars();
+       if (ret)
+               return ret;
+
+       /* forward pass: set up every matching IBM device */
+       do {
+               dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
+               if (!dev)
+                       break;
+               if (!is_cal_pci_dev(dev->device))
+                       continue;
+               if (!translate_phb(dev)) {
+                       calgary_init_one_nontraslated(dev);
+                       continue;
+               }
+               tce_space = bus_info[dev->bus->number].tce_space;
+               if (!tce_space && !translate_empty_slots)
+                       continue;
+
+               ret = calgary_init_one(dev);
+               if (ret)
+                       goto error;
+       } while (1);
+
+       return ret;
+
+error:
+       /*
+        * Backward pass, starting from the device that failed; the same
+        * filters as above are applied so only devices that were actually
+        * set up get torn down.
+        * NOTE(review): relies on pci_get_device_reverse() returning the
+        * device before @dev, so the failing device itself is not visited.
+        */
+       do {
+               dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
+                                            PCI_ANY_ID, dev);
+               if (!dev)
+                       break;
+               if (!is_cal_pci_dev(dev->device))
+                       continue;
+               if (!translate_phb(dev)) {
+                       /* undo calgary_init_one_nontraslated()'s pci_dev_get() */
+                       pci_dev_put(dev);
+                       continue;
+               }
+               if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
+                       continue;
+
+               calgary_disable_translation(dev);
+               calgary_free_bus(dev);
+               pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
+       } while (1);
+
+       return ret;
+}
+
+/*
+ * Choose the TCE table size code for a machine whose highest RAM address
+ * is @ram.  A size given on the command line always wins.
+ */
+static inline int __init determine_tce_table_size(u64 ram)
+{
+       int order;
+
+       if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
+               return specified_table_size;
+
+       /*
+        * Valid size codes run from 0 (TCE_TABLE_SIZE_64K) to 7
+        * (TCE_TABLE_SIZE_8M).  Code 0 holds 8K entries and every step up
+        * doubles that, so dividing the max ram address by 8K (>> 13) and
+        * taking the order of the result yields the code, which is then
+        * capped at the largest table.
+        */
+       order = get_order(ram >> 13);
+
+       return (order > TCE_TABLE_SIZE_8M) ? TCE_TABLE_SIZE_8M : order;
+}
+
+/*
+ * Fill scal_devs[] and rio_devs[] with pointers into the Rio Grande table
+ * located by detect_calgary() (rio_table_hdr).
+ *
+ * The per-entry sizes depend on the table version (2 or 3).
+ *
+ * Returns 0 on success, -ENODEV if the table lists more scalability devices
+ * than MAX_NUMNODES, or -EPROTO for an unknown table version.
+ * NOTE(review): num_rio_dev is not checked against the size of rio_devs[]
+ * here - confirm the array is large enough for any table.
+ */
+static int __init build_detail_arrays(void)
+{
+       unsigned long ptr;
+       int i, scal_detail_size, rio_detail_size;
+
+       if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
+               printk(KERN_WARNING
+                       "Calgary: MAX_NUMNODES too low! Defined as %d, "
+                       "but system has %d nodes.\n",
+                       MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+               return -ENODEV;
+       }
+
+       /* entry sizes (in bytes) differ between table versions */
+       switch (rio_table_hdr->version){
+       case 2:
+               scal_detail_size = 11;
+               rio_detail_size = 13;
+               break;
+       case 3:
+               scal_detail_size = 12;
+               rio_detail_size = 15;
+               break;
+       default:
+               printk(KERN_WARNING
+                      "Calgary: Invalid Rio Grande Table Version: %d\n",
+                      rio_table_hdr->version);
+               return -EPROTO;
+       }
+
+       /* entries start 3 bytes into the header (presumably right after it) */
+       ptr = ((unsigned long)rio_table_hdr) + 3;
+       for (i = 0; i < rio_table_hdr->num_scal_dev;
+                   i++, ptr += scal_detail_size)
+               scal_devs[i] = (struct scal_detail *)ptr;
+
+       /* the rio entries follow directly after the last scal entry */
+       for (i = 0; i < rio_table_hdr->num_rio_dev;
+                   i++, ptr += rio_detail_size)
+               rio_devs[i] = (struct rio_detail *)ptr;
+
+       return 0;
+}
+
+/*
+ * Report whether anything is plugged into @bus.
+ *
+ * @bus:     PCI bus number to probe
+ * @pci_dev: PCI device ID of the PHB on that bus
+ *
+ * Returns 1 if a device responds in any of slots 1-7 (function 0), else 0.
+ * CalIOC2 buses are always reported as populated.
+ */
+static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
+{
+       int slot;
+       u32 id;
+
+       if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
+               /*
+                * FIXME: properly scan for devices across the
+                * PCI-to-PCI bridge on every CalIOC2 port.
+                */
+               return 1;
+       }
+
+       for (slot = 1; slot < 8; slot++) {
+               /* all ones in config dword 0 means nobody answered */
+               id = read_pci_config(bus, slot, 0, 0);
+               if (id != 0xffffffff)
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Early detection of a Calgary/CalIOC2 IOMMU.
+ *
+ * Looks up the Rio Grande table in the BIOS EBDA, builds the detail arrays,
+ * settles on a TCE table size and then allocates a TCE table for every PHB
+ * bus that has devices behind it (or for every PHB bus when
+ * translate_empty_slots is set).  On success iommu_detected and
+ * calgary_detected are set; the hardware itself is not touched here.
+ *
+ * If a TCE table allocation fails, the tables allocated so far are freed
+ * and the function returns without marking Calgary as detected.
+ */
+void __init detect_calgary(void)
+{
+       int bus;
+       void *tbl;
+       int calgary_found = 0;
+       unsigned long ptr;
+       unsigned int offset, prev_offset;
+       int ret;
+
+       /*
+        * if the user specified iommu=off or iommu=soft or we found
+        * another HW IOMMU already, bail out.
+        */
+       if (swiotlb || no_iommu || iommu_detected)
+               return;
+
+       if (!use_calgary)
+               return;
+
+       if (!early_pci_allowed())
+               return;
+
+       printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
+
+       ptr = (unsigned long)phys_to_virt(get_bios_ebda());
+
+       rio_table_hdr = NULL;
+       prev_offset = 0;
+       offset = 0x180;
+       /*
+        * The next offset is stored in the 1st word.
+        * Only parse up until the offset increases:
+        */
+       while (offset > prev_offset) {
+               /* The block id is stored in the 2nd word */
+               if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
+                       /* set the pointer past the offset & block id */
+                       rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
+                       break;
+               }
+               prev_offset = offset;
+               offset = *((unsigned short *)(ptr + offset));
+       }
+       if (!rio_table_hdr) {
+               printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
+                      "in EBDA - bailing!\n");
+               return;
+       }
+
+       ret = build_detail_arrays();
+       if (ret) {
+               printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
+               return;
+       }
+
+       /* from here on specified_table_size holds the size actually used */
+       specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
+
+       for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+               struct calgary_bus_info *info = &bus_info[bus];
+               unsigned short pci_device;
+               u32 val;
+
+               /* upper 16 bits of config dword 0 are the PCI device ID */
+               val = read_pci_config(bus, 0, 0, 0);
+               pci_device = (val & 0xFFFF0000) >> 16;
+
+               if (!is_cal_pci_dev(pci_device))
+                       continue;
+
+               if (info->translation_disabled)
+                       continue;
+
+               if (calgary_bus_has_devices(bus, pci_device) ||
+                   translate_empty_slots) {
+                       tbl = alloc_tce_table();
+                       if (!tbl)
+                               goto cleanup;
+                       info->tce_space = tbl;
+                       calgary_found = 1;
+               }
+       }
+
+       printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
+              calgary_found ? "found" : "not found");
+
+       if (calgary_found) {
+               iommu_detected = 1;
+               calgary_detected = 1;
+               printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
+               printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
+                      "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
+                      debugging ? "enabled" : "disabled");
+       }
+       return;
+
+cleanup:
+       /*
+        * Free the tables of all buses below the one that failed.  Note that
+        * info->tce_space is left pointing at the freed table.
+        */
+       for (--bus; bus >= 0; --bus) {
+               struct calgary_bus_info *info = &bus_info[bus];
+
+               if (info->tce_space)
+                       free_tce_table(info->tce_space);
+       }
+}
+
+/*
+ * Switch the DMA layer over to the Calgary IOMMU.
+ *
+ * Returns -ENODEV when the IOMMU is turned off, swiotlb is in use or no
+ * Calgary was detected; otherwise the result of calgary_init().  On success
+ * force_iommu is set, bad_dma_address becomes 0 and dma_ops points at
+ * calgary_dma_ops.
+ */
+int __init calgary_iommu_init(void)
+{
+       int err;
+
+       if (no_iommu || swiotlb || !calgary_detected)
+               return -ENODEV;
+
+       /* ok, we're trying to use Calgary - let's roll */
+       printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+
+       err = calgary_init();
+       if (!err) {
+               force_iommu = 1;
+               bad_dma_address = 0x0;
+               dma_ops = &calgary_dma_ops;
+               return 0;
+       }
+
+       printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+              "falling back to no_iommu\n", err);
+       if (end_pfn > MAX_DMA32_PFN)
+               printk(KERN_ERR "WARNING more than 4GB of memory, "
+                               "32bit PCI may malfunction.\n");
+
+       return err;
+}
+
+/*
+ * Parse the comma separated "calgary=" kernel command line option.
+ *
+ * Recognized items (matched by prefix):
+ *   64k, 128k, 256k, 512k, 1M, 2M, 4M, 8M - force the TCE table size
+ *   translate_empty_slots                 - sets translate_empty_slots
+ *   disable=<bus>                         - turn translation off for one PHB
+ *
+ * Always returns 1.
+ */
+static int __init calgary_parse_options(char *p)
+{
+       unsigned int bridge;
+       size_t len;
+       char* endp;
+
+       while (*p) {
+               if (!strncmp(p, "64k", 3))
+                       specified_table_size = TCE_TABLE_SIZE_64K;
+               else if (!strncmp(p, "128k", 4))
+                       specified_table_size = TCE_TABLE_SIZE_128K;
+               else if (!strncmp(p, "256k", 4))
+                       specified_table_size = TCE_TABLE_SIZE_256K;
+               else if (!strncmp(p, "512k", 4))
+                       specified_table_size = TCE_TABLE_SIZE_512K;
+               else if (!strncmp(p, "1M", 2))
+                       specified_table_size = TCE_TABLE_SIZE_1M;
+               else if (!strncmp(p, "2M", 2))
+                       specified_table_size = TCE_TABLE_SIZE_2M;
+               else if (!strncmp(p, "4M", 2))
+                       specified_table_size = TCE_TABLE_SIZE_4M;
+               else if (!strncmp(p, "8M", 2))
+                       specified_table_size = TCE_TABLE_SIZE_8M;
+
+               len = strlen("translate_empty_slots");
+               if (!strncmp(p, "translate_empty_slots", len))
+                       translate_empty_slots = 1;
+
+               len = strlen("disable");
+               if (!strncmp(p, "disable", len)) {
+                       /* step over "disable" and an optional '=' */
+                       p += len;
+                       if (*p == '=')
+                               ++p;
+                       if (*p == '\0')
+                               break;
+                       /* base 0: accepts decimal, octal and hex bus numbers */
+                       bridge = simple_strtol(p, &endp, 0);
+                       if (p == endp)
+                               break;
+
+                       /* out-of-range bus numbers are silently ignored */
+                       if (bridge < MAX_PHB_BUS_NUM) {
+                               printk(KERN_INFO "Calgary: disabling "
+                                      "translation for PHB %#x\n", bridge);
+                               bus_info[bridge].translation_disabled = 1;
+                       }
+               }
+
+               /* advance to the item after the next comma, if any */
+               p = strpbrk(p, ",");
+               if (!p)
+                       break;
+
+               p++; /* skip ',' */
+       }
+       return 1;
+}
+__setup("calgary=", calgary_parse_options);
+
+/*
+ * Reserve, in the IOMMU table of @dev's bus, the ranges covered by the
+ * memory windows of @dev's bridge resources so that no TCEs are handed
+ * out for them.
+ */
+static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
+{
+       struct iommu_table *tbl = pci_iommu(dev->bus);
+       int idx;
+
+       for (idx = 0; idx < 4; idx++) {
+               struct resource *res = &dev->resource[PCI_BRIDGE_RESOURCES + idx];
+               unsigned int npages;
+
+               /*
+                * Only MEM windows matter, and a window starting at 0 needs
+                * no work since the whole 1st MB is reserved anyway.
+                */
+               if (!(res->flags & IORESOURCE_MEM) || !res->start)
+                       continue;
+
+               /* page count covering the whole [start, end] region */
+               npages = ((res->end - res->start) >> PAGE_SHIFT) + 1;
+
+               iommu_range_reserve(tbl, res->start, npages);
+       }
+}
+
+/*
+ * Initcall: for every translated Calgary/CalIOC2 PHB that owns a TCE table,
+ * reserve the bridge memory windows in that table.
+ *
+ * Returns -ENODEV if Calgary is not in use, 0 otherwise.
+ */
+static int __init calgary_fixup_tce_spaces(void)
+{
+       struct pci_dev *dev = NULL;
+
+       if (no_iommu || swiotlb || !calgary_detected)
+               return -ENODEV;
+
+       printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
+
+       while ((dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev)) != NULL) {
+               /* only translated Calgary-family PHBs are of interest */
+               if (!is_cal_pci_dev(dev->device) || !translate_phb(dev))
+                       continue;
+
+               /* no table was allocated for this bus during detection */
+               if (!bus_info[dev->bus->number].tce_space)
+                       continue;
+
+               calgary_fixup_one_tce_space(dev);
+       }
+
+       return 0;
+}
+
+/*
+ * We need to be called after pcibios_assign_resources (fs_initcall level)
+ * and before device_initcall.
+ */
+rootfs_initcall(calgary_fixup_tce_spaces);
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
new file mode 100644 (file)
index 0000000..2971144
--- /dev/null
@@ -0,0 +1,346 @@
+/*
+ * Dynamic DMA mapping support.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/calgary.h>
+
+/* Set by the iommu= boot option (merge/biomerge set it, nomerge/noforce clear it) */
+int iommu_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_merge);
+
+/* DMA address value that signals a failed mapping (compared against in
+   dma_alloc_coherent) */
+dma_addr_t bad_dma_address __read_mostly;
+EXPORT_SYMBOL(bad_dma_address);
+
+/* This tells the BIO block layer to assume merging. Default to off
+   because we cannot guarantee merging later. */
+int iommu_bio_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_bio_merge);
+
+/* iommu=forcesac: makes dma_supported() reject masks >= 40 bits */
+static int iommu_sac_force __read_mostly = 0;
+
+/* iommu=off sets this */
+int no_iommu __read_mostly;
+/* With CONFIG_IOMMU_DEBUG both panic-on-overflow and forced IOMMU use
+   default to on; iommu= options can still override them. */
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1;
+int force_iommu __read_mostly = 1;
+#else
+int panic_on_overflow __read_mostly = 0;
+int force_iommu __read_mostly= 0;
+#endif
+
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
+/* Dummy device used for NULL arguments (normally ISA). A smaller DMA
+   mask would probably be better, but this is bug-to-bug compatible
+   with i386. Both the coherent and the streaming mask are 32 bit, as
+   dma_mask points at coherent_dma_mask. */
+struct device fallback_dev = {
+       .bus_id = "fallback device",
+       .coherent_dma_mask = DMA_32BIT_MASK,
+       .dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+/*
+ * Allocate 2^order pages for DMA on a node near the device: the node of
+ * the device's PCI bus for PCI devices, otherwise numa_node_id().
+ * Returns the address of the pages, or NULL if the allocation failed.
+ */
+noinline static void *
+dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
+{
+       struct page *pg;
+       int nid;
+
+#ifdef CONFIG_PCI
+       if (dev->bus == &pci_bus_type)
+               nid = pcibus_to_node(to_pci_dev(dev)->bus);
+       else
+#endif
+               nid = numa_node_id();
+
+       /* never ask for a node below the first online one */
+       if (nid < first_node(node_online_map))
+               nid = first_node(node_online_map);
+
+       pg = alloc_pages_node(nid, gfp, order);
+       if (!pg)
+               return NULL;
+
+       return page_address(pg);
+}
+
+/*
+ * Allocate memory for a coherent mapping.
+ *
+ * @dev:        device the memory is for; NULL selects fallback_dev
+ * @size:       number of bytes wanted
+ * @dma_handle: out: bus address the device has to use
+ * @gfp:        allocation flags; __GFP_NORETRY and zone flags are adjusted
+ *              internally
+ *
+ * Strategy: allocate pages directly (from the DMA32 zone when the effective
+ * mask is at most 32 bit).  If the pages fit the mask and the IOMMU is not
+ * forced, they are zeroed and returned as they are.  Otherwise the request
+ * is retried from the 16MB DMA zone, handed to dma_ops->alloc_coherent, or
+ * remapped with dma_ops->map_simple, in that order of preference.
+ *
+ * Returns the kernel address of the memory, or NULL on failure.  With
+ * panic_on_overflow set, a failed remapping panics instead.
+ */
+void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+                  gfp_t gfp)
+{
+       void *memory;
+       unsigned long dma_mask = 0;
+       u64 bus;
+
+       if (!dev)
+               dev = &fallback_dev;
+       dma_mask = dev->coherent_dma_mask;
+       if (dma_mask == 0)
+               dma_mask = DMA_32BIT_MASK;
+
+       /* Device not DMA able */
+       if (dev->dma_mask == NULL)
+               return NULL;
+
+       /* Don't invoke OOM killer */
+       gfp |= __GFP_NORETRY;
+
+       /* Kludge to make it bug-to-bug compatible with i386. i386
+          uses the normal dma_mask for alloc_coherent. */
+       dma_mask &= *dev->dma_mask;
+
+       /* Why <=? Even when the mask is smaller than 4GB it is often
+          larger than 16MB and in this case we have a chance of
+          finding fitting memory in the next higher zone first. If
+          not retry with true GFP_DMA. -AK */
+       if (dma_mask <= DMA_32BIT_MASK)
+               gfp |= GFP_DMA32;
+
+ again:
+       memory = dma_alloc_pages(dev, gfp, get_order(size));
+       if (memory == NULL)
+               return NULL;
+
+       {
+               int high, mmu;
+               bus = virt_to_bus(memory);
+               /* high: the buffer reaches the mask limit or beyond */
+               high = (bus + size) >= dma_mask;
+               /* mmu: the buffer has to go through the IOMMU */
+               mmu = high;
+               if (force_iommu && !(gfp & GFP_DMA))
+                       mmu = 1;
+               else if (high) {
+                       free_pages((unsigned long)memory,
+                                  get_order(size));
+
+                       /* Don't use the 16MB ZONE_DMA unless absolutely
+                          needed. It's better to use remapping first. */
+                       if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
+                               gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
+                               goto again;
+                       }
+
+                       /* Let low level make its own zone decisions */
+                       gfp &= ~(GFP_DMA32|GFP_DMA);
+
+                       if (dma_ops->alloc_coherent)
+                               return dma_ops->alloc_coherent(dev, size,
+                                                          dma_handle, gfp);
+                       return NULL;
+               }
+
+               memset(memory, 0, size);
+               /* directly usable: bus address is the handle */
+               if (!mmu) {
+                       *dma_handle = virt_to_bus(memory);
+                       return memory;
+               }
+       }
+
+       /* IOMMU needed: prefer the backend's own coherent allocator ... */
+       if (dma_ops->alloc_coherent) {
+               free_pages((unsigned long)memory, get_order(size));
+               gfp &= ~(GFP_DMA|GFP_DMA32);
+               return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+       }
+
+       /* ... else remap the pages we already have */
+       if (dma_ops->map_simple) {
+               *dma_handle = dma_ops->map_simple(dev, memory,
+                                             size,
+                                             PCI_DMA_BIDIRECTIONAL);
+               if (*dma_handle != bad_dma_address)
+                       return memory;
+       }
+
+       /* no way to make the buffer reachable for the device */
+       if (panic_on_overflow)
+               panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
+       free_pages((unsigned long)memory, get_order(size));
+       return NULL;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+/*
+ * Unmap coherent memory.
+ * The caller must ensure that the device has finished accessing the mapping.
+ *
+ * @dev:   device the memory was allocated for
+ * @size:  size that was passed to dma_alloc_coherent()
+ * @vaddr: kernel address returned by dma_alloc_coherent()
+ * @bus:   bus address returned through dma_handle
+ *
+ * If the DMA backend provides unmap_single it is called (with direction 0)
+ * before the pages are freed.
+ */
+void dma_free_coherent(struct device *dev, size_t size,
+                        void *vaddr, dma_addr_t bus)
+{
+       if (dma_ops->unmap_single)
+               dma_ops->unmap_single(dev, bus, size, 0);
+       free_pages((unsigned long)vaddr, get_order(size));
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+static int forbid_dac __read_mostly;
+
+/*
+ * Tell whether @dev can do DMA with address mask @mask.
+ * Returns 1 if the mask is usable, 0 if not; when the DMA backend has its
+ * own dma_supported hook, that hook decides.
+ */
+int dma_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_PCI
+       /* DAC (addresses above 4GB) may have been forbidden */
+       if (forbid_dac > 0 && mask > 0xffffffff) {
+               printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id);
+               return 0;
+       }
+#endif
+
+       if (dma_ops->dma_supported)
+               return dma_ops->dma_supported(dev, mask);
+
+       /*
+        * Taken over from i386.  Not really meaningful, as it only helps
+        * pci_alloc_coherent; callers with such a small mask simply have
+        * to use GFP_DMA.
+        */
+       if (mask < DMA_24BIT_MASK)
+               return 0;
+
+       /*
+        * With SAC forcing on, refuse wide masks so that the driver falls
+        * back to SAC, which allows cheaper accesses in some cases.
+        *
+        * The downside: if the IOMMU area overflows and a DAC address is
+        * returned as fallback, the device may not cope with it.
+        *
+        * Some controllers (aic79xx) have a 39bit address mode that is as
+        * efficient as 32bit, so SAC is not forced on them: every mask
+        * below 40 bits is assumed to be of that kind.  Normally this makes
+        * no difference, but it handles IOMMU overflow more gently.
+        */
+       if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+               printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
+               return 0;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(dma_supported);
+
+/*
+ * Install @mask as the DMA mask of @dev.
+ * Returns 0 on success, or -EIO if the device has no dma_mask to set or
+ * dma_supported() rejects the mask.
+ */
+int dma_set_mask(struct device *dev, u64 mask)
+{
+       /* device cannot do DMA at all */
+       if (!dev->dma_mask)
+               return -EIO;
+
+       if (!dma_supported(dev, mask))
+               return -EIO;
+
+       *dev->dma_mask = mask;
+
+       return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+/*
+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
+ * documentation.
+ */
+__init int iommu_setup(char *p)
+{
+       iommu_merge = 1;
+
+       if (!p)
+               return -EINVAL;
+
+       while (*p) {
+               if (!strncmp(p,"off",3))
+                       no_iommu = 1;
+               /* gart_parse_options has more force support */
+               if (!strncmp(p,"force",5))
+                       force_iommu = 1;
+               if (!strncmp(p,"noforce",7)) {
+                       iommu_merge = 0;
+                       force_iommu = 0;
+               }
+
+               if (!strncmp(p, "biomerge",8)) {
+                       iommu_bio_merge = 4096;
+                       iommu_merge = 1;
+                       force_iommu = 1;
+               }
+               if (!strncmp(p, "panic",5))
+                       panic_on_overflow = 1;
+               if (!strncmp(p, "nopanic",7))
+                       panic_on_overflow = 0;
+               if (!strncmp(p, "merge",5)) {
+                       iommu_merge = 1;
+                       force_iommu = 1;
+               }
+               if (!strncmp(p, "nomerge",7))
+                       iommu_merge = 0;
+               if (!strncmp(p, "forcesac",8))
+                       iommu_sac_force = 1;
+               if (!strncmp(p, "allowdac", 8))
+                       forbid_dac = 0;
+               if (!strncmp(p, "nodac", 5))
+                       forbid_dac = -1;
+
+#ifdef CONFIG_SWIOTLB
+               if (!strncmp(p, "soft",4))
+                       swiotlb = 1;
+#endif
+
+#ifdef CONFIG_IOMMU
+               gart_parse_options(p);
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+               if (!strncmp(p, "calgary", 7))
+                       use_calgary = 1;
+#endif /* CONFIG_CALGARY_IOMMU */
+
+               p += strcspn(p, ",");
+               if (*p == ',')
+                       ++p;
+       }
+       return 0;
+}
+early_param("iommu", iommu_setup);
+
+/*
+ * Early IOMMU detection/allocation hook: gives each configured IOMMU
+ * implementation (GART hole, Calgary, swiotlb) its chance in turn.
+ */
+void __init pci_iommu_alloc(void)
+{
+       /*
+        * The order of these functions is important for
+        * fall-back/fail-over reasons
+        */
+#ifdef CONFIG_IOMMU
+       iommu_hole_init();
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+       detect_calgary();
+#endif
+
+#ifdef CONFIG_SWIOTLB
+       pci_swiotlb_init();
+#endif
+}
+
+/*
+ * Initcall that initializes the configured IOMMU implementations.  The
+ * return values of the individual init functions are ignored and
+ * no_iommu_init() is always called last.  Always returns 0.
+ */
+static int __init pci_iommu_init(void)
+{
+#ifdef CONFIG_CALGARY_IOMMU
+       calgary_iommu_init();
+#endif
+
+#ifdef CONFIG_IOMMU
+       gart_iommu_init();
+#endif
+
+       no_iommu_init();
+       return 0;
+}
+
+/*
+ * Shut the IOMMU down; only the GART has a shutdown hook.
+ * NOTE(review): called without a CONFIG_IOMMU guard - presumably a stub is
+ * provided when the GART is not configured; confirm.
+ */
+void pci_iommu_shutdown(void)
+{
+       gart_iommu_shutdown();
+}
+
+#ifdef CONFIG_PCI
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+
+/*
+ * PCI fixup for VIA devices: when @dev is a PCI-to-PCI bridge and no
+ * decision about DAC has been made yet (forbid_dac == 0), forbid DAC.
+ */
+static __devinit void via_no_dac(struct pci_dev *dev)
+{
+       /* dev->class >> 8 drops the programming interface byte */
+       if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
+               printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
+               forbid_dac = 1;
+       }
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
+#endif
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
new file mode 100644 (file)
index 0000000..4918c57
--- /dev/null
@@ -0,0 +1,740 @@
+/*
+ * Dynamic DMA mapping support for AMD Hammer.
+ * 
+ * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
+ * This allows the use of PCI devices that only support 32bit addresses on
+ * systems with more than 4GB.
+ *
+ * See Documentation/DMA-mapping.txt for the interface specification.
+ * 
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/agp_backend.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <linux/kdebug.h>
+#include <asm/atomic.h>
+#include <asm/io.h>
+#include <asm/mtrr.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/iommu.h>
+#include <asm/cacheflush.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/k8.h>
+
+unsigned long iommu_bus_base;  /* GART remapping area (physical) */
+static unsigned long iommu_size;       /* size of remapping area bytes */
+static unsigned long iommu_pages;      /* .. and in pages */
+
+u32 *iommu_gatt_base;          /* Remapping table */
+
+/* If this is disabled the IOMMU will use an optimized flushing strategy
+   of only flushing when a mapping is reused. With it true the GART is flushed
+   for every mapping. Problem is that doing the lazy flush seems to trigger
+   bugs with some popular PCI cards, in particular 3ware (but it has also
+   been seen with Qlogic at least). */
+int iommu_fullflush = 1;
+
+/* Allocation bitmap for the remapping area */ 
+static DEFINE_SPINLOCK(iommu_bitmap_lock);
+static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
+
+static u32 gart_unmapped_entry; 
+
+#define GPTE_VALID    1
+#define GPTE_COHERENT 2
+#define GPTE_ENCODE(x) \
+       (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
+#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
+
+#define to_pages(addr,size) \
+       (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+#define EMERGENCY_PAGES 32 /* = 128KB */ 
+
+#ifdef CONFIG_AGP
+#define AGPEXTERN extern
+#else
+#define AGPEXTERN
+#endif
+
+/* backdoor interface to AGP driver */
+AGPEXTERN int agp_memory_reserved;
+AGPEXTERN __u32 *agp_gatt_table;
+
+static unsigned long next_bit;  /* protected by iommu_bitmap_lock */
+static int need_flush;                 /* global flush state. set for each gart wrap */
+
+/*
+ * Reserve @size consecutive pages in the GART allocation bitmap.
+ *
+ * The search starts at next_bit and, if that fails, is retried from the
+ * start of the bitmap.  Returns the index of the first reserved page, or
+ * -1 (as unsigned long) if no free run was found.  need_flush is set when
+ * the search had to restart from 0, when next_bit reaches the end of the
+ * area, and always when iommu_fullflush is set.
+ */
+static unsigned long alloc_iommu(int size) 
+{      
+       unsigned long offset, flags;
+
+       spin_lock_irqsave(&iommu_bitmap_lock, flags);   
+       offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
+       if (offset == -1) {
+               /* Nothing free after next_bit: retry from the beginning. */
+               need_flush = 1;
+               offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
+       }
+       if (offset != -1) { 
+               set_bit_string(iommu_gart_bitmap, offset, size); 
+               /* The next search continues right after this allocation. */
+               next_bit = offset+size; 
+               if (next_bit >= iommu_pages) { 
+                       next_bit = 0;
+                       need_flush = 1;
+               } 
+       } 
+       if (iommu_fullflush)
+               need_flush = 1;
+       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);      
+       return offset;
+} 
+
+/*
+ * Mark @size pages starting at page index @offset as free again in the
+ * GART allocation bitmap.  Takes iommu_bitmap_lock.
+ */
+static void free_iommu(unsigned long offset, int size)
+{ 
+       unsigned long flags;
+       spin_lock_irqsave(&iommu_bitmap_lock, flags);
+       __clear_bit_string(iommu_gart_bitmap, offset, size);
+       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+} 
+
+/* 
+ * Use global flush state to avoid races with multiple flushers.
+ *
+ * Calls k8_flush_garts() only if need_flush is set, and clears the flag
+ * afterwards.  Test, flush and clear all happen under iommu_bitmap_lock.
+ */
+static void flush_gart(void)
+{ 
+       unsigned long flags;
+       spin_lock_irqsave(&iommu_bitmap_lock, flags);
+       if (need_flush) {
+               k8_flush_garts();
+               need_flush = 0;
+       } 
+       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+} 
+
+#ifdef CONFIG_IOMMU_LEAK
+
+#define SET_LEAK(x) if (iommu_leak_tab) \
+                       iommu_leak_tab[x] = __builtin_return_address(0);
+#define CLEAR_LEAK(x) if (iommu_leak_tab) \
+                       iommu_leak_tab[x] = NULL;
+
+/* Debugging aid for drivers that don't free their IOMMU tables */
+static void **iommu_leak_tab; 
+static int leak_trace;
+int iommu_leak_pages = 20; 
+/*
+ * Print a stack trace and the recorded owners of some GART pages at the
+ * end of the remapping area.  Only the first call does anything, and
+ * nothing is printed when the leak table has not been allocated.
+ */
+void dump_leak(void)
+{
+       int i;
+       static int dump;        /* set once the dump has been printed */
+
+       if (dump || !iommu_leak_tab)
+               return;
+       dump = 1;
+       show_stack(NULL,NULL);
+       /* Very crude. dump some from the end of the table too */
+       printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages);
+       /*
+        * The leak table is set up for iommu_pages entries, so the last
+        * valid index is iommu_pages - 1.  Also stop before the index
+        * would run off the start of the table.
+        */
+       for (i = 0; i < iommu_leak_pages && i < iommu_pages; i+=2) {
+               unsigned long page = iommu_pages - 1 - i;
+
+               printk("%lu: ", page);
+               printk_address((unsigned long) iommu_leak_tab[page]);
+               /* i is always even here, so this prints a space. */
+               printk("%c", (i+1)%2 == 0 ? '\n' : ' ');
+       }
+       printk("\n");
+}
+#else
+#define SET_LEAK(x)
+#define CLEAR_LEAK(x)
+#endif
+
+static void iommu_full(struct device *dev, size_t size, int dir)
+{
+       /* 
+        * Ran out of IOMMU space for this operation. This is very bad.
+        * Unfortunately the drivers cannot handle this operation properly.
+        * Return some non mapped prereserved space in the aperture and 
+        * let the Northbridge deal with it. This will result in garbage
+        * in the IO operation. When the size exceeds the prereserved space
+        * memory corruption will occur or random memory will be DMAed 
+        * out. Hopefully no network devices use single mappings that big.
+        */ 
+       
+       printk(KERN_ERR 
+  "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
+              size, dev->bus_id);
+
+       if (size > PAGE_SIZE*EMERGENCY_PAGES) {
+               if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+                       panic("PCI-DMA: Memory would be corrupted\n");
+               if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 
+                       panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n");
+       } 
+
+#ifdef CONFIG_IOMMU_LEAK
+       dump_leak(); 
+#endif
+} 
+
+/*
+ * Return non-zero if [addr, addr + size) has to be remapped for @dev:
+ * either force_iommu is set or the area ends above the device's DMA mask.
+ */
+static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
+{
+       u64 limit = *dev->dma_mask;
+       int beyond = addr + size > limit;
+
+       return force_iommu ? 1 : beyond;
+}
+
+/*
+ * Like need_iommu() but without looking at force_iommu: return 1 only if
+ * the area ends above the device's DMA mask, 0 otherwise.
+ */
+static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
+{
+       u64 limit = *dev->dma_mask;
+
+       return addr + size > limit;
+}
+
+/* Map a single continuous physical area into the IOMMU.
+ * Caller needs to check if the iommu is needed and flush.
+ *
+ * Returns the bus address inside the remapping area.  If no GART space
+ * is left, returns phys_mem unchanged when the device can reach it
+ * without remapping, otherwise bad_dma_address (or panics when
+ * panic_on_overflow is set).
+ */
+static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
+                               size_t size, int dir)
+{ 
+       unsigned long npages = to_pages(phys_mem, size);
+       unsigned long iommu_page = alloc_iommu(npages);
+       int i;
+       if (iommu_page == -1) {
+               /* Out of space: fine if the mapping was only wanted, not needed. */
+               if (!nonforced_iommu(dev, phys_mem, size))
+                       return phys_mem; 
+               if (panic_on_overflow)
+                       panic("dma_map_area overflow %lu bytes\n", size);
+               iommu_full(dev, size, dir);
+               return bad_dma_address;
+       }
+
+       /* One GATT entry per page of the area. */
+       for (i = 0; i < npages; i++) {
+               iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
+               SET_LEAK(iommu_page + i);
+               phys_mem += PAGE_SIZE;
+       }
+       /* phys_mem only advanced by whole pages, so its in-page offset is unchanged. */
+       return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
+}
+
+/*
+ * Map @size bytes at kernel virtual address @buf through the GART without
+ * first checking whether the device needs it, then flush the GART if a
+ * flush is pending.  Returns whatever dma_map_area() returned.
+ */
+static dma_addr_t gart_map_simple(struct device *dev, char *buf,
+                                size_t size, int dir)
+{
+       dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
+       flush_gart();
+       return map;
+}
+
+/*
+ * Map a single area into the IOMMU.
+ *
+ * A NULL @dev is replaced by fallback_dev.  If the device does not need
+ * remapping for this area the physical address is returned as is;
+ * otherwise the area is mapped with gart_map_simple().
+ */
+static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
+{
+       unsigned long paddr;
+
+       if (dev == NULL)
+               dev = &fallback_dev;
+
+       paddr = virt_to_phys(addr);
+       if (need_iommu(dev, paddr, size))
+               return gart_map_simple(dev, addr, size, dir);
+
+       return paddr;
+}
+
+/*
+ * Free a DMA mapping.
+ *
+ * Addresses outside the remapping area, or inside its first
+ * EMERGENCY_PAGES pages, are ignored.  Otherwise every GATT entry of the
+ * mapping is reset to gart_unmapped_entry and the pages are released in
+ * the allocation bitmap.
+ */
+static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
+                     size_t size, int direction)
+{
+       unsigned long iommu_page;
+       int npages;
+       int i;
+
+       /* Not an address handed out by the GART allocator: nothing to undo. */
+       if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
+           dma_addr >= iommu_bus_base + iommu_size)
+               return;
+       iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
+       npages = to_pages(dma_addr, size);
+       for (i = 0; i < npages; i++) {
+               iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
+               CLEAR_LEAK(iommu_page + i);
+       }
+       free_iommu(iommu_page, npages);
+}
+
+/*
+ * Wrapper for pci_unmap_single working with scatterlists.
+ *
+ * Unmaps up to @nents entries and stops early at the first entry whose
+ * dma_length or length is zero.
+ */
+static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+{
+       int i;
+
+       for (i = 0; i < nents; i++) {
+               struct scatterlist *s = &sg[i];
+               /* A zero length marks the end of the mapped entries. */
+               if (!s->dma_length || !s->length)
+                       break;
+               gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
+       }
+}
+
+/*
+ * Fallback for dma_map_sg in case of overflow.
+ *
+ * Maps each entry on its own and only goes through the GART for entries
+ * the device cannot address directly (force_iommu is not considered).
+ * Returns @nents on success.  On failure the entries mapped so far are
+ * unmapped again, sg[0].dma_length is set to 0 and 0 is returned.
+ */
+static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+                              int nents, int dir)
+{
+       int i;
+
+#ifdef CONFIG_IOMMU_DEBUG
+       printk(KERN_DEBUG "dma_map_sg overflow\n");
+#endif
+
+       for (i = 0; i < nents; i++ ) {
+               struct scatterlist *s = &sg[i];
+               unsigned long addr = page_to_phys(s->page) + s->offset; 
+               if (nonforced_iommu(dev, addr, s->length)) { 
+                       addr = dma_map_area(dev, addr, s->length, dir);
+                       if (addr == bad_dma_address) { 
+                               /* Undo the entries before this one. */
+                               if (i > 0) 
+                                       gart_unmap_sg(dev, sg, i, dir);
+                               nents = 0; 
+                               sg[0].dma_length = 0;
+                               break;
+                       }
+               }
+               s->dma_address = addr;
+               s->dma_length = s->length;
+       }
+       flush_gart();
+       return nents;
+}
+
+/*
+ * Map multiple scatterlist entries continuous into the first.
+ *
+ * Allocates @pages GART pages and maps sg[start] .. sg[stopat - 1] back to
+ * back into them.  The merged result (bus address and total length) is
+ * written to @sout.  Each entry's dma_address is expected to hold its
+ * physical address on entry.  Returns 0, or -1 if no GART space is left.
+ */
+static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
+                     struct scatterlist *sout, unsigned long pages)
+{
+       unsigned long first_page = alloc_iommu(pages);
+       unsigned long cur_page;
+       int idx;
+
+       if (first_page == -1)
+               return -1;
+
+       cur_page = first_page;
+       for (idx = start; idx < stopat; idx++) {
+               struct scatterlist *ent = &sg[idx];
+               /* Read before *sout is written: sout may alias this entry. */
+               unsigned long paddr = ent->dma_address;
+               unsigned long left;
+
+               /* Only the first chunk of a merged run may start mid-page. */
+               BUG_ON(idx > start && ent->offset);
+               if (idx != start) {
+                       sout->dma_length += ent->length;
+               } else {
+                       *sout = *ent;
+                       sout->dma_address = iommu_bus_base;
+                       sout->dma_address += cur_page*PAGE_SIZE + ent->offset;
+                       sout->dma_length = ent->length;
+               }
+
+               /* One GATT entry for every page this chunk touches. */
+               for (left = to_pages(ent->offset, ent->length); left; left--) {
+                       iommu_gatt_base[cur_page] = GPTE_ENCODE(paddr);
+                       SET_LEAK(cur_page);
+                       paddr += PAGE_SIZE;
+                       cur_page++;
+               }
+       }
+       /* The caller's page count must match what was actually mapped. */
+       BUG_ON(cur_page - first_page != pages);
+       return 0;
+}
+
+/*
+ * Emit one output entry for sg[start] .. sg[stopat - 1] into @sout.
+ *
+ * When @need is zero the run must consist of exactly one entry, which is
+ * copied through without touching the GART.  Otherwise the run is mapped
+ * by __dma_map_cont().  Returns 0 on success, a negative value on failure.
+ */
+static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
+                     struct scatterlist *sout,
+                     unsigned long pages, int need)
+{
+       if (!need) { 
+               BUG_ON(stopat - start != 1);
+               *sout = sg[start]; 
+               sout->dma_length = sg[start].length; 
+               return 0;
+       } 
+       return __dma_map_cont(sg, start, stopat, sout, pages);
+}
+               
+/*
+ * DMA map all entries in a scatterlist.
+ * Merge chunks that have page aligned sizes into a continuous mapping.
+ *
+ * @dev:   device to map for; NULL selects fallback_dev
+ * @sg:    scatterlist; results are written to its first entries
+ * @nents: number of entries in @sg
+ * @dir:   DMA direction
+ *
+ * Returns the number of entries used in @sg (at most @nents).  If fewer
+ * than @nents are used, the entry after the last one gets dma_length 0.
+ * Returns 0 on failure, with every dma_address set to bad_dma_address.
+ */
+int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+{
+       int i;
+       int out;
+       int start;
+       unsigned long pages = 0;
+       int need = 0, nextneed;
+
+       if (nents == 0)
+               return 0;
+
+       if (!dev)
+               dev = &fallback_dev;
+
+       out = 0;
+       start = 0;
+       for (i = 0; i < nents; i++) {
+               struct scatterlist *s = &sg[i];
+               dma_addr_t addr = page_to_phys(s->page) + s->offset;
+               /* __dma_map_cont() reads the physical address from here. */
+               s->dma_address = addr;
+               BUG_ON(s->length == 0);
+
+               nextneed = need_iommu(dev, addr, s->length);
+
+               /* Handle the previous not yet processed entries */
+               if (i > start) {
+                       struct scatterlist *ps = &sg[i-1];
+                       /* Can only merge when the last chunk ends on a page
+                          boundary and the new one doesn't have an offset. */
+                       if (!iommu_merge || !nextneed || !need || s->offset ||
+                           (ps->offset + ps->length) % PAGE_SIZE) {
+                               if (dma_map_cont(sg, start, i, sg+out, pages,
+                                                need) < 0)
+                                       goto error;
+                               out++;
+                               pages = 0;
+                               start = i;
+                       }
+               }
+
+               need = nextneed;
+               pages += to_pages(s->offset, s->length);
+       }
+       /* Flush out the final run. */
+       if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
+               goto error;
+       out++;
+       flush_gart();
+       if (out < nents)
+               sg[out].dma_length = 0;
+       return out;
+
+error:
+       flush_gart();
+       /*
+        * Only the first 'out' entries carry mappings made by this call.
+        * Entries beyond that may still hold a dma_address/dma_length from
+        * an earlier use of the scatterlist and must not be unmapped.
+        */
+       gart_unmap_sg(dev, sg, out, dir);
+       /* When it was forced or merged try again in a dumb way */
+       if (force_iommu || iommu_merge) {
+               out = dma_map_sg_nonforce(dev, sg, nents, dir);
+               if (out > 0)
+                       return out;
+       }
+       if (panic_on_overflow)
+               panic("dma_map_sg: overflow on %lu pages\n", pages);
+       iommu_full(dev, pages << PAGE_SHIFT, dir);
+       for (i = 0; i < nents; i++)
+               sg[i].dma_address = bad_dma_address;
+       return 0;
+}
+
+static int no_agp;
+
+/*
+ * Settle the size of the remapping area and return it.
+ *
+ * If iommu_size has not been set yet it defaults to the whole aperture,
+ * or half of it unless no_agp is set.  The size is then reduced by the
+ * distance from aper + iommu_size up to the next LARGE_PAGE_SIZE boundary.
+ * A warning is printed when the result is below 64MB.
+ */
+static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+{ 
+       unsigned long a; 
+       if (!iommu_size) { 
+               iommu_size = aper_size; 
+               if (!no_agp) 
+                       iommu_size /= 2; 
+       } 
+
+       a = aper + iommu_size; 
+       iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
+
+       if (iommu_size < 64*1024*1024) 
+               printk(KERN_WARNING
+  "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); 
+       
+       return iommu_size;
+} 
+
+/*
+ * Read the GART aperture of one northbridge from its config space.
+ *
+ * The base comes from the low 15 bits of config register 0x94, shifted
+ * left by 25.  The size is 32MB shifted left by the 3 bit order field in
+ * bits 3:1 of register 0x90.  The size is stored in *size.
+ *
+ * Returns the aperture base, or 0 if the size works out to 0 or the
+ * aperture would extend beyond 4GB.
+ */
+static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
+{
+       unsigned aper_size = 0, aper_base_32;
+       u64 aper_base;
+       unsigned aper_order;
+
+       pci_read_config_dword(dev, 0x94, &aper_base_32);
+       pci_read_config_dword(dev, 0x90, &aper_order);
+       aper_order = (aper_order >> 1) & 7;
+
+       aper_base = aper_base_32 & 0x7fff;
+       aper_base <<= 25;
+
+       /*
+        * Shift an unsigned constant: with order 6 the result does not fit
+        * in a signed int, and with order 7 it wraps to 0, which the
+        * !aper_size test below rejects.
+        */
+       aper_size = (32U * 1024 * 1024) << aper_order;
+       if (aper_base + aper_size > 0x100000000UL || !aper_size)
+               aper_base = 0;
+
+       *size = aper_size;
+       return aper_base;
+}
+
+/* 
+ * Private Northbridge GATT initialization in case we cannot use the
+ * AGP driver for some reason.  
+ *
+ * Requires every cached northbridge to report the same non-zero aperture
+ * base and size.  Allocates an uncached, zeroed GATT with one u32 per
+ * aperture page, programs it into each northbridge and fills in
+ * info->aper_base and info->aper_size (in MB).
+ * Returns 0 on success, -1 if no usable aperture was found.
+ */
+static __init int init_k8_gatt(struct agp_kern_info *info)
+{ 
+       struct pci_dev *dev;
+       void *gatt;
+       unsigned aper_base, new_aper_base;
+       unsigned aper_size, gatt_size, new_aper_size;
+       int i;
+
+       printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
+       aper_size = aper_base = info->aper_size = 0;
+       dev = NULL;
+       /* All northbridges have to agree on the aperture. */
+       for (i = 0; i < num_k8_northbridges; i++) {
+               dev = k8_northbridges[i];
+               new_aper_base = read_aperture(dev, &new_aper_size); 
+               if (!new_aper_base) 
+                       goto nommu; 
+               
+               if (!aper_base) { 
+                       aper_size = new_aper_size;
+                       aper_base = new_aper_base;
+               } 
+               if (aper_size != new_aper_size || aper_base != new_aper_base) 
+                       goto nommu;
+       }
+       if (!aper_base)
+               goto nommu; 
+       info->aper_base = aper_base;
+       info->aper_size = aper_size>>20; 
+
+       /* One 32 bit GATT entry per aperture page. */
+       gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 
+       gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 
+       if (!gatt) 
+               panic("Cannot allocate GATT table");
+       if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE))
+               panic("Could not set GART PTEs to uncacheable pages");
+       global_flush_tlb();
+
+       memset(gatt, 0, gatt_size); 
+       agp_gatt_table = gatt;
+
+       for (i = 0; i < num_k8_northbridges; i++) {
+               u32 ctl; 
+               u32 gatt_reg; 
+
+               dev = k8_northbridges[i];
+               /* Register 0x98 takes the physical GATT page number shifted left by 4. */
+               gatt_reg = __pa(gatt) >> 12; 
+               gatt_reg <<= 4; 
+               pci_write_config_dword(dev, 0x98, gatt_reg);
+               pci_read_config_dword(dev, 0x90, &ctl); 
+
+               /* Set bit 0 (cleared again by gart_iommu_shutdown()), clear bits 4 and 5. */
+               ctl |= 1;
+               ctl &= ~((1<<4) | (1<<5));
+
+               pci_write_config_dword(dev, 0x90, ctl); 
+       }
+       flush_gart();
+       
+       printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 
+       return 0;
+
+ nommu:
+       /* Should not happen anymore */
+       printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
+              KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
+       return -1; 
+} 
+
+extern int agp_amd64_init(void);
+
+/*
+ * DMA mapping operations installed by gart_iommu_init().  Only the map
+ * and unmap hooks are provided; the mapping_error and sync hooks are NULL.
+ */
+static const struct dma_mapping_ops gart_dma_ops = {
+       .mapping_error = NULL,
+       .map_single = gart_map_single,
+       .map_simple = gart_map_simple,
+       .unmap_single = gart_unmap_single,
+       .sync_single_for_cpu = NULL,
+       .sync_single_for_device = NULL,
+       .sync_single_range_for_cpu = NULL,
+       .sync_single_range_for_device = NULL,
+       .sync_sg_for_cpu = NULL,
+       .sync_sg_for_device = NULL,
+       .map_sg = gart_map_sg,
+       .unmap_sg = gart_unmap_sg,
+};
+
+/*
+ * Clear bit 0 of config register 0x90 on every cached northbridge (the
+ * bit init_k8_gatt() sets).  Does nothing when no_agp is set and the GART
+ * DMA ops are not the ones in use.
+ */
+void gart_iommu_shutdown(void)
+{
+       struct pci_dev *dev;
+       int i;
+
+       if (no_agp && (dma_ops != &gart_dma_ops))
+               return;
+
+        for (i = 0; i < num_k8_northbridges; i++) {
+                u32 ctl;
+
+                dev = k8_northbridges[i];
+                pci_read_config_dword(dev, 0x90, &ctl);
+
+                ctl &= ~1;
+
+                pci_write_config_dword(dev, 0x90, ctl);
+        }
+}
+
+void __init gart_iommu_init(void)
+{ 
+       struct agp_kern_info info;
+       unsigned long aper_size;
+       unsigned long iommu_start;
+       unsigned long scratch;
+       long i;
+
+       if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
+               printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
+               return;
+       }
+
+#ifndef CONFIG_AGP_AMD64
+       no_agp = 1; 
+#else
+       /* Makefile puts PCI initialization via subsys_initcall first. */
+       /* Add other K8 AGP bridge drivers here */
+       no_agp = no_agp || 
+               (agp_amd64_init() < 0) || 
+               (agp_copy_info(agp_bridge, &info) < 0);
+#endif 
+
+       if (swiotlb)
+               return;
+
+       /* Did we detect a different HW IOMMU? */
+       if (iommu_detected && !iommu_aperture)
+               return;
+
+       if (no_iommu ||
+           (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
+           !iommu_aperture ||
+           (no_agp && init_k8_gatt(&info) < 0)) {
+               if (end_pfn > MAX_DMA32_PFN) {
+                       printk(KERN_ERR "WARNING more than 4GB of memory "
+                                       "but GART IOMMU not available.\n"
+                              KERN_ERR "WARNING 32bit PCI may malfunction.\n");
+               }
+               return;
+       }
+
+       printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
+       aper_size = info.aper_size * 1024 * 1024;       
+       iommu_size = check_iommu_size(info.aper_base, aper_size); 
+       iommu_pages = iommu_size >> PAGE_SHIFT; 
+
+       iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, 
+                                                   get_order(iommu_pages/8)); 
+       if (!iommu_gart_bitmap) 
+               panic("Cannot allocate iommu bitmap\n"); 
+       memset(iommu_gart_bitmap, 0, iommu_pages/8);
+
+#ifdef CONFIG_IOMMU_LEAK
+       if (leak_trace) { 
+               iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 
+                                 get_order(iommu_pages*sizeof(void *)));
+               if (iommu_leak_tab) 
+                       memset(iommu_leak_tab, 0, iommu_pages * 8); 
+               else
+                       printk("PCI-DMA: Cannot allocate leak trace area\n"); 
+       } 
+#endif
+
+       /* 
+        * Out of IOMMU space handling.
+        * Reserve some invalid pages at the beginning of the GART. 
+        */ 
+       set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 
+
+       agp_memory_reserved = iommu_size;       
+       printk(KERN_INFO
+              "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+              iommu_size>>20); 
+
+       iommu_start = aper_size - iommu_size;   
+       iommu_bus_base = info.aper_base + iommu_start; 
+       bad_dma_address = iommu_bus_base;
+       iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+       /* 
+        * Unmap the IOMMU part of the GART. The alias of the page is
+        * always mapped with cache enabled and there is no full cache
+        * coherency across the GART remapping. The unmapping avoids
+        * automatic prefetches from the CPU allocating cache lines in
+        * there. All CPU accesses are done via the direct mapping to
+        * the backing memory. The GART address is only used by PCI
+        * devices. 
+        */
+       clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
+
+       /* 
+        * Try to workaround a bug (thanks to BenH) 
+        * Set unmapped entries to a scratch page instead of 0. 
+        * Any prefetches that hit unmapped entries won't get an bus abort
+        * then.
+        */
+       scratch = get_zeroed_page(GFP_KERNEL); 
+       if (!scratch) 
+               panic("Cannot allocate iommu scratch page");
+       gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
+       for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 
+               iommu_gatt_base[i] = gart_unmapped_entry;
+
+       flush_gart();
+       dma_ops = &gart_dma_ops;
+} 
+
+/*
+ * Parse one GART related "iommu=" option starting at @p.
+ *
+ * Recognized here: "leak[=<pages>]" (CONFIG_IOMMU_LEAK only), a leading
+ * number (stored in iommu_size), "fullflush", "nofullflush", "noagp",
+ * "noaperture", "force", "allowed" and "memaper[=<order>]".  Anything
+ * else is ignored.
+ */
+void __init gart_parse_options(char *p)
+{
+       int arg;
+
+#ifdef CONFIG_IOMMU_LEAK
+       if (!strncmp(p,"leak",4)) {
+               leak_trace = 1;
+               p += 4;
+               if (*p == '=') ++p;
+               if (isdigit(*p) && get_option(&p, &arg))
+                       iommu_leak_pages = arg;
+       }
+#endif
+       if (isdigit(*p) && get_option(&p, &arg))
+               iommu_size = arg;
+       /* Compare all 9 characters of the keyword. */
+       if (!strncmp(p, "fullflush",9))
+               iommu_fullflush = 1;
+       if (!strncmp(p, "nofullflush",11))
+               iommu_fullflush = 0;
+       if (!strncmp(p,"noagp",5))
+               no_agp = 1;
+       if (!strncmp(p, "noaperture",10))
+               fix_aperture = 0;
+       /* duplicated from pci-dma.c */
+       if (!strncmp(p,"force",5))
+               iommu_aperture_allowed = 1;
+       if (!strncmp(p,"allowed",7))
+               iommu_aperture_allowed = 1;
+       if (!strncmp(p, "memaper", 7)) {
+               fallback_aper_force = 1;
+               p += 7;
+               if (*p == '=') {
+                       ++p;
+                       if (get_option(&p, &arg))
+                               fallback_aper_order = arg;
+               }
+       }
+}
diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu_64.c
new file mode 100644 (file)
index 0000000..2a34c6c
--- /dev/null
@@ -0,0 +1,97 @@
+/* Fallback functions when the main IOMMU code is not compiled in. This
+   code is roughly equivalent to i386. */
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/iommu.h>
+#include <asm/processor.h>
+#include <asm/dma.h>
+
+/*
+ * Check that [bus, bus + size) lies within the DMA mask of @hwdev.
+ * Returns 1 if it does (or if @hwdev is NULL), 0 otherwise.  A failure is
+ * only logged when the mask is at least DMA_32BIT_MASK; @name identifies
+ * the caller in that message.
+ */
+static int
+check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
+{
+       u64 limit;
+
+       if (!hwdev)
+               return 1;
+
+       limit = *hwdev->dma_mask;
+       if (bus + size <= limit)
+               return 1;
+
+       if (limit >= DMA_32BIT_MASK)
+               printk(KERN_ERR
+                   "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
+                       name, (long long)bus, size,
+                       (long long)limit);
+       return 0;
+}
+
+/*
+ * Map a single buffer without an IOMMU: the bus address is simply
+ * virt_to_bus(ptr).  Returns bad_dma_address if the device cannot reach it.
+ */
+static dma_addr_t
+nommu_map_single(struct device *hwdev, void *ptr, size_t size,
+              int direction)
+{
+       dma_addr_t bus = virt_to_bus(ptr);
+
+       return check_addr("map_single", hwdev, bus, size) ?
+               bus : bad_dma_address;
+}
+
+/* Nothing to undo: nommu_map_single() sets up no mapping state. */
+static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
+                       int direction)
+{
+}
+
+/* Map a set of buffers described by scatterlist in streaming
+ * mode for DMA.  This is the scatter-gather version of the
+ * above pci_map_single interface.  Here the scatter gather list
+ * elements are each tagged with the appropriate dma address
+ * and length.  They are obtained via sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for pci_map_single are
+ * the same here.
+ */
+static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
+              int nents, int direction)
+{
+       int i;
+
+       for (i = 0; i < nents; i++ ) {
+               struct scatterlist *s = &sg[i];
+               BUG_ON(!s->page);
+               /* The bus address is derived directly from the page's kernel address. */
+               s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
+               /* Give up with 0 as soon as one entry is out of the device's reach. */
+               if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
+                       return 0;
+               s->dma_length = s->length;
+       }
+       return nents;
+}
+
+/* Unmap a set of streaming mode DMA translations.
+ * Again, cpu read rules concerning calls here are the same as for
+ * pci_unmap_single() above.
+ */
+/* Intentionally empty: nommu_map_sg() only fills in addresses. */
+static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
+                 int nents, int dir)
+{
+}
+
+/* DMA mapping operations installed by no_iommu_init(). */
+const struct dma_mapping_ops nommu_dma_ops = {
+       .map_single = nommu_map_single,
+       .unmap_single = nommu_unmap_single,
+       .map_sg = nommu_map_sg,
+       .unmap_sg = nommu_unmap_sg,
+       .is_phys = 1,
+};
+
+/*
+ * Install the nommu DMA operations unless some dma_ops have already been
+ * set, and clear force_iommu in that case.
+ */
+void __init no_iommu_init(void)
+{
+       /* Another backend has already installed its ops. */
+       if (dma_ops)
+               return;
+
+       force_iommu = 0; /* no HW IOMMU */
+       dma_ops = &nommu_dma_ops;
+}
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
new file mode 100644 (file)
index 0000000..b2f405e
--- /dev/null
@@ -0,0 +1,44 @@
+/* Glue code to lib/swiotlb.c */
+
+#include <linux/pci.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/iommu.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+
+int swiotlb __read_mostly;
+EXPORT_SYMBOL(swiotlb);
+
+/*
+ * DMA mapping operations installed by pci_swiotlb_init(): every hook
+ * forwards to the corresponding swiotlb_* function, except dma_supported
+ * which is left NULL.
+ */
+const struct dma_mapping_ops swiotlb_dma_ops = {
+       .mapping_error = swiotlb_dma_mapping_error,
+       .alloc_coherent = swiotlb_alloc_coherent,
+       .free_coherent = swiotlb_free_coherent,
+       .map_single = swiotlb_map_single,
+       .unmap_single = swiotlb_unmap_single,
+       .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+       .sync_single_for_device = swiotlb_sync_single_for_device,
+       .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
+       .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
+       .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+       .sync_sg_for_device = swiotlb_sync_sg_for_device,
+       .map_sg = swiotlb_map_sg,
+       .unmap_sg = swiotlb_unmap_sg,
+       .dma_supported = NULL,
+};
+
+/*
+ * Decide whether software bounce buffering is used and, if so, initialize
+ * it and install swiotlb_dma_ops.  swiotlb is switched on when it is
+ * forced, or when memory reaches above MAX_DMA32_PFN and neither a
+ * hardware IOMMU was detected nor no_iommu is set.
+ */
+void __init pci_swiotlb_init(void)
+{
+       /* don't initialize swiotlb if iommu=off (no_iommu=1) */
+       if (swiotlb_force ||
+           (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN))
+               swiotlb = 1;
+
+       if (!swiotlb)
+               return;
+
+       printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+       swiotlb_init();
+       dma_ops = &swiotlb_dma_ops;
+}
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
new file mode 100644 (file)
index 0000000..ae8f912
--- /dev/null
@@ -0,0 +1,69 @@
+/* Ported over from i386 by AK, original copyright was:
+ *
+ * (C) Dominik Brodowski <linux@brodo.de> 2003
+ *
+ * Driver to use the Power Management Timer (PMTMR) available in some
+ * southbridges as primary timing source for the Linux kernel.
+ *
+ * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
+ * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
+ *
+ * This file is licensed under the GPL v2.
+ *
+ * Dropped all the hardware bug workarounds for now. Hopefully they
+ * are not needed on 64bit chipsets.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/cpumask.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+#include <asm/msr.h>
+#include <asm/vsyscall.h>
+
+#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
+
+/*
+ * Convert a PM timer tick count to microseconds.
+ *
+ * The timer runs at 3.579545 ticks per microsecond, so one tick is
+ * 0.27936511 us, approximated here as 286/1024 (error: 0.024%).
+ * The multiplication is done in 32 bits; with HZ = 100 a delta is at most
+ * 35796 ticks, far below the point where it would overflow.
+ */
+static inline u32 cyc2us(u32 cycles)
+{
+       u32 scaled = cycles * 286;
+
+       return scaled >> 10;
+}
+
+/*
+ * Spin until the PM timer value (masked to 24 bits) changes and return
+ * the new value.
+ */
+static unsigned pmtimer_wait_tick(void)
+{
+       u32 a, b;
+       /* a keeps the first reading; b is re-read until it differs. */
+       for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
+            a == b;
+            b = inl(pmtmr_ioport) & ACPI_PM_MASK)
+               cpu_relax();
+       return b;
+}
+
+/*
+ * Busy-wait for at least @us microseconds using the PM timer.
+ * note: wait time is rounded up to one tick
+ *
+ * Readings and their difference are kept within ACPI_PM_MASK (24 bits),
+ * matching pmtimer_wait_tick().  Only meant for short waits: cyc2us()
+ * works in 32 bits and the masked delta wraps after 2^24 ticks.
+ */
+void pmtimer_wait(unsigned us)
+{
+       u32 a, b;
+       a = pmtimer_wait_tick();
+       do {
+               b = inl(pmtmr_ioport) & ACPI_PM_MASK;
+               cpu_relax();
+               /*
+                * Mask the delta too, so a counter wrap between the two
+                * readings still yields the elapsed tick count.
+                */
+       } while (cyc2us((b - a) & ACPI_PM_MASK) < us);
+}
+
+/*
+ * Handler for the "nopmtimer" boot option: clears pmtmr_ioport.
+ * The argument string is ignored.  Returns 1.
+ */
+static int __init nopmtimer_setup(char *s)
+{
+       pmtmr_ioport = 0;
+       return 1;
+}
+
+__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
new file mode 100644 (file)
index 0000000..9895655
--- /dev/null
@@ -0,0 +1,903 @@
+/*
+ *  linux/arch/x86-64/kernel/process.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ * 
+ *  X86-64 port
+ *     Andi Kleen.
+ *
+ *     CPU hotplug support - ashok.raj@intel.com
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#include <stdarg.h>
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/module.h>
+#include <linux/a.out.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/ptrace.h>
+#include <linux/utsname.h>
+#include <linux/random.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/mmu_context.h>
+#include <asm/pda.h>
+#include <asm/prctl.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/ia32.h>
+#include <asm/idle.h>
+
+/* Defined outside this file (presumably in the entry assembly - confirm). */
+asmlinkage extern void ret_from_fork(void);
+
+/* Clone flags, presumably applied when creating kernel threads - confirm. */
+unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
+
+/* Set to 1 by idle_setup() when a valid "idle=" boot option was given. */
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Powermanagement idle function, if any..
+ * cpu_idle() falls back to default_idle() while this is NULL.
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+/* Per-CPU flag: set by cpu_idle_wait(), cleared by cpu_idle() on that CPU. */
+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
+
+/* Notifier chain called with IDLE_START / IDLE_END. */
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+/* Add notifier block @n to the idle notifier chain. */
+void idle_notifier_register(struct notifier_block *n)
+{
+       atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+/*
+ * Remove notifier block @n from the idle notifier chain.
+ * NOTE(review): exported with EXPORT_SYMBOL while the register side uses
+ * EXPORT_SYMBOL_GPL - confirm whether the asymmetry is intended.
+ */
+void idle_notifier_unregister(struct notifier_block *n)
+{
+       atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL(idle_notifier_unregister);
+
+/*
+ * Mark this CPU as idle in its PDA (isidle = 1) and then call the idle
+ * notifier chain with IDLE_START.
+ */
+void enter_idle(void)
+{
+       write_pda(isidle, 1);
+       atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+}
+
+/*
+ * Clear the PDA isidle bit; only if it was actually set, tell the idle
+ * notifier chain that idle has ended (IDLE_END).
+ */
+static void __exit_idle(void)
+{
+       if (test_and_clear_bit_pda(0, isidle) != 0)
+               atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+}
+
+/*
+ * Called from interrupts to signify idle end.
+ * Only acts when the interrupted task is the idle loop, which has pid 0.
+ */
+void exit_idle(void)
+{
+       if (!current->pid)
+               __exit_idle();
+}
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ *
+ * Clears TS_POLLING in the thread status around the halt and sets it
+ * again before returning. With interrupts disabled it re-checks
+ * need_resched(): if nothing is pending it halts via safe_halt(),
+ * otherwise it only re-enables interrupts.
+ */
+static void default_idle(void)
+{
+       current_thread_info()->status &= ~TS_POLLING;
+       /*
+        * TS_POLLING-cleared state must be visible before we
+        * test NEED_RESCHED:
+        */
+       smp_mb();
+       local_irq_disable();
+       if (!need_resched()) {
+               /* Enables interrupts one instruction before HLT.
+                  x86 special cases this so there is no race. */
+               safe_halt();
+       } else
+               local_irq_enable();
+       current_thread_info()->status |= TS_POLLING;
+}
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ *
+ * One polling step: enable interrupts and execute cpu_relax(). The
+ * need_resched test itself is done by the caller's loop in cpu_idle().
+ */
+static void poll_idle (void)
+{
+       local_irq_enable();
+       cpu_relax();
+}
+
+/*
+ * Wait until every online CPU has gone through its idle loop once.
+ *
+ * The caller is pinned to its current CPU for the duration. The per-CPU
+ * cpu_idle_state flag is set for each online CPU (and cleared again for
+ * the local one); cpu_idle() clears the flag on its own CPU. The loop
+ * sleeps one second at a time and drops CPUs from the wait map once their
+ * flag is clear or they are no longer online. The original affinity mask
+ * is restored before returning.
+ */
+void cpu_idle_wait(void)
+{
+       unsigned int cpu, this_cpu = get_cpu();
+       cpumask_t map, tmp = current->cpus_allowed;
+
+       /* stay on this CPU while the flags are being set up and polled */
+       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+       put_cpu();
+
+       cpus_clear(map);
+       for_each_online_cpu(cpu) {
+               per_cpu(cpu_idle_state, cpu) = 1;
+               cpu_set(cpu, map);
+       }
+
+       /* the local CPU is running this code, so its flag is cleared here */
+       __get_cpu_var(cpu_idle_state) = 0;
+
+       wmb();
+       do {
+               ssleep(1);
+               for_each_online_cpu(cpu) {
+                       if (cpu_isset(cpu, map) &&
+                                       !per_cpu(cpu_idle_state, cpu))
+                               cpu_clear(cpu, map);
+               }
+               /* stop waiting for CPUs that went offline meanwhile */
+               cpus_and(map, map, cpu_online_map);
+       } while (!cpus_empty(map));
+
+       set_cpus_allowed(current, tmp);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+#ifdef CONFIG_HOTPLUG_CPU
+DECLARE_PER_CPU(int, cpu_state);
+
+#include <asm/nmi.h>
+/*
+ * We halt the CPU with physical CPU hotplug.
+ *
+ * Called from cpu_idle() when the current CPU is offline: leaves the idle
+ * task context, writes back and invalidates the caches, marks the CPU as
+ * CPU_DEAD in its per-CPU cpu_state, then halts forever with interrupts
+ * disabled. Never returns.
+ */
+static inline void play_dead(void)
+{
+       idle_task_exit();
+       wbinvd();
+       mb();
+       /* Ack it */
+       __get_cpu_var(cpu_state) = CPU_DEAD;
+
+       local_irq_disable();
+       while (1)
+               halt();
+}
+#else
+/* Without CPU hotplug an idle CPU must never be offline. */
+static inline void play_dead(void)
+{
+       BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ *
+ * Inner loop, while no reschedule is pending: acknowledge a pending
+ * cpu_idle_wait() request, pick the idle routine (pm_idle or
+ * default_idle), go play_dead() if this CPU is offline, otherwise call the
+ * idle routine bracketed by enter_idle()/__exit_idle(). Once a reschedule
+ * is needed, schedule() is called. Never returns.
+ */
+void cpu_idle (void)
+{
+       current_thread_info()->status |= TS_POLLING;
+       /* endless idle loop with no priority at all */
+       while (1) {
+               while (!need_resched()) {
+                       void (*idle)(void);
+
+                       /* tell cpu_idle_wait() this CPU passed through idle */
+                       if (__get_cpu_var(cpu_idle_state))
+                               __get_cpu_var(cpu_idle_state) = 0;
+
+                       rmb();
+                       idle = pm_idle;
+                       if (!idle)
+                               idle = default_idle;
+                       if (cpu_is_offline(smp_processor_id()))
+                               play_dead();
+                       /*
+                        * Idle routines should keep interrupts disabled
+                        * from here on, until they go to idle.
+                        * Otherwise, idle callbacks can misfire.
+                        */
+                       local_irq_disable();
+                       enter_idle();
+                       idle();
+                       /* In many cases the interrupt that ended idle
+                          has already called exit_idle. But some idle
+                          loops can be woken up without interrupt. */
+                       __exit_idle();
+               }
+
+               preempt_enable_no_resched();
+               schedule();
+               preempt_disable();
+       }
+}
+
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ *
+ * @eax, @ecx: values handed unchanged to __mwait().
+ *
+ * need_resched() is checked both before arming the monitor and again
+ * after it, so a request raised in between is not slept through. Unlike
+ * mwait_idle() this function does not touch the interrupt flag.
+ */
+void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+       if (!need_resched()) {
+               __monitor((void *)&current_thread_info()->flags, 0, 0);
+               smp_mb();
+               if (!need_resched())
+                       __mwait(eax, ecx);
+       }
+}
+
+/*
+ * Default MONITOR/MWAIT with no hints, used for default C1 state.
+ *
+ * Installed as pm_idle by select_idle_routine(). Every path through this
+ * function ends with interrupts enabled: either via __sti_mwait() when the
+ * CPU actually waits, or via local_irq_enable() when a reschedule is
+ * already pending.
+ */
+static void mwait_idle(void)
+{
+       if (!need_resched()) {
+               __monitor((void *)&current_thread_info()->flags, 0, 0);
+               smp_mb();
+               if (!need_resched())
+                       __sti_mwait(0, 0);
+               else
+                       local_irq_enable();
+       } else {
+               local_irq_enable();
+       }
+}
+
+/*
+ * Install mwait_idle() as the idle routine if CPU @c supports MWAIT and
+ * no idle routine has been chosen yet. The informational message is
+ * printed only the first time.
+ */
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
+       static int printed;
+
+       if (!cpu_has(c, X86_FEATURE_MWAIT))
+               return;
+
+       /*
+        * Skip, if setup has overridden idle.
+        * One CPU supports mwait => All CPUs supports mwait
+        */
+       if (pm_idle)
+               return;
+
+       if (!printed) {
+               printk(KERN_INFO "using mwait in idle threads.\n");
+               printed = 1;
+       }
+       pm_idle = mwait_idle;
+}
+
+/*
+ * Parse the "idle=" boot parameter.
+ *   "poll"  - use poll_idle() as the idle routine
+ *   "mwait" - set force_mwait
+ * Either accepted value also sets boot_option_idle_override.
+ * Returns 0 for an accepted value, -1 for anything else.
+ */
+static int __init idle_setup (char *str)
+{
+       if (strcmp(str, "poll") == 0) {
+               printk("using polling idle threads.\n");
+               pm_idle = poll_idle;
+               boot_option_idle_override = 1;
+               return 0;
+       }
+
+       if (strcmp(str, "mwait") == 0) {
+               force_mwait = 1;
+               boot_option_idle_override = 1;
+               return 0;
+       }
+
+       return -1;
+}
+early_param("idle", idle_setup);
+
+/*
+ * Prints also some state that isn't saved in the pt_regs.
+ *
+ * @regs: saved register frame to print (RIP, RSP, EFLAGS, general
+ *        purpose registers).
+ *
+ * The segment selectors, FS/GS base MSRs, control registers and debug
+ * registers are read live from the executing CPU, not taken from @regs.
+ * The task information printed comes from `current`.
+ */
+void __show_regs(struct pt_regs * regs)
+{
+       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+       unsigned long d0, d1, d2, d3, d6, d7;
+       unsigned int fsindex,gsindex;
+       unsigned int ds,cs,es; 
+
+       printk("\n");
+       print_modules();
+       /* only the first word of the version string is printed */
+       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+               current->pid, current->comm, print_tainted(),
+               init_utsname()->release,
+               (int)strcspn(init_utsname()->version, " "),
+               init_utsname()->version);
+       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
+       printk_address(regs->rip); 
+       printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
+               regs->eflags);
+       printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+              regs->rax, regs->rbx, regs->rcx);
+       printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+              regs->rdx, regs->rsi, regs->rdi); 
+       printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+              regs->rbp, regs->r8, regs->r9); 
+       printk("R10: %016lx R11: %016lx R12: %016lx\n",
+              regs->r10, regs->r11, regs->r12); 
+       printk("R13: %016lx R14: %016lx R15: %016lx\n",
+              regs->r13, regs->r14, regs->r15); 
+
+       /* live segment selectors of the executing CPU */
+       asm("movl %%ds,%0" : "=r" (ds)); 
+       asm("movl %%cs,%0" : "=r" (cs)); 
+       asm("movl %%es,%0" : "=r" (es)); 
+       asm("movl %%fs,%0" : "=r" (fsindex));
+       asm("movl %%gs,%0" : "=r" (gsindex));
+
+       rdmsrl(MSR_FS_BASE, fs);
+       rdmsrl(MSR_GS_BASE, gs); 
+       rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 
+
+       cr0 = read_cr0();
+       cr2 = read_cr2();
+       cr3 = read_cr3();
+       cr4 = read_cr4();
+
+       printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
+              fs,fsindex,gs,gsindex,shadowgs); 
+       printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 
+       printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+
+       get_debugreg(d0, 0);
+       get_debugreg(d1, 1);
+       get_debugreg(d2, 2);
+       printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+       get_debugreg(d3, 3);
+       get_debugreg(d6, 6);
+       get_debugreg(d7, 7);
+       printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+}
+
+/*
+ * Print the current CPU number, the register dump for @regs and a stack
+ * trace. The trace is started at the address directly behind the pt_regs
+ * structure (regs + 1).
+ */
+void show_regs(struct pt_regs *regs)
+{
+       printk("CPU %d:", smp_processor_id());
+       __show_regs(regs);
+       show_trace(NULL, regs, (void *)(regs + 1));
+}
+
+/*
+ * Free current thread data structures etc..
+ *
+ * If the exiting task owns an I/O permission bitmap, release it, drop
+ * TIF_IO_BITMAP and revoke the ports in this CPU's TSS copy as well.
+ */
+void exit_thread(void)
+{
+       struct thread_struct *thread = &current->thread;
+       struct tss_struct *tss;
+
+       if (!thread->io_bitmap_ptr)
+               return;
+
+       /* get_cpu() pins us to this CPU until the matching put_cpu() */
+       tss = &per_cpu(init_tss, get_cpu());
+
+       kfree(thread->io_bitmap_ptr);
+       thread->io_bitmap_ptr = NULL;
+       clear_thread_flag(TIF_IO_BITMAP);
+
+       /* Careful, clear this in the TSS too: */
+       memset(tss->io_bitmap, 0xff, thread->io_bitmap_max);
+       thread->io_bitmap_max = 0;
+
+       put_cpu();
+}
+
+/*
+ * Reset the per-thread architecture state of the current task: apply a
+ * pending ABI switch, drop debug register state, clear the TLS
+ * descriptors and forget the FPU state.
+ */
+void flush_thread(void)
+{
+       struct task_struct *tsk = current;
+       struct thread_struct *thread = &tsk->thread;
+
+       /* A pending ABI change toggles the task between 64bit and IA32. */
+       if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
+               clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
+               if (!test_tsk_thread_flag(tsk, TIF_IA32)) {
+                       set_tsk_thread_flag(tsk, TIF_IA32);
+                       current_thread_info()->status |= TS_COMPAT;
+               } else {
+                       clear_tsk_thread_flag(tsk, TIF_IA32);
+               }
+       }
+       clear_tsk_thread_flag(tsk, TIF_DEBUG);
+
+       thread->debugreg0 = 0;
+       thread->debugreg1 = 0;
+       thread->debugreg2 = 0;
+       thread->debugreg3 = 0;
+       thread->debugreg6 = 0;
+       thread->debugreg7 = 0;
+       memset(thread->tls_array, 0, sizeof(thread->tls_array));
+
+       /* Forget coprocessor state.. */
+       clear_fpu(tsk);
+       clear_used_math();
+}
+
+/*
+ * Sanity check on a dead task: if it still has an mm whose LDT size is
+ * non-zero, print a warning and BUG().
+ */
+void release_thread(struct task_struct *dead_task)
+{
+       struct mm_struct *mm = dead_task->mm;
+
+       if (!mm || !mm->context.size)
+               return;
+
+       printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
+                       dead_task->comm,
+                       mm->context.ldt,
+                       mm->context.size);
+       BUG();
+}
+
+/*
+ * Fill TLS slot @tls of task @t with a 32bit, page-granular segment
+ * descriptor (limit 0xfffff) whose base is @addr.
+ */
+static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
+{
+       struct n_desc_struct *slot;
+       struct user_desc ud = {
+               .base_addr = addr,
+               .limit = 0xfffff,
+               .seg_32bit = 1,
+               .limit_in_pages = 1,
+               .useable = 1,
+       };
+
+       slot = &((struct n_desc_struct *)t->thread.tls_array)[tls];
+       slot->a = LDT_entry_a(&ud);
+       slot->b = LDT_entry_b(&ud);
+}
+
+/*
+ * Return the 32bit segment base stored in TLS slot @tls of task @t,
+ * reassembled from the descriptor's base0/base1/base2 fields.
+ */
+static inline u32 read_32bit_tls(struct task_struct *t, int tls)
+{
+       struct desc_struct *slot =
+               &((struct desc_struct *)t->thread.tls_array)[tls];
+       u32 base;
+
+       base  = slot->base0;
+       base |= ((u32)slot->base1) << 16;
+       base |= ((u32)slot->base2) << 24;
+       return base;
+}
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ *
+ * Only calls unlazy_fpu() on @tsk (presumably so that the FPU state held
+ * in registers is saved into the task before it is duplicated - confirm).
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+       unlazy_fpu(tsk);
+}
+
+/*
+ * Set up the architecture specific thread state of a newly forked task.
+ *
+ * @nr, @unused: not used by this function.
+ * @clone_flags: only CLONE_SETTLS is examined here.
+ * @rsp:         stack pointer for the child; ~0UL selects the child's own
+ *               register frame on its kernel stack.
+ * @p:           the new task.
+ * @regs:        parent register frame, copied to the top of the child's
+ *               kernel stack with rax forced to 0.
+ *
+ * Returns 0 on success, -ENOMEM if the I/O bitmap copy cannot be
+ * allocated, or the error from setting up the child's TLS.
+ */
+int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 
+               unsigned long unused,
+       struct task_struct * p, struct pt_regs * regs)
+{
+       int err;
+       struct pt_regs * childregs;
+       struct task_struct *me = current;
+
+       /* the child's pt_regs sit at the very top of its kernel stack */
+       childregs = ((struct pt_regs *)
+                       (THREAD_SIZE + task_stack_page(p))) - 1;
+       *childregs = *regs;
+
+       childregs->rax = 0;
+       childregs->rsp = rsp;
+       if (rsp == ~0UL)
+               childregs->rsp = (unsigned long)childregs;
+
+       p->thread.rsp = (unsigned long) childregs;
+       p->thread.rsp0 = (unsigned long) (childregs+1);
+       p->thread.userrsp = me->thread.userrsp; 
+
+       set_tsk_thread_flag(p, TIF_FORK);
+
+       p->thread.fs = me->thread.fs;
+       p->thread.gs = me->thread.gs;
+
+       /* segment selectors are sampled from the live CPU registers */
+       asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
+       asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
+       asm("mov %%es,%0" : "=m" (p->thread.es));
+       asm("mov %%ds,%0" : "=m" (p->thread.ds));
+
+       /* give the child its own copy of the parent's I/O bitmap */
+       if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
+               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+               if (!p->thread.io_bitmap_ptr) {
+                       p->thread.io_bitmap_max = 0;
+                       return -ENOMEM;
+               }
+               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+                               IO_BITMAP_BYTES);
+               set_tsk_thread_flag(p, TIF_IO_BITMAP);
+       } 
+
+       /*
+        * Set a new TLS for the child thread?
+        */
+       if (clone_flags & CLONE_SETTLS) {
+#ifdef CONFIG_IA32_EMULATION
+               if (test_thread_flag(TIF_IA32))
+                       err = ia32_child_tls(p, childregs); 
+               else                    
+#endif  
+                       err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
+               if (err) 
+                       goto out;
+       }
+       err = 0;
+out:
+       /*
+        * NOTE(review): on failure the bitmap is freed but io_bitmap_ptr is
+        * not reset to NULL and TIF_IO_BITMAP stays set on @p - confirm that
+        * no later cleanup path looks at them.
+        */
+       if (err && p->thread.io_bitmap_ptr) {
+               kfree(p->thread.io_bitmap_ptr);
+               p->thread.io_bitmap_max = 0;
+       }
+       return err;
+}
+
+/*
+ * This special macro can be used to load a debugging register:
+ * loaddebug(thread, N) writes thread->debugregN into debug register N.
+ */
+#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
+
+/*
+ * Rarely needed part of the context switch: reload the debug registers
+ * of the incoming task and bring the I/O permission bitmap in @tss in
+ * line with the incoming task.
+ */
+static inline void __switch_to_xtra(struct task_struct *prev_p,
+                                   struct task_struct *next_p,
+                                   struct tss_struct *tss)
+{
+       struct thread_struct *prev = &prev_p->thread;
+       struct thread_struct *next = &next_p->thread;
+
+       if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+               set_debugreg(next->debugreg0, 0);
+               set_debugreg(next->debugreg1, 1);
+               set_debugreg(next->debugreg2, 2);
+               set_debugreg(next->debugreg3, 3);
+               /* no 4 and 5 */
+               set_debugreg(next->debugreg6, 6);
+               set_debugreg(next->debugreg7, 7);
+       }
+
+       if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+               /*
+                * Copy the relevant range of the IO bitmap.
+                * Normally this is 128 bytes or less:
+                */
+               memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+                      max(prev->io_bitmap_max, next->io_bitmap_max));
+               return;
+       }
+
+       /* Incoming task has no bitmap: clear any possible leftover bits. */
+       if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
+               memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+}
+
+/*
+ *     switch_to(x,y) should switch tasks from x to y.
+ *
+ * This could still be optimized:
+ * - fold all the options into a flag word and test it with a single test.
+ * - could test fs/gs bitsliced
+ *
+ * Kprobes not supported here. Set the probe on schedule instead.
+ *
+ * @prev_p: task being switched out.
+ * @next_p: task being switched in.
+ * Returns @prev_p.
+ *
+ * In order: update the TSS kernel stack pointer, switch ES/DS, load the
+ * TLS descriptors, switch FS/GS selectors and bases, save the outgoing
+ * FPU state, switch the PDA fields, handle debug registers / I/O bitmap
+ * when needed, and optionally restore the FPU state eagerly.
+ */
+__kprobes struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+       struct thread_struct *prev = &prev_p->thread,
+                                *next = &next_p->thread;
+       int cpu = smp_processor_id();  
+       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+       /* we're going to use this soon, after a few expensive things */
+       if (next_p->fpu_counter>5)
+               prefetch(&next->i387.fxsave);
+
+       /*
+        * Reload esp0, LDT and the page table pointer:
+        */
+       tss->rsp0 = next->rsp0;
+
+       /* 
+        * Switch DS and ES.
+        * This won't pick up thread selector changes, but I guess that is ok.
+        */
+       asm volatile("mov %%es,%0" : "=m" (prev->es));
+       if (unlikely(next->es | prev->es))
+               loadsegment(es, next->es); 
+       
+       asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
+       if (unlikely(next->ds | prev->ds))
+               loadsegment(ds, next->ds);
+
+       load_TLS(next, cpu);
+
+       /* 
+        * Switch FS and GS.
+        */
+       { 
+               unsigned fsindex;
+               asm volatile("movl %%fs,%0" : "=r" (fsindex)); 
+               /* segment register != 0 always requires a reload. 
+                  also reload when it has changed. 
+                  when prev process used 64bit base always reload
+                  to avoid an information leak. */
+               if (unlikely(fsindex | next->fsindex | prev->fs)) {
+                       loadsegment(fs, next->fsindex);
+                       /* check if the user used a selector != 0
+                        * if yes clear 64bit base, since overloaded base
+                         * is always mapped to the Null selector
+                         */
+                       if (fsindex)
+                       prev->fs = 0;                           
+               }
+               /* when next process has a 64bit base use it */
+               if (next->fs) 
+                       wrmsrl(MSR_FS_BASE, next->fs); 
+               prev->fsindex = fsindex;
+       }
+       /* same scheme for GS; the base is written to MSR_KERNEL_GS_BASE */
+       { 
+               unsigned gsindex;
+               asm volatile("movl %%gs,%0" : "=r" (gsindex)); 
+               if (unlikely(gsindex | next->gsindex | prev->gs)) {
+                       load_gs_index(next->gsindex);
+                       if (gsindex)
+                       prev->gs = 0;                           
+               }
+               if (next->gs)
+                       wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
+               prev->gsindex = gsindex;
+       }
+
+       /* Must be after DS reload */
+       unlazy_fpu(prev_p);
+
+       /* 
+        * Switch the PDA and FPU contexts.
+        */
+       prev->userrsp = read_pda(oldrsp); 
+       write_pda(oldrsp, next->userrsp); 
+       write_pda(pcurrent, next_p); 
+
+       write_pda(kernelstack,
+       (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+#ifdef CONFIG_CC_STACKPROTECTOR
+       write_pda(stack_canary, next_p->stack_canary);
+       /*
+        * Build time only check to make sure the stack_canary is at
+        * offset 40 in the pda; this is a gcc ABI requirement
+        */
+       BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
+#endif
+
+       /*
+        * Now maybe reload the debug registers and handle I/O bitmaps
+        */
+       if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
+           || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
+               __switch_to_xtra(prev_p, next_p, tss);
+
+       /* If the task has used fpu the last 5 timeslices, just do a full
+        * restore of the math state immediately to avoid the trap; the
+        * chances of needing FPU soon are obviously high now
+        */
+       if (next_p->fpu_counter>5)
+               math_state_restore();
+       return prev_p;
+}
+
+/*
+ * sys_execve() executes a new program.
+ *
+ * Copies the path name in from user space, hands it to do_execve() and,
+ * on success, clears PT_DTRACE in current->ptrace under the task lock.
+ * Returns 0 on success or a negative error code.
+ */
+asmlinkage
+long sys_execve(char __user *name, char __user * __user *argv,
+               char __user * __user *envp, struct pt_regs regs)
+{
+       char *filename = getname(name);
+       long error;
+
+       if (IS_ERR(filename))
+               return PTR_ERR(filename);
+
+       error = do_execve(filename, argv, envp, &regs);
+       if (!error) {
+               task_lock(current);
+               current->ptrace &= ~PT_DTRACE;
+               task_unlock(current);
+       }
+
+       putname(filename);
+       return error;
+}
+
+/*
+ * Put the current task into 64bit mode: clears TIF_IA32 and removes
+ * READ_IMPLIES_EXEC from its personality. All other personality bits
+ * are left as they were.
+ */
+void set_personality_64bit(void)
+{
+       /* inherit personality from parent */
+
+       /* Make sure to be in 64bit mode */
+       clear_thread_flag(TIF_IA32); 
+
+       /* TBD: overwrites user setup. Should have two bits.
+          But 64bit processes have always behaved this way,
+          so it's not too bad. The main problem is just that
+          32bit children are affected again. */
+       current->personality &= ~READ_IMPLIES_EXEC;
+}
+
+/*
+ * fork(): do_fork() with SIGCHLD as the only flag; the child starts on
+ * the caller's current user stack pointer (regs->rsp).
+ */
+asmlinkage long sys_fork(struct pt_regs *regs)
+{
+       return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
+}
+
+/*
+ * clone(): forwards to do_fork(). A @newsp of 0 means the child keeps
+ * using the caller's current user stack pointer.
+ */
+asmlinkage long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+         void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+{
+       unsigned long child_sp = newsp ? newsp : regs->rsp;
+
+       return do_fork(clone_flags, child_sp, regs, 0, parent_tid, child_tid);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ *
+ * Implemented as do_fork() with CLONE_VFORK | CLONE_VM | SIGCHLD on the
+ * caller's current user stack pointer.
+ */
+asmlinkage long sys_vfork(struct pt_regs *regs)
+{
+       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
+                   NULL, NULL);
+}
+
+/*
+ * Find out where a sleeping task is waiting.
+ *
+ * Walks the saved frame pointer chain on @p's kernel stack (at most 17
+ * frames) and returns the first return address that is not inside the
+ * scheduler functions. Returns 0 if @p is NULL, is the current task, is
+ * runnable, or if the chain leaves the task's stack area.
+ *
+ * Every pointer is checked so that the words read through it (one u64 at
+ * the saved rsp; two u64s - saved frame pointer and return address - at
+ * each frame pointer) lie completely inside [stack, stack + THREAD_SIZE).
+ */
+unsigned long get_wchan(struct task_struct *p)
+{
+       unsigned long stack;
+       u64 fp,rip;
+       int count = 0;
+
+       if (!p || p == current || p->state==TASK_RUNNING)
+               return 0;
+       stack = (unsigned long)task_stack_page(p);
+       /* one u64 is read at rsp */
+       if (p->thread.rsp < stack ||
+           p->thread.rsp > stack+THREAD_SIZE-sizeof(u64))
+               return 0;
+       fp = *(u64 *)(p->thread.rsp);
+       do {
+               /* two u64s are read at fp: *fp and *(fp+8) */
+               if (fp < (unsigned long)stack ||
+                   fp > (unsigned long)stack+THREAD_SIZE-2*sizeof(u64))
+                       return 0;
+               rip = *(u64 *)(fp+8);
+               if (!in_sched_functions(rip))
+                       return rip;
+               fp = *(u64 *)fp;
+       } while (count++ < 16);
+       return 0;
+}
+
+/*
+ * Get or set the FS/GS segment base of @task.
+ *
+ * @code: ARCH_SET_GS / ARCH_SET_FS - @addr is the new base. Bases at or
+ *        above TASK_SIZE_OF(task) are refused with -EPERM. Bases that fit
+ *        in 32 bits are installed as a TLS descriptor, larger ones are
+ *        kept in thread.gs / thread.fs and written to the MSR.
+ *        ARCH_GET_FS / ARCH_GET_GS - @addr is a user pointer that
+ *        receives the current base.
+ *        Any other code yields -EINVAL.
+ *
+ * Hardware state is only touched when @task is the current task.
+ * Returns 0 on success or a negative error code.
+ */
+long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+{ 
+       int ret = 0; 
+       int doit = task == current;
+       int cpu;
+
+       switch (code) { 
+       case ARCH_SET_GS:
+               if (addr >= TASK_SIZE_OF(task))
+                       return -EPERM; 
+               cpu = get_cpu();
+               /* handle small bases via the GDT because that's faster to 
+                  switch. */
+               if (addr <= 0xffffffff) {  
+                       set_32bit_tls(task, GS_TLS, addr); 
+                       if (doit) { 
+                               load_TLS(&task->thread, cpu);
+                               load_gs_index(GS_TLS_SEL); 
+                       }
+                       task->thread.gsindex = GS_TLS_SEL; 
+                       task->thread.gs = 0;
+               } else { 
+                       task->thread.gsindex = 0;
+                       task->thread.gs = addr;
+                       if (doit) {
+                               load_gs_index(0);
+                               ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
+                       } 
+               }
+               put_cpu();
+               break;
+       case ARCH_SET_FS:
+               /* Not strictly needed for fs, but do it for symmetry
+                  with gs */
+               if (addr >= TASK_SIZE_OF(task))
+                       return -EPERM; 
+               cpu = get_cpu();
+               /* handle small bases via the GDT because that's faster to 
+                  switch. */
+               if (addr <= 0xffffffff) { 
+                       set_32bit_tls(task, FS_TLS, addr);
+                       if (doit) { 
+                               load_TLS(&task->thread, cpu); 
+                               asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
+                       }
+                       task->thread.fsindex = FS_TLS_SEL;
+                       task->thread.fs = 0;
+               } else { 
+                       task->thread.fsindex = 0;
+                       task->thread.fs = addr;
+                       if (doit) {
+                               /* set the selector to 0 to not confuse
+                                  __switch_to */
+                               asm volatile("movl %0,%%fs" :: "r" (0));
+                               ret = checking_wrmsrl(MSR_FS_BASE, addr);
+                       }
+               }
+               put_cpu();
+               break;
+       case ARCH_GET_FS: { 
+               unsigned long base; 
+               /* base source: TLS descriptor, live MSR, or saved value */
+               if (task->thread.fsindex == FS_TLS_SEL)
+                       base = read_32bit_tls(task, FS_TLS);
+               else if (doit)
+                       rdmsrl(MSR_FS_BASE, base);
+               else
+                       base = task->thread.fs;
+               ret = put_user(base, (unsigned long __user *)addr); 
+               break; 
+       }
+       case ARCH_GET_GS: { 
+               unsigned long base;
+               unsigned gsindex;
+               if (task->thread.gsindex == GS_TLS_SEL)
+                       base = read_32bit_tls(task, GS_TLS);
+               else if (doit) {
+                       asm("movl %%gs,%0" : "=r" (gsindex));
+                       if (gsindex)
+                               rdmsrl(MSR_KERNEL_GS_BASE, base);
+                       else
+                               base = task->thread.gs;
+               }
+               else
+                       base = task->thread.gs;
+               ret = put_user(base, (unsigned long __user *)addr); 
+               break;
+       }
+
+       default:
+               ret = -EINVAL;
+               break;
+       } 
+
+       return ret;     
+} 
+
+/* arch_prctl() system call: do_arch_prctl() applied to the current task. */
+long sys_arch_prctl(int code, unsigned long addr)
+{
+       return do_arch_prctl(current, code, addr);
+} 
+
+/*
+ * Capture the user space registers if the task is not running (in user space)
+ *
+ * Works on a private copy of the task's saved pt_regs with cs and ss
+ * reduced to their low 16 bits, converts it into @regs and returns 1.
+ */
+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
+{
+       struct pt_regs ptregs = *task_pt_regs(tsk);
+
+       ptregs.cs &= 0xffff;
+       ptregs.ss &= 0xffff;
+       elf_core_copy_regs(regs, &ptregs);
+
+       return 1;
+}
+
+/*
+ * Return @sp aligned down to 16 bytes. Unless the task's personality has
+ * ADDR_NO_RANDOMIZE set or randomize_va_space is off, the stack pointer
+ * is first lowered by a random amount below 8192 bytes.
+ */
+unsigned long arch_align_stack(unsigned long sp)
+{
+       if (current->personality & ADDR_NO_RANDOMIZE)
+               return sp & ~0xf;
+
+       if (randomize_va_space)
+               sp -= get_random_int() % 8192;
+
+       return sp & ~0xf;
+}
diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
new file mode 100644 (file)
index 0000000..eea3702
--- /dev/null
@@ -0,0 +1,627 @@
+/* ptrace.c */
+/* By Ross Biro 1/23/92 */
+/*
+ * Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ * 
+ * x86-64 port 2000-2002 Andi Kleen
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/security.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/debugreg.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/ia32.h>
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
+ * Also masks reserved bits (63-22, 15, 5, 3, 1).
+ */
+#define FLAG_MASK 0x54dd5UL
+
+/* Sets the trap flag. */
+#define TRAP_FLAG 0x100UL
+
+/*
+ * eflags and offset of eflags on child stack..
+ */
+#define EFLAGS offsetof(struct pt_regs, eflags)
+#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
+
+/*
+ * Read one word from a task's privileged stack.
+ *
+ * 'offset' is a byte displacement relative to task->thread.rsp0 (the base
+ * address as stored in the TSS).  The privileged stack is assumed to be
+ * directly addressable from our data space.
+ */
+static inline unsigned long get_stack_long(struct task_struct *task, int offset)
+{
+       unsigned char *slot = (unsigned char *)task->thread.rsp0 + offset;
+
+       return *(unsigned long *)slot;
+}
+
+/*
+ * Store one word on a task's privileged stack.
+ *
+ * 'offset' is a byte displacement relative to task->thread.rsp0 (the base
+ * address as stored in the TSS).  The privileged stack is assumed to be
+ * directly addressable from our data space.  Always returns 0.
+ */
+static inline long put_stack_long(struct task_struct *task, int offset,
+       unsigned long data)
+{
+       unsigned char *slot = (unsigned char *)task->thread.rsp0 + offset;
+
+       *(unsigned long *)slot = data;
+       return 0;
+}
+
+#define LDT_SEGMENT 4
+
+/*
+ * Translate the saved instruction pointer in 'regs' into a linear address.
+ *
+ * When the saved cs selector has the LDT bit (LDT_SEGMENT) set, the
+ * descriptor is looked up in the child's LDT under mm->context.sem and its
+ * base is added to rip; for a 16-bit code segment rip is first truncated
+ * to 16 bits.  Selectors that do not reference the LDT are treated as
+ * zero-based and rip is returned unchanged.
+ *
+ * Returns -1 when the selector indexes past the end of the LDT.
+ */
+unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
+{
+       unsigned long addr, seg;
+
+       addr = regs->rip;
+       seg = regs->cs & 0xffff;
+
+       /*
+        * We'll assume that the code segments in the GDT
+        * are all zero-based. That is largely true: the
+        * TLS segments are used for data, and the PNPBIOS
+        * and APM bios ones we just ignore here.
+        */
+       if (seg & LDT_SEGMENT) {
+               u32 *desc;
+               unsigned long base;
+
+               /* Drop the low three selector bits, leaving index * 8. */
+               seg &= ~7UL;
+
+               down(&child->mm->context.sem);
+               if (unlikely((seg >> 3) >= child->mm->context.size))
+                       addr = -1L; /* bogus selector, access would fault */
+               else {
+                       /*
+                        * NOTE(review): this relies on arithmetic on
+                        * context.ldt being done in bytes - confirm its
+                        * declared type.
+                        */
+                       desc = child->mm->context.ldt + seg;
+                       /*
+                        * Assemble the segment base from desc[0] bits 16-31,
+                        * desc[1] bits 0-7 and desc[1] bits 24-31.
+                        */
+                       base = ((desc[0] >> 16) |
+                               ((desc[1] & 0xff) << 16) |
+                               (desc[1] & 0xff000000));
+
+                       /* 16-bit code segment? */
+                       if (!((desc[1] >> 22) & 1))
+                               addr &= 0xffff;
+                       addr += base;
+               }
+               up(&child->mm->context.sem);
+       }
+
+       return addr;
+}
+
+/*
+ * Decide whether the instruction the child is about to execute is one
+ * that is treated as changing the trap flag itself (popf or iret).
+ *
+ * Up to 15 bytes are read from the child at the linear address of its
+ * saved rip.  Prefix bytes are skipped; the first non-prefix byte decides
+ * the answer.  Returns 1 for popf/iret, 0 for anything else, and 0 when
+ * fewer bytes could be read than were needed to reach a decision.
+ */
+static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
+{
+       int i, copied;
+       unsigned char opcode[15];
+       unsigned long addr = convert_rip_to_linear(child, regs);
+
+       /* Only the bytes actually copied are examined below. */
+       copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
+       for (i = 0; i < copied; i++) {
+               switch (opcode[i]) {
+               /* popf and iret */
+               case 0x9d: case 0xcf:
+                       return 1;
+
+                       /* CHECKME: 64 65 */
+
+               /* opcode and address size prefixes */
+               case 0x66: case 0x67:
+                       continue;
+               /* irrelevant prefixes (segment overrides and repeats) */
+               case 0x26: case 0x2e:
+               case 0x36: case 0x3e:
+               case 0x64: case 0x65:
+               case 0xf2: case 0xf3:
+                       continue;
+
+               /* Meaning depends on whether cs is the 64-bit user segment. */
+               case 0x40 ... 0x4f:
+                       if (regs->cs != __USER_CS)
+                               /* 32-bit mode: register increment */
+                               return 0;
+                       /* 64-bit mode: REX prefix */
+                       continue;
+
+                       /* CHECKME: f2, f3 */
+
+               /*
+                * pushf: NOTE! We should probably not let
+                * the user see the TF bit being set. But
+                * it's more pain than it's worth to avoid
+                * it, and a debugger could emulate this
+                * all in user space if it _really_ cares.
+                */
+               case 0x9c:
+               default:
+                       return 0;
+               }
+       }
+       /* Ran out of readable bytes while still skipping prefixes. */
+       return 0;
+}
+
+/*
+ * Arrange for 'child' to single-step.
+ *
+ * TIF_SINGLESTEP is always set, which guarantees that system calls etc.
+ * are single-stepped and causes TF to be set on return to user mode.
+ * If TF is not already set in the saved eflags it is set here, and
+ * PT_DTRACE records that it was us who set it - unless the instruction
+ * about to be traced changes TF itself, in which case we must not clear
+ * it by hand later.
+ */
+static void set_singlestep(struct task_struct *child)
+{
+       struct pt_regs *regs = task_pt_regs(child);
+
+       set_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+       if (!(regs->eflags & TRAP_FLAG)) {
+               /* Set TF on the kernel stack before inspecting the insn. */
+               regs->eflags |= TRAP_FLAG;
+
+               if (!is_setting_trap_flag(child, regs))
+                       child->ptrace |= PT_DTRACE;
+       }
+}
+
+/*
+ * Undo set_singlestep(): TIF_SINGLESTEP is always cleared, but TF in the
+ * saved eflags is cleared only when PT_DTRACE says we were the ones who
+ * set it (PT_DTRACE is dropped at the same time).
+ */
+static void clear_singlestep(struct task_struct *child)
+{
+       clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+       if (!(child->ptrace & PT_DTRACE))
+               return;
+
+       task_pt_regs(child)->eflags &= ~TRAP_FLAG;
+       child->ptrace &= ~PT_DTRACE;
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching..
+ *
+ * Make sure the single step bit is not set.  This simply delegates to
+ * clear_singlestep(), which clears TIF_SINGLESTEP and, if PT_DTRACE is
+ * set, the trap flag in the child's saved eflags.
+ */
+void ptrace_disable(struct task_struct *child)
+{ 
+       clear_singlestep(child);
+}
+
+/*
+ * Write one register of a traced child.
+ *
+ * 'regno' is a byte offset into struct user_regs_struct.  The fs/gs/ds/es
+ * selectors and the fs/gs bases are kept in child->thread and are stored
+ * there directly; every other register is written into the saved register
+ * frame on the child's kernel stack via put_stack_long().
+ *
+ * Returns 0 on success or -EIO when 'value' is not acceptable for the
+ * register (selector without RPL 3, base outside the task's address
+ * space).
+ */
+static int putreg(struct task_struct *child,
+       unsigned long regno, unsigned long value)
+{
+       unsigned long tmp; 
+       
+       switch (regno) {
+               /* Data selectors: either null or RPL 3. */
+               case offsetof(struct user_regs_struct,fs):
+                       if (value && (value & 3) != 3)
+                               return -EIO;
+                       child->thread.fsindex = value & 0xffff; 
+                       return 0;
+               case offsetof(struct user_regs_struct,gs):
+                       if (value && (value & 3) != 3)
+                               return -EIO;
+                       child->thread.gsindex = value & 0xffff;
+                       return 0;
+               case offsetof(struct user_regs_struct,ds):
+                       if (value && (value & 3) != 3)
+                               return -EIO;
+                       child->thread.ds = value & 0xffff;
+                       return 0;
+               case offsetof(struct user_regs_struct,es): 
+                       if (value && (value & 3) != 3)
+                               return -EIO;
+                       child->thread.es = value & 0xffff;
+                       return 0;
+               case offsetof(struct user_regs_struct,ss):
+                       if ((value & 3) != 3)
+                               return -EIO;
+                       value &= 0xffff;
+                       /*
+                        * Leave the switch so the selector is actually
+                        * stored in the saved frame, exactly like cs below.
+                        * Returning here would validate the value and then
+                        * silently drop it.
+                        */
+                       break;
+               /* Segment bases must lie inside the task's address space. */
+               case offsetof(struct user_regs_struct,fs_base):
+                       if (value >= TASK_SIZE_OF(child))
+                               return -EIO;
+                       child->thread.fs = value;
+                       return 0;
+               case offsetof(struct user_regs_struct,gs_base):
+                       if (value >= TASK_SIZE_OF(child))
+                               return -EIO;
+                       child->thread.gs = value;
+                       return 0;
+               case offsetof(struct user_regs_struct, eflags):
+                       /*
+                        * Only the bits in FLAG_MASK may be changed; all
+                        * other bits keep their currently saved value.
+                        */
+                       value &= FLAG_MASK;
+                       tmp = get_stack_long(child, EFL_OFFSET); 
+                       tmp &= ~FLAG_MASK; 
+                       value |= tmp;
+                       break;
+               case offsetof(struct user_regs_struct,cs): 
+                       if ((value & 3) != 3)
+                               return -EIO;
+                       value &= 0xffff;
+                       break;
+       }
+       /* The frame sits just below rsp0, hence the negative displacement. */
+       put_stack_long(child, regno - sizeof(struct pt_regs), value);
+       return 0;
+}
+
+/*
+ * Read one register of a traced child.
+ *
+ * 'regno' is a byte offset into struct user_regs_struct.  Segment
+ * selectors and fs/gs bases come from child->thread; everything else is
+ * fetched from the saved register frame on the child's kernel stack and,
+ * for a TIF_IA32 child, truncated to 32 bits.
+ */
+static unsigned long getreg(struct task_struct *child, unsigned long regno)
+{
+       unsigned long val;
+
+       switch (regno) {
+       case offsetof(struct user_regs_struct, fs):
+               return child->thread.fsindex;
+       case offsetof(struct user_regs_struct, gs):
+               return child->thread.gsindex;
+       case offsetof(struct user_regs_struct, ds):
+               return child->thread.ds;
+       case offsetof(struct user_regs_struct, es):
+               return child->thread.es;
+       case offsetof(struct user_regs_struct, fs_base):
+               return child->thread.fs;
+       case offsetof(struct user_regs_struct, gs_base):
+               return child->thread.gs;
+       default:
+               break;
+       }
+
+       /* Everything else lives in the frame just below rsp0. */
+       val = get_stack_long(child, regno - sizeof(struct pt_regs));
+       if (test_tsk_thread_flag(child, TIF_IA32))
+               val &= 0xffffffff;
+
+       return val;
+}
+
+/*
+ * Architecture-specific ptrace request dispatcher.
+ *
+ * Handles the requests listed in the switch below for the traced task
+ * 'child'; any other request is handed to ptrace_request().  The meaning
+ * of 'addr' and 'data' depends on the request.  Returns the value produced
+ * by the selected handler: 0 on success for the cases implemented here, a
+ * negative error (mostly -EIO) otherwise.
+ */
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
+{
+       long i, ret;
+       unsigned ui;
+
+       switch (request) {
+       /* when I and D space are separate, these will need to be fixed. */
+       case PTRACE_PEEKTEXT: /* read word at location addr. */ 
+       case PTRACE_PEEKDATA:
+               ret = generic_ptrace_peekdata(child, addr, data);
+               break;
+
+       /* read the word at location addr in the USER area. */
+       case PTRACE_PEEKUSR: {
+               unsigned long tmp;
+
+               /* addr must be 8-byte aligned and lie within struct user. */
+               ret = -EIO;
+               if ((addr & 7) ||
+                   addr > sizeof(struct user) - 7)
+                       break;
+
+               switch (addr) { 
+               case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
+                       tmp = getreg(child, addr);
+                       break;
+               case offsetof(struct user, u_debugreg[0]):
+                       tmp = child->thread.debugreg0;
+                       break;
+               case offsetof(struct user, u_debugreg[1]):
+                       tmp = child->thread.debugreg1;
+                       break;
+               case offsetof(struct user, u_debugreg[2]):
+                       tmp = child->thread.debugreg2;
+                       break;
+               case offsetof(struct user, u_debugreg[3]):
+                       tmp = child->thread.debugreg3;
+                       break;
+               case offsetof(struct user, u_debugreg[6]):
+                       tmp = child->thread.debugreg6;
+                       break;
+               case offsetof(struct user, u_debugreg[7]):
+                       tmp = child->thread.debugreg7;
+                       break;
+               /* Any other offset inside struct user reads as zero. */
+               default:
+                       tmp = 0;
+                       break;
+               }
+               ret = put_user(tmp,(unsigned long __user *) data);
+               break;
+       }
+
+       /* when I and D space are separate, this will have to be fixed. */
+       case PTRACE_POKETEXT: /* write the word at location addr. */
+       case PTRACE_POKEDATA:
+               ret = generic_ptrace_pokedata(child, addr, data);
+               break;
+
+       case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
+       {
+               /* Margin subtracted from TASK_SIZE_OF() for breakpoint addresses. */
+               int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
+               /* ret stays -EIO for any offset not accepted below. */
+               ret = -EIO;
+               if ((addr & 7) ||
+                   addr > sizeof(struct user) - 7)
+                       break;
+
+               switch (addr) { 
+               case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
+                       ret = putreg(child, addr, data);
+                       break;
+               /* Disallows to set a breakpoint into the vsyscall */
+               case offsetof(struct user, u_debugreg[0]):
+                       if (data >= TASK_SIZE_OF(child) - dsize) break;
+                       child->thread.debugreg0 = data;
+                       ret = 0;
+                       break;
+               case offsetof(struct user, u_debugreg[1]):
+                       if (data >= TASK_SIZE_OF(child) - dsize) break;
+                       child->thread.debugreg1 = data;
+                       ret = 0;
+                       break;
+               case offsetof(struct user, u_debugreg[2]):
+                       if (data >= TASK_SIZE_OF(child) - dsize) break;
+                       child->thread.debugreg2 = data;
+                       ret = 0;
+                       break;
+               case offsetof(struct user, u_debugreg[3]):
+                       if (data >= TASK_SIZE_OF(child) - dsize) break;
+                       child->thread.debugreg3 = data;
+                       ret = 0;
+                       break;
+               case offsetof(struct user, u_debugreg[6]):
+                       /* Reject values with any of the upper 32 bits set. */
+                                 if (data >> 32)
+                               break; 
+                       child->thread.debugreg6 = data;
+                       ret = 0;
+                       break;
+               case offsetof(struct user, u_debugreg[7]):
+                       /* See arch/i386/kernel/ptrace.c for an explanation of
+                        * this awkward check.*/
+                       data &= ~DR_CONTROL_RESERVED;
+                       /*
+                        * Each of the four 4-bit fields starting at bit 16
+                        * indexes the bitmap 0x5554; a set bit rejects the
+                        * value.  i == 4 means all four fields passed.
+                        */
+                       for(i=0; i<4; i++)
+                               if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
+                                       break;
+                       if (i == 4) {
+                         child->thread.debugreg7 = data;
+                         if (data)
+                               set_tsk_thread_flag(child, TIF_DEBUG);
+                         else
+                               clear_tsk_thread_flag(child, TIF_DEBUG);
+                         ret = 0;
+                       }
+                 break;
+               }
+               break;
+       }
+       case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
+       case PTRACE_CONT:    /* restart after signal. */
+
+               ret = -EIO;
+               if (!valid_signal(data))
+                       break;
+               if (request == PTRACE_SYSCALL)
+                       set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
+               else
+                       clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
+               /* clear_singlestep() below clears TIF_SINGLESTEP as well. */
+               clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+               child->exit_code = data;
+               /* make sure the single step bit is not set. */
+               clear_singlestep(child);
+               wake_up_process(child);
+               ret = 0;
+               break;
+
+#ifdef CONFIG_IA32_EMULATION
+               /* This makes only sense with 32bit programs. Allow a
+                  64bit debugger to fully examine them too. Better
+                  don't use it against 64bit processes, use
+                  PTRACE_ARCH_PRCTL instead. */
+       /*
+        * Both thread-area requests temporarily overwrite the entry_number
+        * field of the caller's user_desc with 'addr', run the helper, then
+        * restore the old value.  The get_user()/put_user() results are not
+        * checked.  PTRACE_GET_THREAD_AREA sits inside the braces opened by
+        * PTRACE_SET_THREAD_AREA and shares its locals 'p' and 'old'.
+        */
+       case PTRACE_SET_THREAD_AREA: {
+               struct user_desc __user *p;
+               int old; 
+               p = (struct user_desc __user *)data;
+               get_user(old,  &p->entry_number); 
+               put_user(addr, &p->entry_number);
+               ret = do_set_thread_area(&child->thread, p);
+               put_user(old,  &p->entry_number); 
+               break;
+       case PTRACE_GET_THREAD_AREA:
+               p = (struct user_desc __user *)data;
+               get_user(old,  &p->entry_number); 
+               put_user(addr, &p->entry_number);
+               ret = do_get_thread_area(&child->thread, p);
+               put_user(old,  &p->entry_number); 
+               break;
+       } 
+#endif
+               /* normal 64bit interface to access TLS data. 
+                  Works just like arch_prctl, except that the arguments
+                  are reversed. */
+       case PTRACE_ARCH_PRCTL: 
+               ret = do_arch_prctl(child, data, addr);
+               break;
+
+/*
+ * make the child exit.  Best I can do is send it a sigkill. 
+ * perhaps it should be put in the status that it wants to 
+ * exit.
+ */
+       case PTRACE_KILL:
+               ret = 0;
+               if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
+                       break;
+               clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+               child->exit_code = SIGKILL;
+               /* make sure the single step bit is not set. */
+               clear_singlestep(child);
+               wake_up_process(child);
+               break;
+
+       case PTRACE_SINGLESTEP:    /* set the trap flag. */
+               ret = -EIO;
+               if (!valid_signal(data))
+                       break;
+               clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
+               set_singlestep(child);
+               child->exit_code = data;
+               /* give it a chance to run. */
+               wake_up_process(child);
+               ret = 0;
+               break;
+
+       case PTRACE_DETACH:
+               /* detach a process that was attached. */
+               ret = ptrace_detach(child, data);
+               break;
+
+       case PTRACE_GETREGS: { /* Get all gp regs from the child. */
+               if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
+                              sizeof(struct user_regs_struct))) {
+                       ret = -EIO;
+                       break;
+               }
+               ret = 0;
+               /* Copy word by word; any __put_user() failure is OR-ed into ret. */
+               for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
+                       ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
+                       data += sizeof(long);
+               }
+               break;
+       }
+
+       case PTRACE_SETREGS: { /* Set all gp regs in the child. */
+               unsigned long tmp;
+               if (!access_ok(VERIFY_READ, (unsigned __user *)data,
+                              sizeof(struct user_regs_struct))) {
+                       ret = -EIO;
+                       break;
+               }
+               ret = 0;
+               /*
+                * Stops at the first failing read or rejected register;
+                * registers written before that point stay modified.
+                */
+               for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
+                       ret = __get_user(tmp, (unsigned long __user *) data);
+                       if (ret)
+                               break;
+                       ret = putreg(child, ui, tmp);
+                       if (ret)
+                               break;
+                       data += sizeof(long);
+               }
+               break;
+       }
+
+       case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
+               if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
+                              sizeof(struct user_i387_struct))) {
+                       ret = -EIO;
+                       break;
+               }
+               ret = get_fpregs((struct user_i387_struct __user *)data, child);
+               break;
+       }
+
+       case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
+               if (!access_ok(VERIFY_READ, (unsigned __user *)data,
+                              sizeof(struct user_i387_struct))) {
+                       ret = -EIO;
+                       break;
+               }
+               set_stopped_child_used_math(child);
+               ret = set_fpregs(child, (struct user_i387_struct __user *)data);
+               break;
+       }
+
+       /* Everything not handled above goes to the generic code. */
+       default:
+               ret = ptrace_request(child, request, addr, data);
+               break;
+       }
+       return ret;
+}
+
+/*
+ * Report a syscall stop for the current task to its tracer.
+ *
+ * The stop is signalled as SIGTRAP, with bit 0x80 added when the tracer
+ * asked for PT_TRACESYSGOOD.  If an exit_code is set once ptrace_notify()
+ * returns, it is sent to the current task as a signal and then cleared.
+ */
+static void syscall_trace(struct pt_regs *regs)
+{
+       int stop_code = SIGTRAP;
+
+#if 0
+       printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
+              current->comm,
+              regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
+              current_thread_info()->flags, current->ptrace); 
+#endif
+
+       if (current->ptrace & PT_TRACESYSGOOD)
+               stop_code |= 0x80;
+       ptrace_notify(stop_code);
+
+       /*
+        * this isn't the same as continuing with a signal, but it will do
+        * for normal use.  strace only continues with a signal if the
+        * stopping signal is not SIGTRAP.  -brl
+        */
+       if (!current->exit_code)
+               return;
+
+       send_sig(current->exit_code, current, 1);
+       current->exit_code = 0;
+}
+
+/*
+ * Syscall entry hook: runs the secure computing check first, then reports
+ * the stop to the tracer when the task is ptraced with TIF_SYSCALL_TRACE
+ * set, and finally logs the syscall to the audit subsystem when the task
+ * has an audit context.  The audit architecture and the registers passed
+ * as arguments depend on whether the task is TIF_IA32.
+ */
+asmlinkage void syscall_trace_enter(struct pt_regs *regs)
+{
+       secure_computing(regs->orig_rax);
+
+       if (test_thread_flag(TIF_SYSCALL_TRACE) &&
+           (current->ptrace & PT_PTRACED))
+               syscall_trace(regs);
+
+       if (unlikely(current->audit_context)) {
+               if (!test_thread_flag(TIF_IA32))
+                       audit_syscall_entry(AUDIT_ARCH_X86_64,
+                                           regs->orig_rax,
+                                           regs->rdi, regs->rsi,
+                                           regs->rdx, regs->r10);
+               else
+                       audit_syscall_entry(AUDIT_ARCH_I386,
+                                           regs->orig_rax,
+                                           regs->rbx, regs->rcx,
+                                           regs->rdx, regs->rsi);
+       }
+}
+
+/*
+ * Syscall exit hook: logs the result to the audit subsystem when the task
+ * has an audit context, then reports the stop to the tracer when the task
+ * is ptraced and either TIF_SYSCALL_TRACE or TIF_SINGLESTEP is set.
+ */
+asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+{
+       if (unlikely(current->audit_context))
+               audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
+
+       if (!test_thread_flag(TIF_SYSCALL_TRACE) &&
+           !test_thread_flag(TIF_SINGLESTEP))
+               return;
+       if (!(current->ptrace & PT_PTRACED))
+               return;
+
+       syscall_trace(regs);
+}
diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c
new file mode 100644 (file)
index 0000000..368db2b
--- /dev/null
@@ -0,0 +1,171 @@
+/* Various gunk just to reboot the machine. */ 
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/pm.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <asm/io.h>
+#include <asm/delay.h>
+#include <asm/hw_irq.h>
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/apic.h>
+#include <asm/iommu.h>
+
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
+
+static long no_idt[3];
+static enum { 
+       BOOT_TRIPLE = 't',
+       BOOT_KBD = 'k'
+} reboot_type = BOOT_KBD;
+static int reboot_mode = 0;
+int reboot_force;
+
+/*
+ * Parse the "reboot=" boot option: a comma-separated list of words, of
+ * which only the first character is examined.
+ *
+ *   w[arm]    reboot_mode = 0x1234 (don't set the cold reboot flag)
+ *   c[old]    reboot_mode = 0 (set the cold reboot flag)
+ *   t[riple]  force a triple fault (init)
+ *   k[bd]     use the keyboard controller, cold reset (default)
+ *   f[orce]   set reboot_force: avoid anything that could hang
+ *
+ * 'b' is also stored into reboot_type as-is, although the enum declares
+ * no enumerator with that value.  Words starting with any other character
+ * are skipped.  Always returns 1.
+ */
+static int __init reboot_setup(char *str)
+{
+       for (;;) {
+               char opt = *str;
+
+               switch (opt) {
+               case 'w':
+                       reboot_mode = 0x1234;
+                       break;
+               case 'c':
+                       reboot_mode = 0;
+                       break;
+               case 't':
+               case 'b':
+               case 'k':
+                       reboot_type = opt;
+                       break;
+               case 'f':
+                       reboot_force = 1;
+                       break;
+               }
+
+               /* Advance past the next comma, or stop when there is none. */
+               str = strchr(str, ',');
+               if (str == NULL)
+                       break;
+               str++;
+       }
+       return 1;
+}
+
+__setup("reboot=", reboot_setup);
+
+/*
+ * Poll I/O port 0x64 until bit 1 (mask 0x02) reads back clear, giving up
+ * silently after 0x10000 reads.
+ */
+static inline void kb_wait(void)
+{
+       int remaining = 0x10000;
+
+       while (remaining-- > 0) {
+               if (!(inb_p(0x64) & 0x02))
+                       return;
+       }
+}
+
+/*
+ * Quiesce the machine before a restart or power-off.
+ *
+ * On SMP builds the caller is first moved to logical cpu 0 (or kept on its
+ * current cpu if cpu 0 is not online) and the other cpus are stopped.
+ * Then, with local interrupts disabled, the IO-APIC (and on non-SMP builds
+ * the local APIC) is disabled, and finally the IOMMU is shut down.
+ */
+void machine_shutdown(void)
+{
+       unsigned long flags;
+
+       /* Stop the cpus and apics */
+#ifdef CONFIG_SMP
+       int reboot_cpu_id;
+
+       /* The boot cpu is always logical cpu 0 */
+       reboot_cpu_id = 0;
+
+       /* Make certain the cpu I'm about to reboot on is online */
+       if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+               reboot_cpu_id = smp_processor_id();
+       }
+
+       /* Make certain I only run on the appropriate processor */
+       set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+       /* O.K Now that I'm on the appropriate processor,
+        * stop all of the others.
+        */
+       smp_send_stop();
+#endif
+
+       local_irq_save(flags);
+
+       /*
+        * NOTE(review): only done here on non-SMP builds; presumably
+        * smp_send_stop() takes care of the local APIC on SMP - confirm.
+        */
+#ifndef CONFIG_SMP
+       disable_local_APIC();
+#endif
+
+       disable_IO_APIC();
+
+       local_irq_restore(flags);
+
+       pci_iommu_shutdown();
+}
+
+/*
+ * Reset the machine right now, without any orderly shutdown.
+ *
+ * Stores reboot_mode in the word at physical address 0x472 and then loops
+ * forever over the configured reset methods:
+ *
+ *   BOOT_KBD     pulse the reset line through the keyboard controller,
+ *                ten attempts; if the machine is still running, fall
+ *                through to the triple fault.
+ *   BOOT_TRIPLE  load an empty IDT and raise int3; if that returns too,
+ *                go back to the keyboard controller.
+ *
+ * Any other reboot_type value is replaced by BOOT_KBD so that the loop
+ * always makes a reset attempt instead of spinning without doing anything.
+ */
+void machine_emergency_restart(void)
+{
+       int i;
+
+       /* Tell the BIOS if we want cold or warm reboot */
+       *((unsigned short *)__va(0x472)) = reboot_mode;
+
+       for (;;) {
+               /* Could also try the reset bit in the Hammer NB */
+               switch (reboot_type) {
+               case BOOT_KBD:
+                       for (i = 0; i < 10; i++) {
+                               kb_wait();
+                               udelay(50);
+                               outb(0xfe, 0x64);        /* pulse reset low */
+                               udelay(50);
+                       }
+                       /* fall through - keyboard reset did not work */
+
+               case BOOT_TRIPLE:
+                       __asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
+                       __asm__ __volatile__("int3");
+
+                       reboot_type = BOOT_KBD;
+                       break;
+
+               default:
+                       /* Unsupported method requested: use the default. */
+                       reboot_type = BOOT_KBD;
+                       break;
+               }
+       }
+}
+
+/*
+ * Restart the machine: log a message, perform machine_shutdown() unless
+ * reboot_force is set, then reset via machine_emergency_restart().
+ * The argument is not used.
+ */
+void machine_restart(char * __unused)
+{
+       printk("machine restart\n");
+
+       if (!reboot_force)
+               machine_shutdown();
+
+       machine_emergency_restart();
+}
+
+/* Halt hook: the body is empty, so this returns without doing anything. */
+void machine_halt(void)
+{
+}
+
+/*
+ * Power the machine off through the pm_power_off hook.  Does nothing when
+ * no hook is installed; otherwise performs machine_shutdown() first unless
+ * reboot_force is set.
+ */
+void machine_power_off(void)
+{
+       if (!pm_power_off)
+               return;
+
+       if (!reboot_force)
+               machine_shutdown();
+       pm_power_off();
+}
+
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
new file mode 100644 (file)
index 0000000..14e9587
--- /dev/null
@@ -0,0 +1,276 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 3)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+
+       .text
+       .align PAGE_ALIGNED
+       .code64
+       .globl relocate_kernel
+relocate_kernel:
+       /* %rdi indirection_page
+        * %rsi page_list
+        * %rdx start address
+        */
+
+       /*
+        * Overview: two four-level mappings of the control page are built
+        * in the tables named by page_list, one at VA_CONTROL_PAGE (tables
+        * *_0) and one at PA_CONTROL_PAGE (tables *_1).  At each level
+        * %r10 holds the 9-bit index mask and %cl the shift that turns the
+        * masked address bits into a byte offset (index * 8) into the
+        * table; both move down by 9 bits per level.  Execution then falls
+        * through into relocate_new_kernel.
+        */
+
+       /* map the control page at its virtual address */
+
+       movq    $0x0000ff8000000000, %r10        /* mask */
+       mov     $(39 - 3), %cl                   /* bits to shift */
+       movq    PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
+
+       /* PGD entry -> PUD_0 */
+       movq    %r11, %r9
+       andq    %r10, %r9
+       shrq    %cl, %r9
+
+       movq    PTR(VA_PGD)(%rsi), %r8
+       addq    %r8, %r9
+       movq    PTR(PA_PUD_0)(%rsi), %r8
+       orq     $PAGE_ATTR, %r8
+       movq    %r8, (%r9)
+
+       shrq    $9, %r10
+       sub     $9, %cl
+
+       /* PUD_0 entry -> PMD_0 */
+       movq    %r11, %r9
+       andq    %r10, %r9
+       shrq    %cl, %r9
+
+       movq    PTR(VA_PUD_0)(%rsi), %r8
+       addq    %r8, %r9
+       movq    PTR(PA_PMD_0)(%rsi), %r8
+       orq     $PAGE_ATTR, %r8
+       movq    %r8, (%r9)
+
+       shrq    $9, %r10
+       sub     $9, %cl
+
+       /* PMD_0 entry -> PTE_0 */
+       movq    %r11, %r9
+       andq    %r10, %r9
+       shrq    %cl, %r9
+
+       movq    PTR(VA_PMD_0)(%rsi), %r8
+       addq    %r8, %r9
+       movq    PTR(PA_PTE_0)(%rsi), %r8
+       orq     $PAGE_ATTR, %r8
+       movq    %r8, (%r9)
+
+       shrq    $9, %r10
+       sub     $9, %cl
+
+       /* PTE_0 entry -> control page */
+       movq    %r11, %r9
+       andq    %r10, %r9
+       shrq    %cl, %r9
+
+       movq    PTR(VA_PTE_0)(%rsi), %r8
+       addq    %r8, %r9
+       movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
+       orq     $PAGE_ATTR, %r8
+       movq    %r8, (%r9)
+
+       /* identity map the control page at its physical address */
+
+       movq    $0x0000ff8000000000, %r10        /* mask */
+       mov     $(39 - 3), %cl                   /* bits to shift */
+       movq    PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
+
+       /* PGD entry -> PUD_1 */
+       movq    %r11, %r9
+       andq    %r10, %r9
+       shrq    %cl, %r9
+
+       movq    PTR(VA_PGD)(%rsi), %r8
+       addq    %r8, %r9
+       movq    PTR(PA_PUD_1)(%rsi), %r8
+       orq     $PAGE_ATTR, %r8
+       movq    %r8, (%r9)
+
+       shrq    $9, %r10
+       sub     $9, %cl
+
+       /* PUD_1 entry -> PMD_1 */
+       movq    %r11, %r9
+       andq    %r10, %r9
+       shrq    %cl, %r9
+
+       movq    PTR(VA_PUD_1)(%rsi), %r8
+       addq    %r8, %r9
+       movq    PTR(PA_PMD_1)(%rsi), %r8
+       orq     $PAGE_ATTR, %r8
+       movq    %r8, (%r9)
+
+       shrq    $9, %r10
+       sub     $9, %cl
+
+       /* PMD_1 entry -> PTE_1 */
+       movq    %r11, %r9
+       andq    %r10, %r9
+       shrq    %cl, %r9
+
+       movq    PTR(VA_PMD_1)(%rsi), %r8
+       addq    %r8, %r9
+       movq    PTR(PA_PTE_1)(%rsi), %r8
+       orq     $PAGE_ATTR, %r8
+       movq    %r8, (%r9)
+
+       shrq    $9, %r10
+       sub     $9, %cl
+
+       /* PTE_1 entry -> control page */
+       movq    %r11, %r9
+       andq    %r10, %r9
+       shrq    %cl, %r9
+
+       movq    PTR(VA_PTE_1)(%rsi), %r8
+       addq    %r8, %r9
+       movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
+       orq     $PAGE_ATTR, %r8
+       movq    %r8, (%r9)
+
+
+relocate_new_kernel:
+       /* %rdi indirection_page
+        * %rsi page_list
+        * %rdx start address
+        */
+
+       /*
+        * Switches to the page tables at PA_PGD, moves the stack to the
+        * end of the control page and continues at identity_mapped through
+        * the control page's physical address.
+        */
+
+       /* zero out flags, and disable interrupts */
+       pushq $0
+       popfq
+
+       /* get physical address of control page now */
+       /* this is impossible after page table switch */
+       movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
+
+       /* get physical address of page table now too */
+       movq    PTR(PA_TABLE_PAGE)(%rsi), %rcx
+
+       /* switch to new set of page tables */
+       movq    PTR(PA_PGD)(%rsi), %r9
+       movq    %r9, %cr3
+
+       /* setup a new stack at the end of the physical control page */
+       lea     4096(%r8), %rsp
+
+       /* jump to identity mapped page */
+       /*
+        * %r8 becomes control page + offset of identity_mapped within this
+        * code; pushing it and executing ret transfers control there.
+        * NOTE(review): presumably a copy of this code starts at offset 0
+        * of the control page - confirm against the caller.
+        */
+       addq    $(identity_mapped - relocate_kernel), %r8
+       pushq   %r8
+       ret
+
+
+identity_mapped:
+       /*
+        * Entered with:
+        *   %rdi indirection_page
+        *   %rdx start address
+        *   %rcx physical address of the page table to switch to
+        *
+        * Puts cr0/cr4 into a known state, switches page tables, walks
+        * the indirection list copying source pages to their destinations,
+        * clears the general purpose registers and finally returns to the
+        * start address pushed below.
+        */
+
+       /* store the start address on the stack */
+       pushq   %rdx
+
+       /* Set cr0 to a known state:
+        * 31 1 == Paging enabled
+        * 18 0 == Alignment check disabled
+        * 16 0 == Write protect disabled
+        * 3  0 == No task switch
+        * 2  0 == Don't do FP software emulation.
+        * 0  1 == Protected mode enabled
+        */
+       movq    %cr0, %rax
+       andq    $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+       orl     $((1<<31)|(1<<0)), %eax
+       movq    %rax, %cr0
+
+       /* Set cr4 to a known state:
+        * 10 0 == xmm exceptions disabled
+        * 9  0 == xmm registers instructions disabled
+        * 8  0 == performance monitoring counter disabled
+        * 7  0 == page global disabled
+        * 6  0 == machine check exceptions disabled
+        * 5  1 == physical address extension enabled
+        * 4  0 == page size extensions disabled
+        * 3  0 == Debug extensions disabled
+        * 2  0 == Time stamp disable (disabled)
+        * 1  0 == Protected mode virtual interrupts disabled
+        * 0  0 == VME disabled
+        */
+
+       movq    $((1<<5)), %rax
+       movq    %rax, %cr4
+
+       jmp 1f
+1:
+
+       /* Switch to the identity mapped page tables,
+        * and flush the TLB.
+       */
+       movq    %rcx, %cr3
+
+       /* Do the copies */
+       movq    %rdi, %rcx      /* Start with the indirection_page entry */
+       xorq    %rdi, %rdi
+       xorq    %rsi, %rsi
+       jmp     1f
+
+0:     /* top, read another word for the indirection page */
+
+       movq    (%rbx), %rcx
+       addq    $8,     %rbx
+1:
+       testq   $0x1,   %rcx  /* is it a destination page? */
+       jz      2f
+       movq    %rcx,   %rdi
+       andq    $0xfffffffffffff000, %rdi
+       jmp     0b
+2:
+       testq   $0x2,   %rcx  /* is it an indirection page? */
+       jz      2f
+       movq    %rcx,   %rbx
+       andq    $0xfffffffffffff000, %rbx
+       jmp     0b
+2:
+       testq   $0x4,   %rcx  /* is it the done indicator? */
+       jz      2f
+       jmp     3f
+2:
+       testq   $0x8,   %rcx  /* is it the source indicator? */
+       jz      0b            /* Ignore it otherwise */
+       movq    %rcx,   %rsi  /* For every source page do a copy */
+       andq    $0xfffffffffffff000, %rsi
+
+       movq    $512,   %rcx  /* 512 quadwords == one 4096 byte page */
+       rep ; movsq
+       jmp     0b
+3:
+
+       /* To be certain of avoiding problems with self-modifying code
+        * I need to execute a serializing instruction here.
+        * So I flush the TLB by reloading %cr3 here, it's handy,
+        * and not processor dependent.
+        */
+       movq    %cr3, %rax
+       movq    %rax, %cr3
+
+       /* set all of the registers to known values */
+       /* leave %rsp alone */
+
+       xorq    %rax, %rax
+       xorq    %rbx, %rbx
+       xorq    %rcx, %rcx
+       xorq    %rdx, %rdx
+       xorq    %rsi, %rsi
+       xorq    %rdi, %rdi
+       xorq    %rbp, %rbp
+       xorq    %r8,  %r8
+       xorq    %r9,  %r9
+       /* Both operands must be %r10, otherwise %r10 keeps its old value
+        * and the just-cleared %r9 is overwritten with it.
+        */
+       xorq    %r10, %r10
+       xorq    %r11, %r11
+       xorq    %r12, %r12
+       xorq    %r13, %r13
+       xorq    %r14, %r14
+       xorq    %r15, %r15
+
+       /* "Return" to the start address pushed at the top. */
+       ret
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
new file mode 100644 (file)
index 0000000..1200aaa
--- /dev/null
@@ -0,0 +1,289 @@
+/* 
+ * X86-64 specific CPU setup.
+ * Copyright (C) 1995  Linus Torvalds
+ * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
+ * See setup.c for older changelog.
+ */ 
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <asm/bootsetup.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/mmu_context.h>
+#include <asm/smp.h>
+#include <asm/i387.h>
+#include <asm/percpu.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+
+char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
+
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
+
+struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
+
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+
+char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
+
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
+static int do_not_nx __cpuinitdata = 0;
+
+/* noexec=on|off
+Control non executable mappings for 64bit processes.
+
+on     Enable(default)
+off    Disable
+*/ 
+static int __init nonx_setup(char *str)
+{
+       /* "noexec" early parameter handler; a missing value is an error. */
+       if (str == NULL)
+               return -EINVAL;
+
+       if (strncmp(str, "off", 3) == 0) {
+               /* value starts with "off": mask NX out and remember it */
+               __supported_pte_mask &= ~_PAGE_NX;
+               do_not_nx = 1;
+       } else if (strncmp(str, "on", 2) == 0) {
+               /* value starts with "on": allow NX again */
+               do_not_nx = 0;
+               __supported_pte_mask |= _PAGE_NX;
+       }
+
+       /* any other value is silently accepted without changing anything */
+       return 0;
+}
+early_param("noexec", nonx_setup);
+
+int force_personality32 = 0; 
+
+/* noexec32=on|off
+Control non executable heap for 32bit processes.
+To control the stack too use noexec=off
+
+on     PROT_READ does not imply PROT_EXEC for 32bit processes
+off    PROT_READ implies PROT_EXEC (default)
+*/
+static int __init nonx32_setup(char *str)
+{
+       /* "noexec32=" handler: flips READ_IMPLIES_EXEC in force_personality32 */
+       if (strcmp(str, "on") == 0) {
+               force_personality32 &= ~READ_IMPLIES_EXEC;
+       } else if (strcmp(str, "off") == 0) {
+               force_personality32 |= READ_IMPLIES_EXEC;
+       }
+
+       /* unrecognised values change nothing; the handler always returns 1 */
+       return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+/*
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
+ */
+/*
+ * Allocate one PERCPU_ENOUGH_ROOM sized area from bootmem for every CPU in
+ * cpu_possible_map, copy the per-cpu section template into it and record
+ * the offset in that CPU's PDA. Panics if an allocation fails.
+ */
+void __init setup_per_cpu_areas(void)
+{ 
+       int i;
+       unsigned long size;
+
+#ifdef CONFIG_HOTPLUG_CPU
+       prefill_possible_map();
+#endif
+
+       /* Copy section for each CPU (we discard the original) */
+       size = PERCPU_ENOUGH_ROOM;
+
+       printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
+       for_each_cpu_mask (i, cpu_possible_map) {
+               char *ptr;
+
+               /* No node data for this CPU's node: use the plain allocator */
+               if (!NODE_DATA(cpu_to_node(i))) {
+                       printk("cpu with no node %d, num_online_nodes %d\n",
+                              i, num_online_nodes());
+                       ptr = alloc_bootmem_pages(size);
+               } else { 
+                       ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
+               }
+               if (!ptr)
+                       panic("Cannot allocate cpu data for CPU %d\n", i);
+               /* data_offset is the distance from the template to this copy */
+               cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+               /* only the section's real size is copied, not all of 'size' */
+               memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+       }
+} 
+
+/*
+ * Initialise the PDA of @cpu: clear %fs/%gs, point MSR_GS_BASE at the PDA,
+ * fill in its basic fields and set up the IRQ stack pointer.
+ * CPU 0 uses the static boot_cpu_stack; other CPUs get freshly allocated
+ * pages, and the function panics if that allocation fails.
+ */
+void pda_init(int cpu)
+{ 
+       struct x8664_pda *pda = cpu_pda(cpu);
+
+       /* Setup up data that may be needed in __get_free_pages early */
+       asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
+       /* Memory clobbers used to order PDA accessed */
+       mb();
+       wrmsrl(MSR_GS_BASE, pda);
+       mb();
+
+       pda->cpunumber = cpu; 
+       pda->irqcount = -1;
+       pda->kernelstack = 
+               (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
+       pda->active_mm = &init_mm;
+       pda->mmu_state = 0;
+
+       if (cpu == 0) {
+               /* others are initialized in smpboot.c */
+               pda->pcurrent = &init_task;
+               pda->irqstackptr = boot_cpu_stack; 
+       } else {
+               pda->irqstackptr = (char *)
+                       __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+               if (!pda->irqstackptr)
+                       panic("cannot allocate irqstack for cpu %d", cpu); 
+       }
+
+
+       /*
+        * Move the pointer from the base of the area to 64 bytes below its
+        * end (presumably because the stack grows downwards - confirm).
+        */
+       pda->irqstackptr += IRQSTACKSIZE-64;
+} 
+
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
+__attribute__((section(".bss.page_aligned")));
+
+extern asmlinkage void ignore_sysret(void);
+
+/* May not be marked __init: used by software suspend */
+/*
+ * Program the syscall MSRs of the current CPU: STAR (segment selectors),
+ * LSTAR (system_call entry), CSTAR (ignore_sysret) and the flag mask.
+ * With CONFIG_IA32_EMULATION, syscall32_cpu_init() is called as well.
+ */
+void syscall_init(void)
+{
+       /* 
+        * LSTAR and STAR live in a bit strange symbiosis.
+        * They both write to the same internal register. STAR allows to set CS/DS
+        * but only a 32bit target. LSTAR sets the 64bit rip.    
+        */ 
+       wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
+       wrmsrl(MSR_LSTAR, system_call); 
+       wrmsrl(MSR_CSTAR, ignore_sysret);
+
+#ifdef CONFIG_IA32_EMULATION                   
+       syscall32_cpu_init ();
+#endif
+
+       /* Flags to clear on syscall */
+       /* 0x3000 is bits 12-13 (presumably the IOPL field - confirm) */
+       wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
+}
+
+void __cpuinit check_efer(void)
+{
+       unsigned long efer_val;
+       int nx_usable;
+
+       rdmsrl(MSR_EFER, efer_val);
+
+       /* NX may be used only if EFER has it set and do_not_nx is clear */
+       nx_usable = (efer_val & EFER_NX) && !do_not_nx;
+       if (!nx_usable)
+               __supported_pte_mask &= ~_PAGE_NX;
+}
+
+unsigned long kernel_eflags;
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init.
+ */
+void __cpuinit cpu_init (void)
+{
+       int cpu = stack_smp_processor_id();
+       struct tss_struct *t = &per_cpu(init_tss, cpu);
+       struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+       unsigned long v; 
+       char *estacks = NULL; 
+       struct task_struct *me;
+       int i;
+
+       /* CPU 0 is initialised in head64.c */
+       if (cpu != 0) {
+               pda_init(cpu);
+       } else 
+               estacks = boot_exception_stacks; 
+
+       me = current;
+
+       /* Each CPU may pass through here only once */
+       if (cpu_test_and_set(cpu, cpu_initialized))
+               panic("CPU#%d already initialized!\n", cpu);
+
+       printk("Initializing CPU#%d\n", cpu);
+
+       clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+       /*
+        * Initialize the per-CPU GDT with the boot GDT,
+        * and set up the GDT descriptor:
+        */
+       if (cpu)
+               memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
+
+       cpu_gdt_descr[cpu].size = GDT_SIZE;
+       asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
+       asm volatile("lidt %0" :: "m" (idt_descr));
+
+       memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+       syscall_init();
+
+       wrmsrl(MSR_FS_BASE, 0);
+       wrmsrl(MSR_KERNEL_GS_BASE, 0);
+       barrier(); 
+
+       check_efer();
+
+       /*
+        * set up and load the per-CPU TSS
+        */
+       for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+               /* allocation order per IST slot; the debug stack has its own */
+               static const unsigned int order[N_EXCEPTION_STACKS] = {
+                       [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+                       [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+               };
+               /*
+                * CPU 0 carves its stacks out of boot_exception_stacks;
+                * every other CPU allocates each stack separately.
+                */
+               if (cpu) {
+                       estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+                       if (!estacks)
+                               panic("Cannot allocate exception stack %ld %d\n",
+                                     v, cpu); 
+               }
+               /* the IST entry records the END address of the stack area */
+               estacks += PAGE_SIZE << order[v];
+               orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
+       }
+
+       t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+       /*
+        * <= is required because the CPU will access up to
+        * 8 bits beyond the end of the IO permission bitmap.
+        */
+       for (i = 0; i <= IO_BITMAP_LONGS; i++)
+               t->io_bitmap[i] = ~0UL;
+
+       /* take a reference on init_mm and make it the active mm */
+       atomic_inc(&init_mm.mm_count);
+       me->active_mm = &init_mm;
+       if (me->mm)
+               BUG();
+       enter_lazy_tlb(&init_mm, me);
+
+       set_tss_desc(cpu, t);
+       load_TR_desc();
+       load_LDT(&init_mm.context);
+
+       /*
+        * Clear all 6 debug registers:
+        */
+
+       set_debugreg(0UL, 0);
+       set_debugreg(0UL, 1);
+       set_debugreg(0UL, 2);
+       set_debugreg(0UL, 3);
+       set_debugreg(0UL, 6);
+       set_debugreg(0UL, 7);
+
+       fpu_init(); 
+
+       /* remember the flags value seen at the end of CPU init */
+       raw_local_save_flags(kernel_eflags);
+}
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
new file mode 100644 (file)
index 0000000..af838f6
--- /dev/null
@@ -0,0 +1,1117 @@
+/*
+ *  linux/arch/x86-64/kernel/setup.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Nov 2001 Dave Jones <davej@suse.de>
+ *  Forked from i386 setup code.
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <linux/console.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+#include <linux/root_dev.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/kallsyms.h>
+#include <linux/edd.h>
+#include <linux/mmzone.h>
+#include <linux/kexec.h>
+#include <linux/cpufreq.h>
+#include <linux/dmi.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+
+#include <asm/mtrr.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/msr.h>
+#include <asm/desc.h>
+#include <video/edid.h>
+#include <asm/e820.h>
+#include <asm/dma.h>
+#include <asm/mpspec.h>
+#include <asm/mmu_context.h>
+#include <asm/bootsetup.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/mach_apic.h>
+#include <asm/numa.h>
+#include <asm/sections.h>
+#include <asm/dmi.h>
+
+/*
+ * Machine setup..
+ */
+
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+
+unsigned long mmu_cr4_features;
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
+unsigned long saved_video_mode;
+
+int force_mwait __cpuinitdata;
+
+/* 
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct sys_desc_table_struct {
+       unsigned short length;
+       unsigned char table[0];
+};
+
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+
+extern int root_mountflags;
+
+char __initdata command_line[COMMAND_LINE_SIZE];
+
+struct resource standard_io_resources[] = {
+       { .name = "dma1", .start = 0x00, .end = 0x1f,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "pic1", .start = 0x20, .end = 0x21,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "timer0", .start = 0x40, .end = 0x43,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "timer1", .start = 0x50, .end = 0x53,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "keyboard", .start = 0x60, .end = 0x6f,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "pic2", .start = 0xa0, .end = 0xa1,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "dma2", .start = 0xc0, .end = 0xdf,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "fpu", .start = 0xf0, .end = 0xff,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
+
+struct resource data_resource = {
+       .name = "Kernel data",
+       .start = 0,
+       .end = 0,
+       .flags = IORESOURCE_RAM,
+};
+struct resource code_resource = {
+       .name = "Kernel code",
+       .start = 0,
+       .end = 0,
+       .flags = IORESOURCE_RAM,
+};
+
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+       char *tail;
+
+       if (arg == NULL)
+               return -EINVAL;
+
+       elfcorehdr_addr = memparse(arg, &tail);
+
+       /* the end pointer did not advance past arg: reject the value */
+       if (tail <= arg)
+               return -EINVAL;
+       return 0;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
+#endif
+
+#ifndef CONFIG_NUMA
+/*
+ * Non-NUMA bootmem setup: find room for the bootmem bitmap in the e820 map,
+ * initialise bootmem with it, free the active e820 regions into bootmem and
+ * finally reserve the bitmap itself. Panics if no room can be found.
+ */
+static void __init
+contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long bootmap_size, bootmap;
+
+       bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+       bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
+       /* -1L is the failure value checked for here */
+       if (bootmap == -1L)
+               panic("Cannot find bootmem map of size %ld\n",bootmap_size);
+       /* from here on bootmap_size is the value returned by init_bootmem() */
+       bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
+       e820_register_active_regions(0, start_pfn, end_pfn);
+       free_bootmem_with_active_regions(0, end_pfn);
+       reserve_bootmem(bootmap, bootmap_size);
+} 
+#endif
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ *              from boot_params into a safe place.
+ *
+ */
+static inline void copy_edd(void)
+{
+       /* entry counts first, then the two fixed-size tables */
+       edd.edd_info_nr = EDD_NR;
+       edd.mbr_signature_nr = EDD_MBR_SIG_NR;
+
+       memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
+       memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE,
+              sizeof(edd.mbr_signature));
+}
+#else
+static inline void copy_edd(void)
+{
+}
+#endif
+
+#define EBDA_ADDR_POINTER 0x40E
+
+unsigned __initdata ebda_addr;
+unsigned __initdata ebda_size;
+
+/*
+ * Read the EBDA location and size from low memory into ebda_addr and
+ * ebda_size. The size is rounded up to whole pages and capped at 64K.
+ *
+ * Marked __init: it writes the __initdata variables ebda_addr/ebda_size and
+ * is called from the __init function setup_arch().
+ */
+static void __init discover_ebda(void)
+{
+       /*
+        * there is a real-mode segmented pointer pointing to the 
+        * 4K EBDA area at 0x40E
+        */
+       ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
+       /* segment value -> linear address */
+       ebda_addr <<= 4;
+
+       /* first 16-bit word of the area is taken as its size, scaled by 1K below */
+       ebda_size = *(unsigned short *)__va(ebda_addr);
+
+       /* Round EBDA up to pages */
+       if (ebda_size == 0)
+               ebda_size = 1;
+       ebda_size <<= 10;
+       /* include the offset of the EBDA start within its page */
+       ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
+       if (ebda_size > 64*1024)
+               ebda_size = 64*1024;
+}
+
+/*
+ * Architecture-specific boot setup. In order: copy boot parameters into
+ * kernel variables, parse the memory map and early parameters, build the
+ * direct mapping, initialise bootmem (NUMA or contiguous), reserve the
+ * regions that must not be handed out, then bring up paging, ACPI/MP
+ * tables, APIC mappings and I/O resources.
+ * @cmdline_p: set to point at the saved copy of the boot command line.
+ */
+void __init setup_arch(char **cmdline_p)
+{
+       printk(KERN_INFO "Command line: %s\n", boot_command_line);
+
+       ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
+       screen_info = SCREEN_INFO;
+       edid_info = EDID_INFO;
+       saved_video_mode = SAVED_VIDEO_MODE;
+       bootloader_type = LOADER_TYPE;
+
+#ifdef CONFIG_BLK_DEV_RAM
+       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
+       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
+       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
+#endif
+       setup_memory_region();
+       copy_edd();
+
+       if (!MOUNT_ROOT_RDONLY)
+               root_mountflags &= ~MS_RDONLY;
+       init_mm.start_code = (unsigned long) &_text;
+       init_mm.end_code = (unsigned long) &_etext;
+       init_mm.end_data = (unsigned long) &_edata;
+       init_mm.brk = (unsigned long) &_end;
+
+       /* physical extents of kernel code and data for the resource tree */
+       code_resource.start = virt_to_phys(&_text);
+       code_resource.end = virt_to_phys(&_etext)-1;
+       data_resource.start = virt_to_phys(&_etext);
+       data_resource.end = virt_to_phys(&_edata)-1;
+
+       early_identify_cpu(&boot_cpu_data);
+
+       /* hand out a private copy of the command line */
+       strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+       *cmdline_p = command_line;
+
+       parse_early_param();
+
+       finish_e820_parsing();
+
+       e820_register_active_regions(0, 0, -1UL);
+       /*
+        * partially used pages are not usable - thus
+        * we are rounding upwards:
+        */
+       end_pfn = e820_end_of_ram();
+       num_physpages = end_pfn;
+
+       check_efer();
+
+       discover_ebda();
+
+       init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
+
+       dmi_scan_machine();
+
+#ifdef CONFIG_ACPI
+       /*
+        * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
+        * Call this early for SRAT node setup.
+        */
+       acpi_boot_table_init();
+#endif
+
+       /* How many end-of-memory variables you have, grandma! */
+       max_low_pfn = end_pfn;
+       max_pfn = end_pfn;
+       high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
+
+       /* Remove active ranges so rediscovery with NUMA-awareness happens */
+       remove_all_active_ranges();
+
+#ifdef CONFIG_ACPI_NUMA
+       /*
+        * Parse SRAT to discover nodes.
+        */
+       acpi_numa_init();
+#endif
+
+#ifdef CONFIG_NUMA
+       numa_initmem_init(0, end_pfn); 
+#else
+       contig_initmem_init(0, end_pfn);
+#endif
+
+       /* Reserve direct mapping */
+       reserve_bootmem_generic(table_start << PAGE_SHIFT, 
+                               (table_end - table_start) << PAGE_SHIFT);
+
+       /* reserve kernel */
+       reserve_bootmem_generic(__pa_symbol(&_text),
+                               __pa_symbol(&_end) - __pa_symbol(&_text));
+
+       /*
+        * reserve physical page 0 - it's a special BIOS page on many boxes,
+        * enabling clean reboots, SMP operation, laptop functions.
+        */
+       reserve_bootmem_generic(0, PAGE_SIZE);
+
+       /* reserve ebda region */
+       if (ebda_addr)
+               reserve_bootmem_generic(ebda_addr, ebda_size);
+#ifdef CONFIG_NUMA
+       /* reserve nodemap region */
+       if (nodemap_addr)
+               reserve_bootmem_generic(nodemap_addr, nodemap_size);
+#endif
+
+#ifdef CONFIG_SMP
+       /* Reserve SMP trampoline */
+       reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
+#endif
+
+#ifdef CONFIG_ACPI_SLEEP
+       /*
+        * Reserve low memory region for sleep support.
+        */
+       acpi_reserve_bootmem();
+#endif
+       /*
+        * Find and reserve possible boot-time SMP configuration:
+        */
+       find_smp_config();
+#ifdef CONFIG_BLK_DEV_INITRD
+       /* keep the initrd only if it lies entirely below the end of RAM */
+       if (LOADER_TYPE && INITRD_START) {
+               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
+                       reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
+                       initrd_start = INITRD_START + PAGE_OFFSET;
+                       initrd_end = initrd_start+INITRD_SIZE;
+               }
+               else {
+                       printk(KERN_ERR "initrd extends beyond end of memory "
+                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+                           (unsigned long)(INITRD_START + INITRD_SIZE),
+                           (unsigned long)(end_pfn << PAGE_SHIFT));
+                       initrd_start = 0;
+               }
+       }
+#endif
+#ifdef CONFIG_KEXEC
+       /* start != end means a crash kernel region was set up */
+       if (crashk_res.start != crashk_res.end) {
+               reserve_bootmem_generic(crashk_res.start,
+                       crashk_res.end - crashk_res.start + 1);
+       }
+#endif
+
+       paging_init();
+
+#ifdef CONFIG_PCI
+       early_quirks();
+#endif
+
+       /*
+        * set this early, so we dont allocate cpu0
+        * if MADT list doesnt list BSP first
+        * mpparse.c/MP_processor_info() allocates logical cpu numbers.
+        */
+       cpu_set(0, cpu_present_map);
+#ifdef CONFIG_ACPI
+       /*
+        * Read APIC and some other early information from ACPI tables.
+        */
+       acpi_boot_init();
+#endif
+
+       init_cpu_to_node();
+
+       /*
+        * get boot-time SMP configuration:
+        */
+       if (smp_found_config)
+               get_smp_config();
+       init_apic_mappings();
+
+       /*
+        * We trust e820 completely. No explicit ROM probing in memory.
+        */
+       e820_reserve_resources(); 
+       e820_mark_nosave_regions();
+
+       {
+       unsigned i;
+       /* request I/O space for devices used on all i[345]86 PCs */
+       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+               request_resource(&ioport_resource, &standard_io_resources[i]);
+       }
+
+       e820_setup_gap();
+
+#ifdef CONFIG_VT
+       /* default console: VGA if configured, otherwise the dummy console */
+#if defined(CONFIG_VGA_CONSOLE)
+       conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+       conswitchp = &dummy_con;
+#endif
+#endif
+}
+
+static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+{
+       unsigned int *words = (unsigned int *) c->x86_model_id;
+       unsigned int leaf;
+
+       /* leaves 0x80000002..4 are needed; report failure without them */
+       if (c->extended_cpuid_level < 0x80000004)
+               return 0;
+
+       /* each leaf fills four consecutive 32-bit words of the name */
+       for (leaf = 0; leaf < 3; leaf++) {
+               unsigned int *w = words + 4 * leaf;
+
+               cpuid(0x80000002 + leaf, &w[0], &w[1], &w[2], &w[3]);
+       }
+
+       c->x86_model_id[48] = 0;
+       return 1;
+}
+
+
+/*
+ * Query the extended CPUID leaves 0x80000005..0x80000008 (as far as the CPU
+ * reports them), print the L1/L2 cache sizes and fill in the cache size,
+ * TLB size, power flags and address width fields of @c.
+ */
+static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+{
+       unsigned int n, dummy, eax, ebx, ecx, edx;
+
+       n = c->extended_cpuid_level;
+
+       if (n >= 0x80000005) {
+               cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
+                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+               /* provisional value: L1 I + L1 D, replaced below if L2 info exists */
+               c->x86_cache_size=(ecx>>24)+(edx>>24);
+               /* On K8 L1 TLB is inclusive, so don't count it */
+               c->x86_tlbsize = 0;
+       }
+
+       if (n >= 0x80000006) {
+               cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+               /* NOTE(review): re-reads the ecx the cpuid() above already returned */
+               ecx = cpuid_ecx(0x80000006);
+               c->x86_cache_size = ecx >> 16;
+               c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+
+               printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+               c->x86_cache_size, ecx & 0xFF);
+       }
+
+       /* edx of leaf 0x80000007 is stored unmodified in x86_power */
+       if (n >= 0x80000007)
+               cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); 
+       if (n >= 0x80000008) {
+               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 
+               c->x86_virt_bits = (eax >> 8) & 0xff;
+               c->x86_phys_bits = eax & 0xff;
+       }
+}
+
+#ifdef CONFIG_NUMA
+static int nearby_node(int apicid)
+{
+       int id, node;
+
+       /* search the APIC ids below the given one first, nearest first ... */
+       id = apicid;
+       while (--id >= 0) {
+               node = apicid_to_node[id];
+               if (node != NUMA_NO_NODE && node_online(node))
+                       return node;
+       }
+
+       /* ... then the ones above it */
+       for (id = apicid + 1; id < MAX_LOCAL_APIC; id++) {
+               node = apicid_to_node[id];
+               if (node != NUMA_NO_NODE && node_online(node))
+                       return node;
+       }
+
+       return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * Assumes number of cores is a power of two.
+ */
+/*
+ * Derive core count, core id and socket id of @c from CPUID 0x80000008 and
+ * the APIC id and, with CONFIG_NUMA, bind the current CPU to a node.
+ * Does nothing without CONFIG_SMP.
+ *
+ * Must be __cpuinit rather than __init: the caller init_amd() is __cpuinit,
+ * so this can run for CPUs brought up after init memory has been discarded.
+ */
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+       unsigned bits;
+#ifdef CONFIG_NUMA
+       int cpu = smp_processor_id();
+       int node = 0;
+       unsigned apicid = hard_smp_processor_id();
+#endif
+       unsigned ecx = cpuid_ecx(0x80000008);
+
+       c->x86_max_cores = (ecx & 0xff) + 1;
+
+       /* CPU telling us the core id bits shift? */
+       bits = (ecx >> 12) & 0xF;
+
+       /* Otherwise recompute */
+       if (bits == 0) {
+               while ((1 << bits) < c->x86_max_cores)
+                       bits++;
+       }
+
+       /* Low order bits define the core id (index of core in socket) */
+       c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
+       /* Convert the APIC ID into the socket ID */
+       c->phys_proc_id = phys_pkg_id(bits);
+
+#ifdef CONFIG_NUMA
+       /* default to the socket id unless the APIC id has a node mapping */
+       node = c->phys_proc_id;
+       if (apicid_to_node[apicid] != NUMA_NO_NODE)
+               node = apicid_to_node[apicid];
+       if (!node_online(node)) {
+               /* Two possibilities here:
+                  - The CPU is missing memory and no node was created.
+                  In that case try picking one from a nearby CPU
+                  - The APIC IDs differ from the HyperTransport node IDs
+                  which the K8 northbridge parsing fills in.
+                  Assume they are all increased by a constant offset,
+                  but in the same order as the HT nodeids.
+                  If that doesn't result in a usable node fall back to the
+                  path for the previous case.  */
+               int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
+               if (ht_nodeid >= 0 &&
+                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+                       node = apicid_to_node[ht_nodeid];
+               /* Pick a nearby node */
+               if (!node_online(node))
+                       node = nearby_node(apicid);
+       }
+       numa_set_node(cpu, node);
+
+       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+#endif
+}
+
+/*
+ * AMD-specific CPU setup for @c: apply the K8 TLB flush filter workaround
+ * (SMP only), adjust capability bits, obtain the model name, report cache
+ * information, detect multi-core topology and set num_cache_leaves.
+ */
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+{
+       unsigned level;
+
+#ifdef CONFIG_SMP
+       unsigned long value;
+
+       /*
+        * Disable TLB flush filter by setting HWCR.FFDIS on K8
+        * bit 6 of msr C001_0015
+        *
+        * Errata 63 for SH-B3 steppings
+        * Errata 122 for all steppings (F+ have it disabled by default)
+        */
+       if (c->x86 == 15) {
+               rdmsrl(MSR_K8_HWCR, value);
+               value |= 1 << 6;
+               wrmsrl(MSR_K8_HWCR, value);
+       }
+#endif
+
+       /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+          3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
+       clear_bit(0*32+31, &c->x86_capability);
+       
+       /* On C+ stepping K8 rep microcode works well for copy/memset */
+       level = cpuid_eax(1);
+       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
+               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+       if (c->x86 == 0x10)
+               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+
+       /* Enable workaround for FXSAVE leak */
+       if (c->x86 >= 6)
+               set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
+
+       /* 'level' is reused here as the success flag of get_model_name() */
+       level = get_model_name(c);
+       if (!level) {
+               switch (c->x86) { 
+               case 15:
+                       /* Should distinguish Models here, but this is only
+                          a fallback anyways. */
+                       strcpy(c->x86_model_id, "Hammer");
+                       break; 
+               } 
+       } 
+       display_cacheinfo(c);
+
+       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+       if (c->x86_power & (1<<8))
+               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+
+       /* Multi core CPU? */
+       if (c->extended_cpuid_level >= 0x80000008)
+               amd_detect_cmp(c);
+
+       /* a fourth cache leaf is reported only when edx bits 12-15 of leaf 0x80000006 are non-zero */
+       if (c->extended_cpuid_level >= 0x80000006 &&
+               (cpuid_edx(0x80000006) & 0xf000))
+               num_cache_leaves = 4;
+       else
+               num_cache_leaves = 3;
+
+       if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
+               set_bit(X86_FEATURE_K8, &c->x86_capability);
+
+       /* RDTSC can be speculated around */
+       clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+
+       /* Family 10 doesn't support C states in MWAIT so don't use it */
+       if (c->x86 == 0x10 && !force_mwait)
+               clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
+}
+
+/*
+ * Work out the sibling (hyper-thread) count from CPUID leaf 1 and derive
+ * the physical package id and core id of @c from it.
+ * Does nothing without CONFIG_SMP or when the CPU lacks X86_FEATURE_HT.
+ */
+static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+       u32     eax, ebx, ecx, edx;
+       int     index_msb, core_bits;
+
+       cpuid(1, &eax, &ebx, &ecx, &edx);
+
+
+       if (!cpu_has(c, X86_FEATURE_HT))
+               return;
+       /* with CMP_LEGACY set, skip the sibling computation and only report */
+       if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+               goto out;
+
+       /* bits 16-23 of ebx: number of logical processors reported */
+       smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+       if (smp_num_siblings == 1) {
+               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
+       } else if (smp_num_siblings > 1 ) {
+
+               if (smp_num_siblings > NR_CPUS) {
+                       /* message is newline-terminated so later output starts on its own line */
+                       printk(KERN_WARNING "CPU: Unsupported number of the siblings %d\n", smp_num_siblings);
+                       smp_num_siblings = 1;
+                       return;
+               }
+
+               index_msb = get_count_order(smp_num_siblings);
+               c->phys_proc_id = phys_pkg_id(index_msb);
+
+               /* from here on smp_num_siblings counts threads per core */
+               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+               index_msb = get_count_order(smp_num_siblings) ;
+
+               core_bits = get_count_order(c->x86_max_cores);
+
+               c->cpu_core_id = phys_pkg_id(index_msb) &
+                                              ((1 << core_bits) - 1);
+       }
+out:
+       if ((c->x86_max_cores * smp_num_siblings) > 1) {
+               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
+               printk(KERN_INFO  "CPU: Processor Core ID: %d\n", c->cpu_core_id);
+       }
+
+#endif
+}
+
+/*
+ * find out the number of processor cores on the die
+ */
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+       unsigned int eax, unused;
+
+       /* without CPUID leaf 4 a single core is reported */
+       if (c->cpuid_level < 4)
+               return 1;
+
+       cpuid_count(4, 0, &eax, &unused, &unused, &unused);
+
+       /* low five bits of eax zero: also report a single core */
+       if (!(eax & 0x1f))
+               return 1;
+
+       /* otherwise bits 26-31 hold the core count minus one */
+       return (eax >> 26) + 1;
+}
+
+/*
+ * With CONFIG_NUMA, bind the current CPU to the node that apicid_to_node[]
+ * lists for its APIC id, or to the first online node if there is no entry,
+ * and log the mapping. Does nothing without CONFIG_NUMA.
+ */
+static void srat_detect_node(void)
+{
+#ifdef CONFIG_NUMA
+       unsigned node;
+       int cpu = smp_processor_id();
+       int apicid = hard_smp_processor_id();
+
+       /* Don't do the funky fallback heuristics the AMD version employs
+          for now. */
+       node = apicid_to_node[apicid];
+       if (node == NUMA_NO_NODE)
+               node = first_node(node_online_map);
+       numa_set_node(cpu, node);
+
+       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+/*
+ * Intel-specific CPU setup for @c: cache info, capability bits (perfmon,
+ * BTS/PEBS, constant TSC, REP_GOOD, SYNC_RDTSC), address widths, cache
+ * alignment, core count and NUMA node binding.
+ */
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+{
+       /* Cache sizes */
+       unsigned n;
+
+       init_intel_cacheinfo(c);
+       if (c->cpuid_level > 9 ) {
+               unsigned eax = cpuid_eax(10);
+               /* Check for version and the number of counters */
+               if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+                       set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
+       }
+
+       if (cpu_has_ds) {
+               unsigned int l1, l2;
+               rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+               /* the feature is flagged when bit 11 / bit 12 of the low word is clear */
+               if (!(l1 & (1<<11)))
+                       set_bit(X86_FEATURE_BTS, c->x86_capability);
+               if (!(l1 & (1<<12)))
+                       set_bit(X86_FEATURE_PEBS, c->x86_capability);
+       }
+
+       n = c->extended_cpuid_level;
+       if (n >= 0x80000008) {
+               unsigned eax = cpuid_eax(0x80000008);
+               c->x86_virt_bits = (eax >> 8) & 0xff;
+               c->x86_phys_bits = eax & 0xff;
+               /* CPUID workaround for Intel 0F34 CPU */
+               if (c->x86_vendor == X86_VENDOR_INTEL &&
+                   c->x86 == 0xF && c->x86_model == 0x3 &&
+                   c->x86_mask == 0x4)
+                       c->x86_phys_bits = 36;
+       }
+
+       /* family 15 uses twice the clflush size as cache alignment */
+       if (c->x86 == 15)
+               c->x86_cache_alignment = c->x86_clflush_size * 2;
+       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+           (c->x86 == 0x6 && c->x86_model >= 0x0e))
+               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+       if (c->x86 == 6)
+               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+       /* SYNC_RDTSC is set for family 15 only and cleared for all others */
+       if (c->x86 == 15)
+               set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+       else
+               clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+       c->x86_max_cores = intel_num_cpu_cores(c);
+
+       srat_detect_node();
+}
+
+/*
+ * Translate the vendor string in c->x86_vendor_id into an X86_VENDOR_*
+ * value.  Anything other than "AuthenticAMD" or "GenuineIntel" becomes
+ * X86_VENDOR_UNKNOWN.
+ */
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+       char *id = c->x86_vendor_id;
+
+       if (strcmp(id, "AuthenticAMD") == 0) {
+               c->x86_vendor = X86_VENDOR_AMD;
+               return;
+       }
+       if (strcmp(id, "GenuineIntel") == 0) {
+               c->x86_vendor = X86_VENDOR_INTEL;
+               return;
+       }
+       c->x86_vendor = X86_VENDOR_UNKNOWN;
+}
+/* Groups a vendor id, a family number and sixteen name-string slots;
+   presumably indexed by model number -- NOTE(review): confirm at users. */
+struct cpu_model_info {
+       int vendor;
+       int family;
+       char *model_names[16];
+};
+
+/* Do some early cpuid on the boot CPU to get some parameters that are
+   needed before check_bugs. Everything advanced is in identify_cpu
+   below.
+
+   Resets the identification fields of *c to defaults, then fills in
+   cpuid_level and the vendor string from CPUID leaf 0 and, when leaf 1
+   exists, family/model/stepping, capability words 0 and 4 and the
+   CLFLUSH line size. */
+void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
+{
+       u32 tfms;       /* first output of cpuid(1): packed family/model/stepping */
+
+       c->loops_per_jiffy = loops_per_jiffy;
+       c->x86_cache_size = -1; /* negative: show_cpuinfo() omits the line */
+       c->x86_vendor = X86_VENDOR_UNKNOWN;
+       c->x86_model = c->x86_mask = 0; /* So far unknown... */
+       c->x86_vendor_id[0] = '\0'; /* Unset */
+       c->x86_model_id[0] = '\0';  /* Unset */
+       c->x86_clflush_size = 64;
+       c->x86_cache_alignment = c->x86_clflush_size;
+       c->x86_max_cores = 1;
+       c->extended_cpuid_level = 0;
+       memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+       /* Get vendor name: the three result words after cpuid_level land at
+          bytes 0, 8 and 4 of x86_vendor_id, in that argument order */
+       cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+             (unsigned int *)&c->x86_vendor_id[0],
+             (unsigned int *)&c->x86_vendor_id[8],
+             (unsigned int *)&c->x86_vendor_id[4]);
+               
+       get_cpu_vendor(c);
+
+       /* Initialize the standard set of capabilities */
+       /* Note that the vendor-specific code below might override */
+
+       /* Intel-defined flags: level 0x00000001 */
+       if (c->cpuid_level >= 0x00000001) {
+               __u32 misc;
+               cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
+                     &c->x86_capability[0]);
+               c->x86 = (tfms >> 8) & 0xf;     /* family: bits 11:8 */
+               c->x86_model = (tfms >> 4) & 0xf;       /* model: bits 7:4 */
+               c->x86_mask = tfms & 0xf;       /* stepping: bits 3:0 */
+               if (c->x86 == 0xf)      /* bits 27:20 are added on top of family 0xf */
+                       c->x86 += (tfms >> 20) & 0xff;
+               if (c->x86 >= 0x6)      /* bits 19:16 become the model's high nibble */
+                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
+               if (c->x86_capability[0] & (1<<19))     /* bit 19 is "clflush" in x86_cap_flags */
+                       c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+       } else {
+               /* Have CPUID level 0 only - unheard of */
+               c->x86 = 4;
+       }
+
+#ifdef CONFIG_SMP
+       c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;  /* bits 31:24 of cpuid_ebx(1) */
+#endif
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ *
+ * Runs early_identify_cpu(), reads the extended (0x8000xxxx) and Transmeta
+ * (0x8086xxxx) CPUID ranges, calls the vendor-specific init routine and,
+ * for every CPU other than boot_cpu_data, ANDs the resulting capability
+ * words into boot_cpu_data.
+ */
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+       int i;
+       u32 xlvl;       /* highest leaf reported for the range being probed */
+
+       early_identify_cpu(c);
+
+       /* AMD-defined flags: level 0x80000001 */
+       xlvl = cpuid_eax(0x80000000);
+       c->extended_cpuid_level = xlvl;
+       if ((xlvl & 0xffff0000) == 0x80000000) {        /* upper half must read 0x8000 */
+               if (xlvl >= 0x80000001) {
+                       c->x86_capability[1] = cpuid_edx(0x80000001);
+                       c->x86_capability[6] = cpuid_ecx(0x80000001);
+               }
+               if (xlvl >= 0x80000004)
+                       get_model_name(c); /* Default name */
+       }
+
+       /* Transmeta-defined flags: level 0x80860001 */
+       xlvl = cpuid_eax(0x80860000);
+       if ((xlvl & 0xffff0000) == 0x80860000) {        /* upper half must read 0x8086 */
+               /* Don't set x86_cpuid_level here for now to not confuse. */
+               if (xlvl >= 0x80860001)
+                       c->x86_capability[2] = cpuid_edx(0x80860001);
+       }
+
+       init_scattered_cpuid_features(c);
+
+       c->apicid = phys_pkg_id(0);
+
+       /*
+        * Vendor-specific initialization.  In this section we
+        * canonicalize the feature flags, meaning if there are
+        * features a certain CPU supports which CPUID doesn't
+        * tell us, CPUID claiming incorrect flags, or other bugs,
+        * we handle them here.
+        *
+        * At the end of this section, c->x86_capability better
+        * indicate the features this CPU genuinely supports!
+        */
+       switch (c->x86_vendor) {
+       case X86_VENDOR_AMD:
+               init_amd(c);
+               break;
+
+       case X86_VENDOR_INTEL:
+               init_intel(c);
+               break;
+
+       case X86_VENDOR_UNKNOWN:
+       default:
+               display_cacheinfo(c);
+               break;
+       }
+
+       select_idle_routine(c);
+       detect_ht(c); 
+
+       /*
+        * On SMP, boot_cpu_data holds the common feature set between
+        * all CPUs; so make sure that we indicate which features are
+        * common between the CPUs.  The first time this routine gets
+        * executed, c == &boot_cpu_data.
+        */
+       if (c != &boot_cpu_data) {
+               /* AND the already accumulated flags with these */
+               for (i = 0 ; i < NCAPINTS ; i++)
+                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+       }
+
+#ifdef CONFIG_X86_MCE
+       mcheck_init(c);
+#endif
+       if (c != &boot_cpu_data)        /* skipped for boot_cpu_data itself */
+               mtrr_ap_init();
+#ifdef CONFIG_NUMA
+       numa_add_cpu(smp_processor_id());
+#endif
+}
+
+/*
+ * Print the model name (when one is set) followed by the stepping.  The
+ * line is terminated even when no stepping is printed.
+ */
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+{
+       int have_stepping = c->x86_mask || c->cpuid_level >= 0;
+
+       if (c->x86_model_id[0] != '\0')
+               printk("%s", c->x86_model_id);
+
+       if (!have_stepping) {
+               printk("\n");
+               return;
+       }
+       printk(" stepping %02x\n", c->x86_mask);
+}
+
+/*
+ *     Get CPU information for use by the procfs.
+ */
+
+static int show_cpuinfo(struct seq_file *m, void *v)
+{
+       struct cpuinfo_x86 *c = v;
+
+       /* 
+        * These flag bits must match the definitions in <asm/cpufeature.h>.
+        * NULL means this bit is undefined or reserved; either way it doesn't
+        * have meaning as far as Linux is concerned.  Note that it's important
+        * to realize there is a difference between this table and CPUID -- if
+        * applications want to get the raw CPUID data, they should access
+        * /dev/cpu/<cpu_nr>/cpuid instead.
+        */
+       static char *x86_cap_flags[] = {
+               /* Intel-defined */
+               "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
+               "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
+               "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
+               "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
+
+               /* AMD-defined */
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
+               NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
+               "3dnowext", "3dnow",
+
+               /* Transmeta-defined */
+               "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+               /* Other (Linux-defined) */
+               "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
+               NULL, NULL, NULL, NULL,
+               "constant_tsc", "up", NULL, "arch_perfmon",
+               "pebs", "bts", NULL, "sync_rdtsc",
+               "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+               /* Intel-defined (#2) */
+               "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
+               "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+               NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+               /* VIA/Cyrix/Centaur-defined */
+               NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
+               "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+               /* AMD-defined (#2) */
+               "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
+               "altmovcr8", "abm", "sse4a",
+               "misalignsse", "3dnowprefetch",
+               "osvw", "ibs", NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+
+               /* Auxiliary (Linux-defined) */
+               "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+       };
+       static char *x86_power_flags[] = { 
+               "ts",   /* temperature sensor */
+               "fid",  /* frequency id control */
+               "vid",  /* voltage id control */
+               "ttp",  /* thermal trip */
+               "tm",
+               "stc",
+               "100mhzsteps",
+               "hwpstate",
+               "",     /* tsc invariant mapped to constant_tsc */
+               /* nothing */
+       };
+
+
+#ifdef CONFIG_SMP
+       if (!cpu_online(c-cpu_data))
+               return 0;
+#endif
+
+       seq_printf(m,"processor\t: %u\n"
+                    "vendor_id\t: %s\n"
+                    "cpu family\t: %d\n"
+                    "model\t\t: %d\n"
+                    "model name\t: %s\n",
+                    (unsigned)(c-cpu_data),
+                    c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
+                    c->x86,
+                    (int)c->x86_model,
+                    c->x86_model_id[0] ? c->x86_model_id : "unknown");
+       
+       if (c->x86_mask || c->cpuid_level >= 0)
+               seq_printf(m, "stepping\t: %d\n", c->x86_mask);
+       else
+               seq_printf(m, "stepping\t: unknown\n");
+       
+       if (cpu_has(c,X86_FEATURE_TSC)) {
+               unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
+               if (!freq)
+                       freq = cpu_khz;
+               seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
+                            freq / 1000, (freq % 1000));
+       }
+
+       /* Cache size */
+       if (c->x86_cache_size >= 0) 
+               seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
+       
+#ifdef CONFIG_SMP
+       if (smp_num_siblings * c->x86_max_cores > 1) {
+               int cpu = c - cpu_data;
+               seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+               seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
+               seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+               seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+       }
+#endif 
+
+       seq_printf(m,
+               "fpu\t\t: yes\n"
+               "fpu_exception\t: yes\n"
+               "cpuid level\t: %d\n"
+               "wp\t\t: yes\n"
+               "flags\t\t:",
+                  c->cpuid_level);
+
+       { 
+               int i; 
+               for ( i = 0 ; i < 32*NCAPINTS ; i++ )
+                       if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+                               seq_printf(m, " %s", x86_cap_flags[i]);
+       }
+               
+       seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
+                  c->loops_per_jiffy/(500000/HZ),
+                  (c->loops_per_jiffy/(5000/HZ)) % 100);
+
+       if (c->x86_tlbsize > 0) 
+               seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
+       seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
+       seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
+
+       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 
+                  c->x86_phys_bits, c->x86_virt_bits);
+
+       seq_printf(m, "power management:");
+       {
+               unsigned i;
+               for (i = 0; i < 32; i++) 
+                       if (c->x86_power & (1 << i)) {
+                               if (i < ARRAY_SIZE(x86_power_flags) &&
+                                       x86_power_flags[i])
+                                       seq_printf(m, "%s%s",
+                                               x86_power_flags[i][0]?" ":"",
+                                               x86_power_flags[i]);
+                               else
+                                       seq_printf(m, " [%d]", i);
+                       }
+       }
+
+       seq_printf(m, "\n\n");
+
+       return 0;
+}
+
+/* seq_file start: hand out the cpu_data slot for *pos, NULL once *pos
+   reaches NR_CPUS. */
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+       if (*pos >= NR_CPUS)
+               return NULL;
+       return cpu_data + *pos;
+}
+
+/* seq_file next: step *pos forward by one and look that slot up through
+   c_start(). */
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       *pos += 1;
+       return c_start(m, pos);
+}
+/* seq_file stop: c_start() acquires nothing, so nothing is released here. */
+static void c_stop(struct seq_file *m, void *v)
+{
+}
+
+/* seq_file iterator that walks cpu_data and prints one show_cpuinfo()
+   record per slot. */
+struct seq_operations cpuinfo_op = {
+       .start  = c_start,
+       .next   = c_next,
+       .stop   = c_stop,
+       .show   = show_cpuinfo,
+};
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
new file mode 100644 (file)
index 0000000..739175b
--- /dev/null
@@ -0,0 +1,495 @@
+/*
+ *  linux/arch/x86_64/kernel/signal.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ *
+ *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
+ *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
+ *  2000-2002   x86-64 support by Andi Kleen
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/ptrace.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/compiler.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include <asm/proto.h>
+#include <asm/ia32_unistd.h>
+#include <asm/mce.h>
+
+/* #define DEBUG_SIG 1 */
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+               sigset_t *set, struct pt_regs * regs); 
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+            sigset_t *set, struct pt_regs * regs); 
+
+/*
+ * sigaltstack entry point: hands @uss and @uoss to do_sigaltstack()
+ * together with the stack pointer saved in @regs.
+ */
+asmlinkage long
+sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
+               struct pt_regs *regs)
+{
+       unsigned long user_sp = regs->rsp;
+
+       return do_sigaltstack(uss, uoss, user_sp);
+}
+
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+struct rt_sigframe
+{
+       char __user *pretcode;  /* setup_rt_frame() stores ka->sa.sa_restorer here */
+       struct ucontext uc;     /* flags, link, altstack, sigcontext and sigmask */
+       struct siginfo info;    /* written by setup_rt_frame() only for SA_SIGINFO */
+};
+/*
+ * Reload user register state from the sigcontext at @sc into @regs.
+ *
+ * The general purpose registers, rip and rsp are copied as-is, cs is forced
+ * to privilege level 3, only the eflags bits in mask 0x40DD5 are taken over
+ * and the FPU state is either restored from sc->fpstate or dropped.  The
+ * saved rax is not written to @regs but stored through @prax.
+ *
+ * Returns 0 on success and nonzero when a user access failed or the
+ * fpstate pointer did not pass access_ok().
+ */
+static int
+restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
+{
+       unsigned int err = 0;   /* OR of every user-access result */
+
+       /* Always make any pending restarted system calls return -EINTR */
+       current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+#define COPY(x)                err |= __get_user(regs->x, &sc->x)
+
+       COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
+       COPY(rdx); COPY(rcx); COPY(rip);
+       COPY(r8);
+       COPY(r9);
+       COPY(r10);
+       COPY(r11);
+       COPY(r12);
+       COPY(r13);
+       COPY(r14);
+       COPY(r15);
+
+       /* Kernel saves and restores only the CS segment register on signals,
+        * which is the bare minimum needed to allow mixed 32/64-bit code.
+        * App's signal handler can save/restore other segments if needed. */
+       {
+               unsigned cs;
+               err |= __get_user(cs, &sc->cs);
+               regs->cs = cs | 3;      /* Force into user mode */
+       }
+
+       {       /* mask 0x40DD5 = CF, PF, AF, ZF, SF, TF, DF, OF and AC; other bits are kept */
+               unsigned int tmpflags;
+               err |= __get_user(tmpflags, &sc->eflags);
+               regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
+               regs->orig_rax = -1;            /* disable syscall checks */
+       }
+
+       {       /* FPU state: restore it from the frame, or drop it when none was saved */
+               struct _fpstate __user * buf;
+               err |= __get_user(buf, &sc->fpstate);
+
+               if (buf) {
+                       if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
+                               goto badframe;
+                       err |= restore_i387(buf);
+               } else {
+                       struct task_struct *me = current;
+                       if (used_math()) {
+                               clear_fpu(me);
+                               clear_used_math();
+                       }
+               }
+       }
+
+       err |= __get_user(*prax, &sc->rax);     /* rax goes to the caller, not to regs */
+       return err;
+
+badframe:
+       return 1;
+}
+
+asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+{
+       struct rt_sigframe __user *frame;
+       sigset_t set;
+       unsigned long eax;
+
+       frame = (struct rt_sigframe __user *)(regs->rsp - 8);
+       if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
+               goto badframe;
+       } 
+       if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { 
+               goto badframe;
+       } 
+
+       sigdelsetmask(&set, ~_BLOCKABLE);
+       spin_lock_irq(&current->sighand->siglock);
+       current->blocked = set;
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+       
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+               goto badframe;
+
+#ifdef DEBUG_SIG
+       printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax);
+#endif
+
+       if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
+               goto badframe;
+
+       return eax;
+
+badframe:
+       signal_fault(regs,frame,"sigreturn");
+       return 0;
+}      
+
+/*
+ * Set up a signal frame.
+ */
+
+static inline int
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+{
+       int err = 0;
+
+       err |= __put_user(regs->cs, &sc->cs);
+       err |= __put_user(0, &sc->gs);
+       err |= __put_user(0, &sc->fs);
+
+       err |= __put_user(regs->rdi, &sc->rdi);
+       err |= __put_user(regs->rsi, &sc->rsi);
+       err |= __put_user(regs->rbp, &sc->rbp);
+       err |= __put_user(regs->rsp, &sc->rsp);
+       err |= __put_user(regs->rbx, &sc->rbx);
+       err |= __put_user(regs->rdx, &sc->rdx);
+       err |= __put_user(regs->rcx, &sc->rcx);
+       err |= __put_user(regs->rax, &sc->rax);
+       err |= __put_user(regs->r8, &sc->r8);
+       err |= __put_user(regs->r9, &sc->r9);
+       err |= __put_user(regs->r10, &sc->r10);
+       err |= __put_user(regs->r11, &sc->r11);
+       err |= __put_user(regs->r12, &sc->r12);
+       err |= __put_user(regs->r13, &sc->r13);
+       err |= __put_user(regs->r14, &sc->r14);
+       err |= __put_user(regs->r15, &sc->r15);
+       err |= __put_user(me->thread.trap_no, &sc->trapno);
+       err |= __put_user(me->thread.error_code, &sc->err);
+       err |= __put_user(regs->rip, &sc->rip);
+       err |= __put_user(regs->eflags, &sc->eflags);
+       err |= __put_user(mask, &sc->oldmask);
+       err |= __put_user(me->thread.cr2, &sc->cr2);
+
+       return err;
+}
+
+/*
+ * Determine which stack to use..
+ *
+ * Starts 128 bytes below the interrupted rsp (the red zone) and switches
+ * to the top of the alternate signal stack when the handler has SA_ONSTACK
+ * set and sas_ss_flags() returns 0 for that address.  @size bytes are then
+ * reserved and the result is rounded down to a 16-byte boundary.
+ */
+
+static void __user *
+get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
+{
+       /* Default to using normal stack - redzone */
+       unsigned long sp = regs->rsp - 128;
+       int want_altstack = (ka->sa.sa_flags & SA_ONSTACK) != 0;
+
+       /* This is the X/Open sanctioned signal stack switching.  */
+       if (want_altstack && sas_ss_flags(sp) == 0)
+               sp = current->sas_ss_sp + current->sas_ss_size;
+
+       return (void __user *)round_down(sp - size, 16);
+}
+
+static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+                          sigset_t *set, struct pt_regs * regs)
+{
+       struct rt_sigframe __user *frame;
+       struct _fpstate __user *fp = NULL; 
+       int err = 0;
+       struct task_struct *me = current;
+
+       if (used_math()) {
+               fp = get_stack(ka, regs, sizeof(struct _fpstate)); 
+               frame = (void __user *)round_down(
+                       (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
+
+               if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
+                       goto give_sigsegv;
+
+               if (save_i387(fp) < 0) 
+                       err |= -1; 
+       } else
+               frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
+
+       if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+               goto give_sigsegv;
+
+       if (ka->sa.sa_flags & SA_SIGINFO) { 
+               err |= copy_siginfo_to_user(&frame->info, info);
+               if (err)
+                       goto give_sigsegv;
+       }
+               
+       /* Create the ucontext.  */
+       err |= __put_user(0, &frame->uc.uc_flags);
+       err |= __put_user(0, &frame->uc.uc_link);
+       err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+       err |= __put_user(sas_ss_flags(regs->rsp),
+                         &frame->uc.uc_stack.ss_flags);
+       err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
+       err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
+       err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
+       if (sizeof(*set) == 16) { 
+               __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
+               __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 
+       } else
+               err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+       /* Set up to return from userspace.  If provided, use a stub
+          already in userspace.  */
+       /* x86-64 should always use SA_RESTORER. */
+       if (ka->sa.sa_flags & SA_RESTORER) {
+               err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
+       } else {
+               /* could use a vstub here */
+               goto give_sigsegv; 
+       }
+
+       if (err)
+               goto give_sigsegv;
+
+#ifdef DEBUG_SIG
+       printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
+#endif
+
+       /* Set up registers for signal handler */
+       regs->rdi = sig;
+       /* In case the signal handler was declared without prototypes */ 
+       regs->rax = 0;  
+
+       /* This also works for non SA_SIGINFO handlers because they expect the
+          next argument after the signal number on the stack. */
+       regs->rsi = (unsigned long)&frame->info; 
+       regs->rdx = (unsigned long)&frame->uc; 
+       regs->rip = (unsigned long) ka->sa.sa_handler;
+
+       regs->rsp = (unsigned long)frame;
+
+       /* Set up the CS register to run signal handlers in 64-bit mode,
+          even if the handler happens to be interrupting 32-bit code. */
+       regs->cs = __USER_CS;
+
+       /* This, by contrast, has nothing to do with segment registers -
+          see include/asm-x86_64/uaccess.h for details. */
+       set_fs(USER_DS);
+
+       regs->eflags &= ~TF_MASK;
+       if (test_thread_flag(TIF_SINGLESTEP))
+               ptrace_notify(SIGTRAP);
+#ifdef DEBUG_SIG
+       printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
+               current->comm, current->pid, frame, regs->rip, frame->pretcode);
+#endif
+
+       return 0;
+
+give_sigsegv:
+       force_sigsegv(sig, current);
+       return -EFAULT;
+}
+
+/*
+ * OK, we're invoking a handler
+ *
+ * Adjusts the result of an interrupted system call (fail with -EINTR or
+ * rewind for a restart), builds the user signal frame and, when that
+ * succeeded, adds ka->sa.sa_mask and (unless SA_NODEFER) @sig itself to the
+ * blocked set.  Returns whatever the frame setup routine returned.
+ */    
+
+static int
+handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
+               sigset_t *oldset, struct pt_regs *regs)
+{
+       int ret;
+
+#ifdef DEBUG_SIG
+       printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
+               current->pid, sig,
+               regs->rip, regs->rsp, regs);
+#endif
+
+       /* Are we from a system call? */
+       if ((long)regs->orig_rax >= 0) {
+               /* If so, check system call restarting.. */
+               switch (regs->rax) {
+                       case -ERESTART_RESTARTBLOCK:
+                       case -ERESTARTNOHAND:
+                               regs->rax = -EINTR;
+                               break;
+
+                       case -ERESTARTSYS:
+                               if (!(ka->sa.sa_flags & SA_RESTART)) {
+                                       regs->rax = -EINTR;
+                                       break;
+                               }
+                               /* fallthrough */
+                       case -ERESTARTNOINTR:
+                               regs->rax = regs->orig_rax;     /* reload the syscall number */
+                               regs->rip -= 2; /* presumably the length of the syscall instruction -- confirm */
+                               break;
+               }
+       }
+
+       /*
+        * If TF is set due to a debugger (PT_DTRACE), clear the TF
+        * flag so that register information in the sigcontext is
+        * correct.
+        */
+       if (unlikely(regs->eflags & TF_MASK)) {
+               if (likely(current->ptrace & PT_DTRACE)) {
+                       current->ptrace &= ~PT_DTRACE;
+                       regs->eflags &= ~TF_MASK;
+               }
+       }
+
+#ifdef CONFIG_IA32_EMULATION
+       if (test_thread_flag(TIF_IA32)) {       /* TIF_IA32 tasks get an ia32 frame */
+               if (ka->sa.sa_flags & SA_SIGINFO)
+                       ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
+               else
+                       ret = ia32_setup_frame(sig, ka, oldset, regs);
+       } else 
+#endif
+       ret = setup_rt_frame(sig, ka, info, oldset, regs);
+
+       if (ret == 0) { /* frame is in place: update the blocked set */
+               spin_lock_irq(&current->sighand->siglock);
+               sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
+               if (!(ka->sa.sa_flags & SA_NODEFER))
+                       sigaddset(&current->blocked,sig);
+               recalc_sigpending();
+               spin_unlock_irq(&current->sighand->siglock);
+       }
+
+       return ret;
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ *
+ * Delivers at most one pending signal to the current task.  When no signal
+ * is delivered, a system call that returned one of the restart codes is
+ * rewound so that it runs again, and a signal mask saved under
+ * TIF_RESTORE_SIGMASK is put back.
+ */
+static void do_signal(struct pt_regs *regs)
+{
+       struct k_sigaction ka;
+       siginfo_t info;
+       int signr;
+       sigset_t *oldset;       /* mask that handle_signal() stores in the frame */
+
+       /*
+        * We want the common case to go fast, which
+        * is why we may in certain cases get here from
+        * kernel mode. Just return without doing anything
+        * if so.
+        */
+       if (!user_mode(regs))
+               return;
+
+       if (test_thread_flag(TIF_RESTORE_SIGMASK))
+               oldset = &current->saved_sigmask;
+       else
+               oldset = &current->blocked;
+
+       signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+       if (signr > 0) {
+               /* Reenable any watchpoints before delivering the
+                * signal to user space. The processor register will
+                * have been cleared if the watchpoint triggered
+                * inside the kernel.
+                */
+               if (current->thread.debugreg7)
+                       set_debugreg(current->thread.debugreg7, 7);
+
+               /* Whee!  Actually deliver the signal.  */
+               if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+                       /* a signal was successfully delivered; the saved
+                        * sigmask will have been stored in the signal frame,
+                        * and will be restored by sigreturn, so we can simply
+                        * clear the TIF_RESTORE_SIGMASK flag */
+                       clear_thread_flag(TIF_RESTORE_SIGMASK);
+               }
+               return;
+       }
+
+       /* Did we come from a system call? */
+       if ((long)regs->orig_rax >= 0) {
+               /* Restart the system call - no handlers present */
+               long res = regs->rax;
+               switch (res) {
+               case -ERESTARTNOHAND:
+               case -ERESTARTSYS:
+               case -ERESTARTNOINTR:
+                       regs->rax = regs->orig_rax;     /* reload the syscall number */
+                       regs->rip -= 2; /* presumably the length of the syscall instruction -- confirm */
+                       break;
+               case -ERESTART_RESTARTBLOCK:
+                       /* issue restart_syscall instead, using the number for the task's ABI */
+                       regs->rax = test_thread_flag(TIF_IA32) ?
+                                       __NR_ia32_restart_syscall :
+                                       __NR_restart_syscall;
+                       regs->rip -= 2;
+                       break;
+               }
+       }
+
+       /* if there's no signal to deliver, we just put the saved sigmask
+          back. */
+       if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
+               clear_thread_flag(TIF_RESTORE_SIGMASK);
+               sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+       }
+}
+
+/*
+ * Act on the work bits in @thread_info_flags: turn a pending single-step
+ * request into TF in the saved eflags, notify user space of machine checks
+ * (with CONFIG_X86_MCE) and run signal delivery.  @unused is not looked at.
+ */
+void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
+{
+       int singlestep = (thread_info_flags & _TIF_SINGLESTEP) != 0;
+
+#ifdef DEBUG_SIG
+       printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n",
+              thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); 
+#endif
+
+       /* Pending single-step? */
+       if (singlestep) {
+               regs->eflags |= TF_MASK;
+               clear_thread_flag(TIF_SINGLESTEP);
+       }
+
+#ifdef CONFIG_X86_MCE
+       /* notify userspace of pending MCEs */
+       if (thread_info_flags & _TIF_MCE_NOTIFY)
+               mce_notify_user();
+#endif /* CONFIG_X86_MCE */
+
+       /* deal with pending signal delivery */
+       if (!(thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)))
+               return;
+       do_signal(regs);
+}
+
+/*
+ * Report a bad signal frame: log it when show_unhandled_signals is set and
+ * the printk rate limit allows, then force SIGSEGV on the current task.
+ * @where names the code path that found the problem.
+ */
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+{
+       struct task_struct *tsk = current;
+       int log_it = show_unhandled_signals && printk_ratelimit();
+
+       if (log_it)
+               printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
+                      tsk->comm, tsk->pid, where, frame,
+                      regs->rip, regs->rsp, regs->orig_rax);
+
+       force_sig(SIGSEGV, tsk);
+}
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c
new file mode 100644 (file)
index 0000000..df4a828
--- /dev/null
@@ -0,0 +1,523 @@
+/*
+ *     Intel SMP support routines.
+ *
+ *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ *      (c) 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ *     This code is released under the GNU General Public License version 2 or
+ *     later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/interrupt.h>
+
+#include <asm/mtrr.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mach_apic.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/apicdef.h>
+#include <asm/idle.h>
+
+/*
+ *     Smarter SMP flushing macros. 
+ *             c/o Linus Torvalds.
+ *
+ *     These mean you can really definitely utterly forget about
+ *     writing to user space from interrupts. (It's not allowed anyway).
+ *
+ *     Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ *     More scalable flush, from Andi Kleen
+ *
+ *     To avoid global state use 8 different call vectors.
+ *     Each CPU uses a specific vector to trigger flushes on other
+ *     CPUs. Depending on the received vector the target CPUs look into
+ *     the right per cpu variable for the flush data.
+ *
+ *     With more than 8 CPUs they are hashed to the 8 available
+ *     vectors. The limited global vector space forces us to this right now.
+ *     In future when interrupts are split into per CPU domains this could be
+ *     fixed, at the cost of triggering multiple IPIs in some cases.
+ */
+
+/*
+ * One TLB-flush request slot. flush_tlb_others() fills a slot in and
+ * smp_invalidate_interrupt() on the target CPUs reads it back. The union
+ * with pad[] sizes the slot to at least SMP_CACHE_BYTES.
+ */
+union smp_flush_state {
+       struct {
+               /* CPUs that still have to process the current request */
+               cpumask_t flush_cpumask;
+               /* mm whose TLB entries are to be flushed */
+               struct mm_struct *flush_mm;
+               /* single address to flush, or FLUSH_ALL for everything */
+               unsigned long flush_va;
+#define FLUSH_ALL      -1ULL
+               /* serialises the senders that share this slot */
+               spinlock_t tlbstate_lock;
+       };
+       char pad[SMP_CACHE_BYTES];
+} ____cacheline_aligned;
+
+/* State is put into the per CPU data section, but padded
+   to a full cache line because other CPUs can access it and we don't
+   want false sharing in the per cpu data segment. */
+static DEFINE_PER_CPU(union smp_flush_state, flush_state);
+
+/*
+ * Drop this CPU out of the mm recorded in the PDA: clear its bit in that
+ * mm's cpu_vm_mask and switch cr3 to swapper_pg_dir.
+ *
+ * We cannot call mmdrop() because we are in interrupt context, so only
+ * mm->cpu_vm_mask is updated. Calling this while mmu_state is TLBSTATE_OK
+ * is a bug.
+ */
+static inline void leave_mm(int cpu)
+{
+       BUG_ON(read_pda(mmu_state) == TLBSTATE_OK);
+
+       cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
+       load_cr3(swapper_pg_dir);
+}
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *     Stop ipi delivery for the old mm. This is not synchronized with
+ *     the other cpus, but smp_invalidate_interrupt ignores flush ipis
+ *     for the wrong mm, and in the worst case we perform a superfluous
+ *     tlb flush.
+ * 1a2) set cpu mmu_state to TLBSTATE_OK
+ *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *     was in lazy tlb mode.
+ * 1a3) update cpu active_mm
+ *     Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ *     Now the other cpus will send tlb flush ipis.
+ * 1a5) change cr3.
+ * 1b) thread switch without mm change
+ *     cpu active_mm is correct, cpu0 already handles
+ *     flush ipis.
+ * 1b1) set cpu mmu_state to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ *     Atomically set the bit [other cpus will start sending flush ipis],
+ *     and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu mmu_state is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * Interrupts are disabled.
+ */
+
+/*
+ * smp_invalidate_interrupt - handler for the TLB-invalidate IPI vectors.
+ * @regs: interrupt frame; orig_rax encodes which vector fired.
+ *
+ * Looks up the sender's flush_state slot from the vector number, performs
+ * the requested flush (or leaves the mm when in lazy TLB mode) and then
+ * removes this CPU from the slot's flush_cpumask.
+ */
+asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
+{
+       int cpu;
+       int sender;
+       union smp_flush_state *f;
+
+       cpu = smp_processor_id();
+       /*
+        * orig_rax contains the negated interrupt vector.
+        * Use that to determine where the sender put the data.
+        */
+       sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
+       f = &per_cpu(flush_state, sender);
+
+       /* Not addressed by this request: just acknowledge and leave. */
+       if (!cpu_isset(cpu, f->flush_cpumask))
+               goto out;
+               /* 
+                * This was a BUG() but until someone can quote me the
+                * line from the intel manual that guarantees an IPI to
+                * multiple CPUs is retried _only_ on the erroring CPUs
+                * it's staying as a return
+                *
+                * BUG();
+                */
+                
+       /* Requests for an mm this CPU is not using are ignored. */
+       if (f->flush_mm == read_pda(active_mm)) {
+               if (read_pda(mmu_state) == TLBSTATE_OK) {
+                       if (f->flush_va == FLUSH_ALL)
+                               local_flush_tlb();
+                       else
+                               __flush_tlb_one(f->flush_va);
+               } else
+                       leave_mm(cpu);
+       }
+out:
+       ack_APIC_irq();
+       /*
+        * Clearing our bit is the completion signal: flush_tlb_others()
+        * spins until flush_cpumask is empty and then resets the slot, so
+        * this has to be the last access to *f.
+        */
+       cpu_clear(cpu, f->flush_cpumask);
+}
+
+/*
+ * flush_tlb_others - ask the CPUs in @cpumask to flush TLB entries of @mm.
+ * @cpumask: target CPUs; the callers in this file remove the local CPU.
+ * @mm: address space the flush applies to.
+ * @va: single virtual address to flush, or FLUSH_ALL.
+ *
+ * Publishes the request in the flush_state slot selected by the sending
+ * CPU's number, sends the matching invalidate vector and busy-waits until
+ * every target has cleared itself from flush_cpumask. Must be called with
+ * preemption disabled.
+ */
+static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+                                               unsigned long va)
+{
+       int sender;
+       union smp_flush_state *f;
+
+       /* Caller has disabled preemption */
+       sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+       f = &per_cpu(flush_state, sender);
+
+       /* Could avoid this lock when
+          num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+          probably not worth checking this for a cache-hot lock. */
+       spin_lock(&f->tlbstate_lock);
+
+       /* Fill in the request before any target can be interrupted. */
+       f->flush_mm = mm;
+       f->flush_va = va;
+       cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
+
+       /*
+        * We have to send the IPI only to
+        * CPUs affected.
+        */
+       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+
+       /* Each target clears its own bit in smp_invalidate_interrupt(). */
+       while (!cpus_empty(f->flush_cpumask))
+               cpu_relax();
+
+       /* Reset the slot for the next sender before dropping the lock. */
+       f->flush_mm = NULL;
+       f->flush_va = 0;
+       spin_unlock(&f->tlbstate_lock);
+}
+
+/*
+ * Initialise the tlbstate_lock of the flush_state slot of every possible
+ * CPU. Registered as a core initcall; always returns 0.
+ */
+int __cpuinit init_smp_flush(void)
+{
+       int cpu;
+
+       for_each_cpu_mask(cpu, cpu_possible_map)
+               spin_lock_init(&per_cpu(flush_state, cpu).tlbstate_lock);
+
+       return 0;
+}
+
+core_initcall(init_smp_flush);
+       
+/*
+ * Flush the whole TLB for current->mm: locally first, then on every other
+ * CPU listed in the mm's cpu_vm_mask. Preemption is held off for the
+ * duration so the "other CPUs" set stays meaningful.
+ */
+void flush_tlb_current_task(void)
+{
+       struct mm_struct *mm = current->mm;
+       cpumask_t others;
+
+       preempt_disable();
+
+       /* Everybody using this mm except ourselves. */
+       others = mm->cpu_vm_mask;
+       cpu_clear(smp_processor_id(), others);
+
+       local_flush_tlb();
+
+       if (!cpus_empty(others))
+               flush_tlb_others(others, mm, FLUSH_ALL);
+
+       preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_current_task);
+
+/*
+ * flush_tlb_mm - flush all TLB entries belonging to @mm on every CPU.
+ * @mm: address space to flush.
+ *
+ * Locally this only acts when @mm is the active mm: a full flush if the
+ * task really owns an mm, otherwise leave_mm(). Remote CPUs in the mm's
+ * cpu_vm_mask are handled through flush_tlb_others().
+ */
+void flush_tlb_mm (struct mm_struct * mm)
+{
+       cpumask_t others;
+
+       preempt_disable();
+
+       others = mm->cpu_vm_mask;
+       cpu_clear(smp_processor_id(), others);
+
+       if (current->active_mm == mm) {
+               if (!current->mm)
+                       leave_mm(smp_processor_id());
+               else
+                       local_flush_tlb();
+       }
+
+       if (!cpus_empty(others))
+               flush_tlb_others(others, mm, FLUSH_ALL);
+
+       preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_mm);
+
+/*
+ * flush_tlb_page - flush the TLB entry for one address of a vma's mm.
+ * @vma: mapping whose mm the address belongs to.
+ * @va: virtual address to invalidate.
+ *
+ * Same structure as flush_tlb_mm(), but the local and remote flushes are
+ * limited to @va.
+ */
+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       cpumask_t others;
+
+       preempt_disable();
+
+       others = mm->cpu_vm_mask;
+       cpu_clear(smp_processor_id(), others);
+
+       if (current->active_mm == mm) {
+               if (!current->mm)
+                       leave_mm(smp_processor_id());
+               else
+                       __flush_tlb_one(va);
+       }
+
+       if (!cpus_empty(others))
+               flush_tlb_others(others, mm, va);
+
+       preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+/*
+ * Per-CPU worker for flush_tlb_all(): flush everything and, if this CPU is
+ * in lazy TLB mode, leave the mm as well. @info is not used.
+ */
+static void do_flush_tlb_all(void* info)
+{
+       int this_cpu = smp_processor_id();
+
+       __flush_tlb_all();
+
+       if (read_pda(mmu_state) != TLBSTATE_LAZY)
+               return;
+
+       leave_mm(this_cpu);
+}
+
+/*
+ * Run do_flush_tlb_all() on every CPU via on_each_cpu().
+ * NOTE(review): the trailing "1, 1" are presumably the retry and wait
+ * arguments of on_each_cpu() -- confirm against its prototype.
+ */
+void flush_tlb_all(void)
+{
+       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+}
+
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ *
+ * @cpu: logical number of the CPU to send RESCHEDULE_VECTOR to.
+ */
+
+void smp_send_reschedule(int cpu)
+{
+       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ *
+ * call_lock is taken around every cross-call issued in this file, so only
+ * one request is published through call_data at a time.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+       /* function the target CPUs have to run, and its argument */
+       void (*func) (void *info);
+       void *info;
+       /* number of targets that have copied func/info/wait */
+       atomic_t started;
+       /* number of targets that have returned from func (wait != 0 only) */
+       atomic_t finished;
+       /* non-zero if the initiator waits for "finished" */
+       int wait;
+};
+
+/* Request currently in flight; points into the initiator's stack frame. */
+static struct call_data_struct * call_data;
+
+/*
+ * Take call_lock with local interrupts disabled, keeping new cross-calls
+ * from starting. Used by start_secondary() while a CPU goes online.
+ */
+void lock_ipi_call_lock(void)
+{
+       spin_lock_irq(&call_lock);
+}
+
+/* Counterpart of lock_ipi_call_lock(): drop call_lock and enable interrupts. */
+void unlock_ipi_call_lock(void)
+{
+       spin_unlock_irq(&call_lock);
+}
+
+/*
+ * this function sends a 'generic call function' IPI to one other CPU
+ * in the system.
+ *
+ * cpu is a standard Linux logical CPU number.
+ *
+ * The request lives on this function's stack and is published through
+ * call_data; smp_call_function_single() holds call_lock around the call.
+ * nonatomic is not used.
+ */
+static void
+__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+                               int nonatomic, int wait)
+{
+       struct call_data_struct data;
+       /* exactly one CPU has to respond */
+       int cpus = 1;
+
+       data.func = func;
+       data.info = info;
+       atomic_set(&data.started, 0);
+       data.wait = wait;
+       if (wait)
+               atomic_set(&data.finished, 0);
+
+       call_data = &data;
+       /* make the request visible before the IPI is sent */
+       wmb();
+       /* Send a message to the target CPU and wait for it to respond */
+       send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+       /*
+        * Wait for response: the target bumps "started" once it has copied
+        * the request, after which "data" may go out of scope.
+        */
+       while (atomic_read(&data.started) != cpus)
+               cpu_relax();
+
+       if (!wait)
+               return;
+
+       /* Wait until the target has finished running func. */
+       while (atomic_read(&data.finished) != cpus)
+               cpu_relax();
+}
+
+/*
+ * smp_call_function_single - Run a function on a specific CPU
+ * @cpu: Logical number of the CPU to run @func on; may be the calling CPU.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Currently unused.
+ * @wait: If true, wait until function has completed on the other CPU.
+ *
+ * Returns 0; no failure is reported at present.
+ *
+ * Does not return until the remote CPU is nearly ready to execute @func
+ * or is or has executed.
+ *
+ * When @cpu is the calling CPU, @func is called directly with interrupts
+ * disabled and the caller's interrupt state is put back afterwards.
+ */
+
+int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
+       int nonatomic, int wait)
+{
+       unsigned long flags;
+       /* prevent preemption and reschedule on another processor */
+       int me = get_cpu();
+
+       /* Can deadlock when called with interrupts disabled */
+       WARN_ON(irqs_disabled());
+
+       if (cpu == me) {
+               /*
+                * Save/restore instead of disable/enable: a caller that
+                * (wrongly) came in with interrupts off has already been
+                * warned about above and must not get them switched on
+                * behind its back.
+                */
+               local_irq_save(flags);
+               func(info);
+               local_irq_restore(flags);
+               put_cpu();
+               return 0;
+       }
+
+       /* One cross-call at a time: call_data is a single global slot. */
+       spin_lock(&call_lock);
+       __smp_call_function_single(cpu, func, info, nonatomic, wait);
+       spin_unlock(&call_lock);
+       put_cpu();
+       return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
+
+/*
+ * this function sends a 'generic call function' IPI to all other CPUs
+ * in the system.
+ *
+ * Returns immediately when no other CPU is online. smp_call_function()
+ * holds call_lock around the call; smp_send_stop() may proceed without it.
+ * nonatomic is not used.
+ */
+static void __smp_call_function (void (*func) (void *info), void *info,
+                               int nonatomic, int wait)
+{
+       struct call_data_struct data;
+       /* number of responses to expect: everybody online except us */
+       int cpus = num_online_cpus()-1;
+
+       if (!cpus)
+               return;
+
+       data.func = func;
+       data.info = info;
+       atomic_set(&data.started, 0);
+       data.wait = wait;
+       if (wait)
+               atomic_set(&data.finished, 0);
+
+       call_data = &data;
+       /* make the request visible before the IPI is sent */
+       wmb();
+       /* Send a message to all other CPUs and wait for them to respond */
+       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+       /*
+        * Wait for response: every target bumps "started" once it has
+        * copied the request, after which "data" may go out of scope.
+        */
+       while (atomic_read(&data.started) != cpus)
+               cpu_relax();
+
+       if (!wait)
+               return;
+
+       /* Wait until every target has finished running func. */
+       while (atomic_read(&data.finished) != cpus)
+               cpu_relax();
+}
+
+/*
+ * smp_call_function - run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: currently unused.
+ * @wait: If true, wait (atomically) until function has completed on other
+ *        CPUs.
+ *
+ * Returns 0 on success, else a negative status code. Does not return until
+ * remote CPUs are nearly ready to execute func or are or have executed.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ * Actually there are a few legal cases, like panic.
+ */
+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+                       int wait)
+{
+       /* call_lock serialises use of the single global call_data slot. */
+       spin_lock(&call_lock);
+       __smp_call_function(func,info,nonatomic,wait);
+       spin_unlock(&call_lock);
+       /* no failure path exists at present */
+       return 0;
+}
+EXPORT_SYMBOL(smp_call_function);
+
+/*
+ * Cross-call target used by smp_send_stop(): take this CPU out of
+ * cpu_online_map, disable its local APIC and halt forever. Never returns.
+ * @dummy is not used.
+ */
+static void stop_this_cpu(void *dummy)
+{
+       local_irq_disable();
+       /*
+        * Remove this CPU:
+        */
+       cpu_clear(smp_processor_id(), cpu_online_map);
+       disable_local_APIC();
+       /* loop in case halt() ever returns */
+       for (;;) 
+               halt();
+} 
+
+/*
+ * smp_send_stop - stop all other CPUs and disable the local APIC.
+ *
+ * Does nothing when reboot_force is set. The stop request is sent without
+ * waiting for the targets to finish (wait == 0), since stop_this_cpu()
+ * never returns.
+ */
+void smp_send_stop(void)
+{
+       int nolock;
+       unsigned long flags;
+
+       if (reboot_force)
+               return;
+
+       /*
+        * Don't deadlock on the call lock in panic: if it cannot be taken,
+        * carry on without it and remember not to release it.
+        */
+       nolock = !spin_trylock(&call_lock);
+       local_irq_save(flags);
+       __smp_call_function(stop_this_cpu, NULL, 0, 0);
+       if (!nolock)
+               spin_unlock(&call_lock);
+       disable_local_APIC();
+       local_irq_restore(flags);
+}
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+asmlinkage void smp_reschedule_interrupt(void)
+{
+       /* only the APIC acknowledge is needed here */
+       ack_APIC_irq();
+}
+
+/*
+ * Handler for CALL_FUNCTION_VECTOR: run the function published through
+ * call_data by __smp_call_function() / __smp_call_function_single().
+ *
+ * The request is copied into locals first because the initiator's stack
+ * frame is only guaranteed to stay around until "started" has been
+ * incremented, unless it asked to wait.
+ */
+asmlinkage void smp_call_function_interrupt(void)
+{
+       void (*func) (void *info) = call_data->func;
+       void *info = call_data->info;
+       int wait = call_data->wait;
+
+       ack_APIC_irq();
+       /*
+        * Notify initiating CPU that I've grabbed the data and am
+        * about to execute the function
+        */
+       mb();
+       atomic_inc(&call_data->started);
+       /*
+        * At this point the info structure may be out of scope unless wait==1
+        */
+       exit_idle();
+       irq_enter();
+       (*func)(info);
+       irq_exit();
+       if (wait) {
+               /*
+                * The initiator is still spinning on "finished", so
+                * call_data is still valid here.
+                */
+               mb();
+               atomic_inc(&call_data->finished);
+       }
+}
+
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
new file mode 100644 (file)
index 0000000..32f5078
--- /dev/null
@@ -0,0 +1,1085 @@
+/*
+ *     x86 SMP booting functions
+ *
+ *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *     Copyright 2001 Andi Kleen, SuSE Labs.
+ *
+ *     Much of the core SMP work is based on previous work by Thomas Radke, to
+ *     whom a great many thanks are extended.
+ *
+ *     Thanks to Intel for making available several different Pentium,
+ *     Pentium Pro and Pentium-II/Xeon MP machines.
+ *     Original development of Linux SMP code supported by Caldera.
+ *
+ *     This code is released under the GNU General Public License version 2
+ *
+ *     Fixes
+ *             Felix Koop      :       NR_CPUS used properly
+ *             Jose Renau      :       Handle single CPU case.
+ *             Alan Cox        :       By repeated request 8) - Total BogoMIP report.
+ *             Greg Wright     :       Fix for kernel stacks panic.
+ *             Erich Boleyn    :       MP v1.4 and additional changes.
+ *     Matthias Sattler        :       Changes for 2.1 kernel map.
+ *     Michel Lespinasse       :       Changes for 2.1 kernel map.
+ *     Michael Chastain        :       Change trampoline.S to gnu as.
+ *             Alan Cox        :       Dumb bug: 'B' step PPro's are fine
+ *             Ingo Molnar     :       Added APIC timers, based on code
+ *                                     from Jose Renau
+ *             Ingo Molnar     :       various cleanups and rewrites
+ *             Tigran Aivazian :       fixed "0.00 in /proc/uptime on SMP" bug.
+ *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs
+ *     Andi Kleen              :       Changed for SMP boot into long mode.
+ *             Rusty Russell   :       Hacked into shape for new "hotplug" boot process.
+ *      Andi Kleen              :       Converted to new state machine.
+ *                                     Various cleanups.
+ *                                     Probably mostly hotplug CPU ready now.
+ *     Ashok Raj                       : CPU hotplug support
+ */
+
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/kernel_stat.h>
+#include <linux/bootmem.h>
+#include <linux/thread_info.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h>
+#include <linux/smp.h>
+#include <linux/kdebug.h>
+
+#include <asm/mtrr.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm/nmi.h>
+#include <asm/irq.h>
+#include <asm/hw_irq.h>
+#include <asm/numa.h>
+
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+EXPORT_SYMBOL(smp_num_siblings);
+
+/*
+ * Last level cache ID of each logical CPU; BAD_APICID means the ID is not
+ * known, and such CPUs are not matched by ID in set_cpu_sibling_map().
+ */
+u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};
+
+/* Bitmask of currently online CPUs */
+cpumask_t cpu_online_map __read_mostly;
+
+EXPORT_SYMBOL(cpu_online_map);
+
+/*
+ * Private maps to synchronize booting between AP and BP.
+ * Probably not needed anymore, but it makes for easier debugging. -AK
+ *
+ * An AP waits for its bit in cpu_callout_map and sets its bit in
+ * cpu_callin_map at the end of smp_callin().
+ */
+cpumask_t cpu_callin_map;
+cpumask_t cpu_callout_map;
+EXPORT_SYMBOL(cpu_callout_map);
+
+/* Bitmask of CPUs that may ever be brought up */
+cpumask_t cpu_possible_map;
+EXPORT_SYMBOL(cpu_possible_map);
+
+/* Per CPU bogomips and other parameters */
+struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+EXPORT_SYMBOL(cpu_data);
+
+/* Set when the idlers are all forked */
+int smp_threads_ready;
+
+/* representing HT siblings of each logical CPU */
+cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_sibling_map);
+
+/* representing HT and core siblings of each logical CPU */
+cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_core_map);
+
+/*
+ * Trampoline 80x86 program as an array. trampoline_data and
+ * trampoline_end delimit the code that setup_trampoline() copies.
+ */
+
+extern unsigned char trampoline_data[];
+extern unsigned char trampoline_end[];
+
+/* State of each CPU; start_secondary() sets it to CPU_ONLINE */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+/*
+ * Store all idle threads, this can be reused instead of creating
+ * a new thread. Also avoids complicated thread destroy functionality
+ * for idle threads.
+ */
+struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+
+/* Accessors for idle_thread_array, indexed by logical CPU number */
+#define get_idle_for_cpu(x)     (idle_thread_array[(x)])
+#define set_idle_for_cpu(x,p)   (idle_thread_array[(x)] = (p))
+
+/*
+ * Copy the real-mode bootstrap (trampoline_data .. trampoline_end) to the
+ * page at SMP_TRAMPOLINE_BASE and return that page's physical address.
+ * The caller has made sure the page is suitably aligned.
+ */
+
+static unsigned long __cpuinit setup_trampoline(void)
+{
+       void *dest = __va(SMP_TRAMPOLINE_BASE);
+       size_t len = trampoline_end - trampoline_data;
+
+       memcpy(dest, trampoline_data, len);
+
+       return virt_to_phys(dest);
+}
+
+/*
+ * Fill in cpu_data[id]: start from a copy of boot_cpu_data (set up by the
+ * bootstrap kernel entry code), then identify the CPU and print what was
+ * found.
+ */
+
+static void __cpuinit smp_store_cpu_info(int id)
+{
+       struct cpuinfo_x86 *info = &cpu_data[id];
+
+       *info = boot_cpu_data;
+
+       identify_cpu(info);
+       print_cpu_info(info);
+}
+
+/* Set to 1 by wakeup_secondary_via_INIT() once INIT has been deasserted */
+static atomic_t init_deasserted __cpuinitdata;
+
+/*
+ * Report back to the Boot Processor.
+ * Running on AP.
+ *
+ * Waits for the INIT deassert and for this CPU's bit in cpu_callout_map,
+ * sets up the local APIC, calibrates the delay loop, records the CPU
+ * information and finally sets the bit in cpu_callin_map. Panics if the
+ * CPU has already called in or if no callout arrives in time.
+ */
+void __cpuinit smp_callin(void)
+{
+       int cpuid, phys_id;
+       unsigned long timeout;
+
+       /*
+        * If woken up by an INIT in an 82489DX configuration
+        * we may get here before an INIT-deassert IPI reaches
+        * our local APIC.  We have to wait for the IPI or we'll
+        * lock up on an APIC access.
+        */
+       while (!atomic_read(&init_deasserted))
+               cpu_relax();
+
+       /*
+        * (This works even if the APIC is not enabled.)
+        */
+       phys_id = GET_APIC_ID(apic_read(APIC_ID));
+       cpuid = smp_processor_id();
+       if (cpu_isset(cpuid, cpu_callin_map)) {
+               panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
+                                       phys_id, cpuid);
+       }
+       Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
+
+       /*
+        * STARTUP IPIs are fragile beasts as they might sometimes
+        * trigger some glue motherboard logic. Complete APIC bus
+        * silence for 1 second, this overestimates the time the
+        * boot CPU is spending to send the up to 2 STARTUP IPIs
+        * by a factor of two. This should be enough.
+        */
+
+       /*
+        * Waiting 2s total for startup (udelay is not yet working)
+        */
+       timeout = jiffies + 2*HZ;
+       while (time_before(jiffies, timeout)) {
+               /*
+                * Has the boot CPU finished its STARTUP sequence?
+                */
+               if (cpu_isset(cpuid, cpu_callout_map))
+                       break;
+               cpu_relax();
+       }
+
+       /*
+        * NOTE(review): this re-checks the clock rather than the callout
+        * bit, so a callout seen just as the timeout expires still panics.
+        */
+       if (!time_before(jiffies, timeout)) {
+               panic("smp_callin: CPU%d started up but did not get a callout!\n",
+                       cpuid);
+       }
+
+       /*
+        * the boot CPU has finished the init stage and is spinning
+        * on callin_map until we finish. We are free to set up this
+        * CPU, first the APIC. (this is probably redundant on most
+        * boards)
+        */
+
+       Dprintk("CALLIN, before setup_local_APIC().\n");
+       setup_local_APIC();
+
+       /*
+        * Get our bogomips.
+        *
+        * Need to enable IRQs because it can take longer and then
+        * the NMI watchdog might kill us.
+        */
+       local_irq_enable();
+       calibrate_delay();
+       local_irq_disable();
+       Dprintk("Stack at about %p\n",&cpuid);
+
+       disable_APIC_timer();
+
+       /*
+        * Save our processor parameters
+        */
+       smp_store_cpu_info(cpuid);
+
+       /*
+        * Allow the master to continue.
+        */
+       cpu_set(cpuid, cpu_callin_map);
+}
+
+/*
+ * Maps the cpu to the sched domain representing multi-core.
+ *
+ * With either power-savings knob set this is cpu_core_map[cpu]; otherwise
+ * (for performance) it is the set of CPUs sharing the last level cache.
+ */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+       if (sched_mc_power_savings || sched_smt_power_savings)
+               return cpu_core_map[cpu];
+
+       return cpu_data[cpu].llc_shared_map;
+}
+
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
+
+/*
+ * set_cpu_sibling_map - link @cpu into the sibling, core and LLC maps.
+ * @cpu: logical CPU being brought up.
+ *
+ * Compares @cpu against every CPU already in cpu_sibling_setup_map
+ * (itself included) and updates cpu_sibling_map[], cpu_core_map[],
+ * llc_shared_map and the booted_cores counters on both sides.
+ */
+static inline void set_cpu_sibling_map(int cpu)
+{
+       int i;
+       /* base of the per-CPU info array, indexed by logical CPU below */
+       struct cpuinfo_x86 *c = cpu_data;
+
+       cpu_set(cpu, cpu_sibling_setup_map);
+
+       if (smp_num_siblings > 1) {
+               /* same package and same core: thread siblings */
+               for_each_cpu_mask(i, cpu_sibling_setup_map) {
+                       if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+                           c[cpu].cpu_core_id == c[i].cpu_core_id) {
+                               cpu_set(i, cpu_sibling_map[cpu]);
+                               cpu_set(cpu, cpu_sibling_map[i]);
+                               cpu_set(i, cpu_core_map[cpu]);
+                               cpu_set(cpu, cpu_core_map[i]);
+                               cpu_set(i, c[cpu].llc_shared_map);
+                               cpu_set(cpu, c[i].llc_shared_map);
+                       }
+               }
+       } else {
+               /* no sibling threads: the CPU is its own only sibling */
+               cpu_set(cpu, cpu_sibling_map[cpu]);
+       }
+
+       cpu_set(cpu, c[cpu].llc_shared_map);
+
+       /* single-core packages: the core map equals the sibling map */
+       if (current_cpu_data.x86_max_cores == 1) {
+               cpu_core_map[cpu] = cpu_sibling_map[cpu];
+               c[cpu].booted_cores = 1;
+               return;
+       }
+
+       for_each_cpu_mask(i, cpu_sibling_setup_map) {
+               /* share an LLC only when the ID is known and matches */
+               if (cpu_llc_id[cpu] != BAD_APICID &&
+                   cpu_llc_id[cpu] == cpu_llc_id[i]) {
+                       cpu_set(i, c[cpu].llc_shared_map);
+                       cpu_set(cpu, c[i].llc_shared_map);
+               }
+               if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
+                       cpu_set(i, cpu_core_map[cpu]);
+                       cpu_set(cpu, cpu_core_map[i]);
+                       /*
+                        *  Does this new cpu bringup a new core?
+                        */
+                       if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
+                               /*
+                                * for each core in package, increment
+                                * the booted_cores for this new cpu
+                                */
+                               if (first_cpu(cpu_sibling_map[i]) == i)
+                                       c[cpu].booted_cores++;
+                               /*
+                                * increment the core count for all
+                                * the other cpus in this package
+                                */
+                               if (i != cpu)
+                                       c[i].booted_cores++;
+                       } else if (i != cpu && !c[cpu].booted_cores)
+                               /* new thread of a known core: copy its count */
+                               c[cpu].booted_cores = c[i].booted_cores;
+               }
+       }
+}
+
+/*
+ * Setup code on secondary processor (after coming out of the trampoline)
+ *
+ * Calls in to the boot processor, checks TSC sync, sets up the APIC
+ * timer, computes the sibling maps, marks the CPU online under call_lock
+ * and vector_lock, and then enters cpu_idle().
+ */
+void __cpuinit start_secondary(void)
+{
+       /*
+        * Don't put anything before smp_callin(), SMP
+        * booting is too fragile that we want to limit the
+        * things done here to the most necessary things.
+        */
+       cpu_init();
+       preempt_disable();
+       smp_callin();
+
+       /* otherwise gcc will move up the smp_processor_id before the cpu_init */
+       barrier();
+
+       /*
+        * Check TSC sync first:
+        */
+       check_tsc_sync_target();
+
+       Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());         
+       setup_secondary_APIC_clock();
+
+       Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
+
+       /* keep 8259A IRQ 0 masked while LVT0 is switched over to NMI */
+       if (nmi_watchdog == NMI_IO_APIC) {
+               disable_8259A_irq(0);
+               enable_NMI_through_LVT0(NULL);
+               enable_8259A_irq(0);
+       }
+
+       enable_APIC_timer();
+
+       /*
+        * The sibling maps must be set before turning the online map on for
+        * this cpu
+        */
+       set_cpu_sibling_map(smp_processor_id());
+
+       /*
+        * We need to hold call_lock, so there is no inconsistency
+        * between the time smp_call_function() determines number of
+        * IPI recipients, and the time when the determination is made
+        * for which cpus receive the IPI in genapic_flat.c. Holding this
+        * lock helps us to not include this cpu in a currently in progress
+        * smp_call_function().
+        */
+       lock_ipi_call_lock();
+       spin_lock(&vector_lock);
+
+       /* Setup the per cpu irq handling data structures */
+       __setup_vector_irq(smp_processor_id());
+       /*
+        * Allow the master to continue.
+        */
+       cpu_set(smp_processor_id(), cpu_online_map);
+       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+       spin_unlock(&vector_lock);
+
+       unlock_ipi_call_lock();
+
+       cpu_idle();
+}
+
+/*
+ * Defined elsewhere.
+ * NOTE(review): presumably the stack pointer and entry function handed to
+ * a secondary CPU by the startup code -- confirm against head_64.S.
+ */
+extern volatile unsigned long init_rsp;
+extern void (*initial_code)(void);
+
+#ifdef APIC_DEBUG
+/*
+ * Debug helper: issue a remote read for the ID, VERSION and SPIV
+ * registers of the APIC @apicid and print each result, or "failed" when
+ * the remote read does not complete with a valid status.
+ */
+static void inquire_remote_apic(int apicid)
+{
+       unsigned int regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
+       char *names[] = { "ID", "VERSION", "SPIV" };
+       unsigned int idx, status;
+       int polls;
+
+       printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
+
+       for (idx = 0; idx < ARRAY_SIZE(regs); idx++) {
+               printk("... APIC #%d %s: ", apicid, names[idx]);
+
+               /* Wait for idle. */
+               status = safe_apic_wait_icr_idle();
+               if (status)
+                       printk("a previous APIC delivery may have failed\n");
+
+               apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
+               apic_write(APIC_ICR, APIC_DM_REMRD | regs[idx]);
+
+               /* Poll in 100us steps while the remote read is in progress. */
+               polls = 0;
+               do {
+                       udelay(100);
+                       status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
+               } while (status == APIC_ICR_RR_INPROG && polls++ < 1000);
+
+               if (status == APIC_ICR_RR_VALID) {
+                       status = apic_read(APIC_RRR);
+                       printk("%08x\n", status);
+               } else {
+                       printk("failed\n");
+               }
+       }
+}
+#endif
+
+/*
+ * Kick the secondary to wake up.
+ *
+ * @phys_apicid: physical APIC ID of the CPU to start.
+ * @start_rip: physical address of the startup code; its page number
+ *             (start_rip >> 12) goes into the STARTUP IPI.
+ *
+ * Asserts and deasserts INIT, then sends up to two STARTUP IPIs.
+ * Returns 0 on success, otherwise the OR of the last send status and the
+ * accept status read from the APIC error register.
+ */
+static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
+{
+       unsigned long send_status, accept_status = 0;
+       int maxlvt, num_starts, j;
+
+       Dprintk("Asserting INIT.\n");
+
+       /*
+        * Turn INIT on target chip
+        */
+       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+       /*
+        * Send IPI
+        */
+       apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
+                               | APIC_DM_INIT);
+
+       Dprintk("Waiting for send to finish...\n");
+       send_status = safe_apic_wait_icr_idle();
+
+       mdelay(10);
+
+       Dprintk("Deasserting INIT.\n");
+
+       /* Target chip */
+       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+       /* Send IPI */
+       apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+
+       Dprintk("Waiting for send to finish...\n");
+       send_status = safe_apic_wait_icr_idle();
+
+       /* releases the AP spinning on init_deasserted in smp_callin() */
+       mb();
+       atomic_set(&init_deasserted, 1);
+
+       num_starts = 2;
+
+       /*
+        * Run STARTUP IPI loop.
+        */
+       Dprintk("#startup loops: %d.\n", num_starts);
+
+       maxlvt = get_maxlvt();
+
+       for (j = 1; j <= num_starts; j++) {
+               Dprintk("Sending STARTUP #%d.\n",j);
+               apic_write(APIC_ESR, 0);
+               apic_read(APIC_ESR);
+               Dprintk("After apic_write.\n");
+
+               /*
+                * STARTUP IPI
+                */
+
+               /* Target chip */
+               apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+               /* Boot on the stack */
+               /* Kick the second */
+               apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12));
+
+               /*
+                * Give the other CPU some time to accept the IPI.
+                */
+               udelay(300);
+
+               Dprintk("Startup point 1.\n");
+
+               Dprintk("Waiting for send to finish...\n");
+               send_status = safe_apic_wait_icr_idle();
+
+               /*
+                * Give the other CPU some time to accept the IPI.
+                */
+               udelay(200);
+               /*
+                * Due to the Pentium erratum 3AP.
+                */
+               if (maxlvt > 3) {
+                       apic_write(APIC_ESR, 0);
+               }
+               accept_status = (apic_read(APIC_ESR) & 0xEF);
+               /* any error ends the loop early; it is reported below */
+               if (send_status || accept_status)
+                       break;
+       }
+       Dprintk("After Startup.\n");
+
+       if (send_status)
+               printk(KERN_ERR "APIC never delivered???\n");
+       if (accept_status)
+               printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
+
+       return (send_status | accept_status);
+}
+
+/*
+ * Request block handed to do_fork_idle(), either directly or through a
+ * work item, to create the idle task for one CPU.
+ */
+struct create_idle {
+       struct work_struct work;        /* work item embedded in the request */
+       struct task_struct *idle;       /* result of fork_idle(); callers check it with IS_ERR() */
+       struct completion done;         /* completed by do_fork_idle() once @idle is set */
+       int cpu;                        /* CPU the idle task is forked for */
+};
+
+/*
+ * Work handler: fork the idle task for the CPU named in the enclosing
+ * create_idle request, store it in the request and wake the waiter.
+ */
+void do_fork_idle(struct work_struct *work)
+{
+       struct create_idle *req;
+
+       req = container_of(work, struct create_idle, work);
+       req->idle = fork_idle(req->cpu);
+       complete(&req->done);
+}
+
+/*
+ * Boot one CPU.
+ *
+ * Prepares the per-CPU state for logical CPU @cpu (GDT page, node-local
+ * PDA, idle task), points the warm reset vector at the trampoline and
+ * kicks the processor with APIC id @apicid through
+ * wakeup_secondary_via_INIT().  It then polls cpu_callin_map for up to
+ * 5 seconds waiting for the new CPU to report in.
+ *
+ * Returns 0 on success, -1 if the GDT page cannot be allocated, the
+ * PTR_ERR() of a failed fork_idle(), or -EIO if the CPU did not come up.
+ * On -EIO the CPU is also removed from the callout, present and possible
+ * maps and its APIC id mappings are invalidated.
+ */
+static int __cpuinit do_boot_cpu(int cpu, int apicid)
+{
+       unsigned long boot_error;
+       int timeout;
+       unsigned long start_rip;
+       struct create_idle c_idle = {
+               .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
+               .cpu = cpu,
+               .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
+       };
+
+       /* allocate memory for gdts of secondary cpus. Hotplug is considered */
+       if (!cpu_gdt_descr[cpu].address &&
+               !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
+               printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
+               return -1;
+       }
+
+       /* Allocate node local memory for AP pdas */
+       if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
+               struct x8664_pda *newpda, *pda;
+               int node = cpu_to_node(cpu);
+               pda = cpu_pda(cpu);
+               newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
+                                     node);
+               if (newpda) {
+                       memcpy(newpda, pda, sizeof (struct x8664_pda));
+                       cpu_pda(cpu) = newpda;
+               } else
+                       /* Not fatal: the CPU keeps using its boot-time PDA */
+                       printk(KERN_ERR
+               "Could not allocate node local PDA for CPU %d on node %d\n",
+                               cpu, node);
+       }
+
+       alternatives_smp_switch(1);
+
+       c_idle.idle = get_idle_for_cpu(cpu);
+
+       /* An idle task already exists for this CPU: reset its stack and reuse it */
+       if (c_idle.idle) {
+               c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
+                       (THREAD_SIZE +  task_stack_page(c_idle.idle))) - 1);
+               init_idle(c_idle.idle, cpu);
+               goto do_rest;
+       }
+
+       /*
+        * During cold boot process, keventd thread is not spun up yet.
+        * When we do cpu hot-add, we create idle threads on the fly, we should
+        * not acquire any attributes from the calling context. Hence the clean
+        * way to create kernel_threads() is to do that from keventd().
+        * We do the current_is_keventd() due to the fact that ACPI notifier
+        * was also queuing to keventd() and when the caller is already running
+        * in context of keventd(), we would end up with locking up the keventd
+        * thread.
+        */
+       if (!keventd_up() || current_is_keventd())
+               c_idle.work.func(&c_idle.work);
+       else {
+               schedule_work(&c_idle.work);
+               wait_for_completion(&c_idle.done);
+       }
+
+       if (IS_ERR(c_idle.idle)) {
+               printk("failed fork for CPU %d\n", cpu);
+               return PTR_ERR(c_idle.idle);
+       }
+
+       set_idle_for_cpu(cpu, c_idle.idle);
+
+do_rest:
+
+       cpu_pda(cpu)->pcurrent = c_idle.idle;
+
+       start_rip = setup_trampoline();
+
+       /* Hand the idle task's stack and the C entry point to the new CPU */
+       init_rsp = c_idle.idle->thread.rsp;
+       per_cpu(init_tss,cpu).rsp0 = init_rsp;
+       initial_code = start_secondary;
+       clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+
+       printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
+               cpus_weight(cpu_present_map),
+               apicid);
+
+       /*
+        * This grunge runs the startup process for
+        * the targeted processor.
+        */
+
+       atomic_set(&init_deasserted, 0);
+
+       Dprintk("Setting warm reset code and vector.\n");
+
+       CMOS_WRITE(0xa, 0xf);
+       local_flush_tlb();
+       Dprintk("1.\n");
+       /* Warm reset vector: 16-bit value at 0x469 gets start_rip >> 4 ... */
+       *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
+       Dprintk("2.\n");
+       /* ... and the 16-bit value at 0x467 gets the low four bits */
+       *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
+       Dprintk("3.\n");
+
+       /*
+        * Be paranoid about clearing APIC errors.
+        */
+       apic_write(APIC_ESR, 0);
+       apic_read(APIC_ESR);
+
+       /*
+        * Status is now clean
+        */
+       boot_error = 0;
+
+       /*
+        * Starting actual IPI sequence...
+        */
+       boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
+
+       if (!boot_error) {
+               /*
+                * allow APs to start initializing.
+                */
+               Dprintk("Before Callout %d.\n", cpu);
+               cpu_set(cpu, cpu_callout_map);
+               Dprintk("After Callout %d.\n", cpu);
+
+               /*
+                * Wait 5s total for a response
+                * (50000 polls, 100us apart)
+                */
+               for (timeout = 0; timeout < 50000; timeout++) {
+                       if (cpu_isset(cpu, cpu_callin_map))
+                               break;  /* It has booted */
+                       udelay(100);
+               }
+
+               if (cpu_isset(cpu, cpu_callin_map)) {
+                       /* number CPUs logically, starting from 1 (BSP is 0) */
+                       Dprintk("CPU has booted.\n");
+               } else {
+                       boot_error = 1;
+                       /* First trampoline byte is used as a "code was entered" marker */
+                       if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
+                                       == 0xA5)
+                               /* trampoline started but...? */
+                               printk("Stuck ??\n");
+                       else
+                               /* trampoline code not run */
+                               printk("Not responding.\n");
+#ifdef APIC_DEBUG
+                       inquire_remote_apic(apicid);
+#endif
+               }
+       }
+       /* Failure: undo every piece of bookkeeping that names this CPU */
+       if (boot_error) {
+               cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+               clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
+               clear_node_cpumask(cpu); /* was set by numa_add_cpu */
+               cpu_clear(cpu, cpu_present_map);
+               cpu_clear(cpu, cpu_possible_map);
+               x86_cpu_to_apicid[cpu] = BAD_APICID;
+               x86_cpu_to_log_apicid[cpu] = BAD_APICID;
+               return -EIO;
+       }
+
+       return 0;
+}
+
+/*
+ * NOTE(review): neither variable is read or written in the visible part
+ * of this file; presumably they are kept for users elsewhere - confirm
+ * before removing.
+ */
+cycles_t cacheflush_time;
+unsigned long cache_decay_ticks;
+
+/*
+ * Cleanup possible dangling ends...
+ *
+ * Undoes the warm reset setup performed by do_boot_cpu(): CMOS register
+ * 0xf is written back to 0 and the vector words are cleared.
+ */
+static __cpuinit void smp_cleanup_boot(void)
+{
+       /*
+        * Paranoid:  Set warm reset code and vector here back
+        * to default values.
+        */
+       CMOS_WRITE(0, 0xf);
+
+       /*
+        * Reset trampoline flag
+        *
+        * A single int-sized store at 0x467 covers both 16-bit words
+        * (0x467 and 0x469) that do_boot_cpu() filled in.
+        */
+       *((volatile int *) phys_to_virt(0x467)) = 0;
+}
+
+/*
+ * Fall back to non SMP mode after errors: only CPU 0 stays present and
+ * possible, and only one physical APIC id stays in phys_cpu_present_map
+ * (the boot CPU's id when an SMP configuration was found, else id 0).
+ *
+ * RED-PEN audit/test this more. I bet there is more state messed up here.
+ */
+static __init void disable_smp(void)
+{
+       unsigned int boot_physid = 0;
+
+       if (smp_found_config)
+               boot_physid = boot_cpu_id;
+
+       cpu_present_map = cpumask_of_cpu(0);
+       cpu_possible_map = cpumask_of_cpu(0);
+       phys_cpu_present_map = physid_mask_of_physid(boot_physid);
+
+       /* CPU 0 is its own (only) thread sibling and core sibling */
+       cpu_set(0, cpu_sibling_map[0]);
+       cpu_set(0, cpu_core_map[0]);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+int additional_cpus __initdata = -1;
+
+/*
+ * cpu_possible_map should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_map on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - The user can overwrite it with additional_cpus=NUM
+ * - Otherwise don't reserve additional CPUs.
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
+ */
+__init void prefill_possible_map(void)
+{
+       int cpu, limit;
+
+       /* additional_cpus still at its -1 default: use the disabled-CPU count */
+       if (additional_cpus == -1)
+               additional_cpus = (disabled_cpus > 0) ? disabled_cpus : 0;
+
+       /* Detected processors plus hotplug reserve, capped at NR_CPUS */
+       limit = num_processors + additional_cpus;
+       if (limit > NR_CPUS)
+               limit = NR_CPUS;
+
+       printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+               limit,
+               max_t(int, limit - num_processors, 0));
+
+       cpu = 0;
+       while (cpu < limit) {
+               cpu_set(cpu, cpu_possible_map);
+               cpu++;
+       }
+}
+#endif
+
+/*
+ * Various sanity checks.
+ *
+ * Decides whether SMP bring-up may proceed with at most @max_cpus CPUs.
+ * Returns 0 if it may, -1 if the system has to run uniprocessor: no SMP
+ * configuration found, no local APIC, or @max_cpus == 0.  The last two
+ * cases also set nr_ioapics to 0.
+ */
+static int __init smp_sanity_check(unsigned max_cpus)
+{
+       /* Make sure the CPU we are running on is in the physical present map */
+       if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
+               printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
+                      hard_smp_processor_id());
+               physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+       }
+
+       /*
+        * If we couldn't find an SMP configuration at boot time,
+        * get out of here now!
+        */
+       if (!smp_found_config) {
+               printk(KERN_NOTICE "SMP motherboard not detected.\n");
+               disable_smp();
+               if (APIC_init_uniprocessor())
+                       printk(KERN_NOTICE "Local APIC not detected."
+                                          " Using dummy APIC emulation.\n");
+               return -1;
+       }
+
+       /*
+        * Should not be necessary because the MP table should list the boot
+        * CPU too, but we do it for the sake of robustness anyway.
+        *
+        * NOTE(review): the test looks at boot_cpu_id but the bit that gets
+        * set is hard_smp_processor_id(); confirm the two are always equal
+        * at this point.
+        */
+       if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
+               printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
+                                                                boot_cpu_id);
+               physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+       }
+
+       /*
+        * If we couldn't find a local APIC, then get out of here now!
+        */
+       if (!cpu_has_apic) {
+               printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+                       boot_cpu_id);
+               printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
+               nr_ioapics = 0;
+               return -1;
+       }
+
+       /*
+        * If SMP should be disabled, then really disable it!
+        */
+       if (!max_cpus) {
+               printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
+               nr_ioapics = 0;
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Prepare for SMP bootup.  The MP table or ACPI has been read
+ * earlier.  Just do some sanity checking here and enable APIC mode.
+ *
+ * @max_cpus is only passed on to smp_sanity_check().  If that check
+ * fails, SMP is disabled and the function returns without touching the
+ * APICs.  A mismatch between the local APIC's id and boot_cpu_id is
+ * fatal (panic).
+ */
+void __init smp_prepare_cpus(unsigned int max_cpus)
+{
+       nmi_watchdog_default();
+       current_cpu_data = boot_cpu_data;
+       current_thread_info()->cpu = 0;  /* needed? */
+       set_cpu_sibling_map(0);
+
+       if (smp_sanity_check(max_cpus) < 0) {
+               printk(KERN_INFO "SMP disabled\n");
+               disable_smp();
+               return;
+       }
+
+
+       /*
+        * Switch from PIC to APIC mode.
+        */
+       setup_local_APIC();
+
+       if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
+               panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
+                     GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
+               /* Or can we switch back to PIC here? */
+       }
+
+       /*
+        * Now start the IO-APICs
+        * (or record that none are in use, so later code skips them)
+        */
+       if (!skip_ioapic_setup && nr_ioapics)
+               setup_IO_APIC();
+       else
+               nr_ioapics = 0;
+
+       /*
+        * Set up local APIC timer on boot CPU.
+        */
+
+       setup_boot_APIC_clock();
+}
+
+/*
+ * Early setup to make printk work: record the CPU we are running on as
+ * online, called out and in state CPU_ONLINE.
+ */
+void __init smp_prepare_boot_cpu(void)
+{
+       const int boot = smp_processor_id();
+
+       cpu_set(boot, cpu_online_map);
+       cpu_set(boot, cpu_callout_map);
+       per_cpu(cpu_state, boot) = CPU_ONLINE;
+}
+
+/*
+ * Entry point to boot a CPU.
+ *
+ * Validates the APIC id mapped to logical CPU @cpu, boots it through
+ * do_boot_cpu(), runs the TSC sync check against it and then spins until
+ * the CPU shows up in cpu_online_map.
+ *
+ * Returns 0 on success, -EINVAL if @cpu has no usable APIC id (or maps to
+ * the boot CPU), -ENOSYS if the CPU is already in cpu_callin_map, or the
+ * negative error from do_boot_cpu().
+ */
+int __cpuinit __cpu_up(unsigned int cpu)
+{
+       int apicid = cpu_present_to_apicid(cpu);
+       unsigned long flags;
+       int err;
+
+       WARN_ON(irqs_disabled());
+
+       Dprintk("++++++++++++++++++++=_---CPU UP  %u\n", cpu);
+
+       if (apicid == BAD_APICID || apicid == boot_cpu_id ||
+           !physid_isset(apicid, phys_cpu_present_map)) {
+               printk("__cpu_up: bad cpu %d\n", cpu);
+               return -EINVAL;
+       }
+
+       /*
+        * Already booted CPU?
+        */
+       if (cpu_isset(cpu, cpu_callin_map)) {
+               Dprintk("do_boot_cpu %d Already started\n", cpu);
+               return -ENOSYS;
+       }
+
+       /*
+        * Save current MTRR state in case it was changed since early boot
+        * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+        */
+       mtrr_save_state();
+
+       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+       /* Boot it! */
+       err = do_boot_cpu(cpu, apicid);
+       if (err < 0) {
+               Dprintk("do_boot_cpu failed %d\n", err);
+               return err;
+       }
+
+       /* Unleash the CPU! */
+       Dprintk("waiting for cpu %d\n", cpu);
+
+       /*
+        * Make sure and check TSC sync:
+        * (done with local interrupts off)
+        */
+       local_irq_save(flags);
+       check_tsc_sync_source(cpu);
+       local_irq_restore(flags);
+
+       /* NOTE(review): no timeout - this spins forever if the CPU never goes online */
+       while (!cpu_isset(cpu, cpu_online_map))
+               cpu_relax();
+       err = 0;
+
+       return err;
+}
+
+/*
+ * Finish the SMP boot.
+ *
+ * Resets the warm reset vector (smp_cleanup_boot), then calls
+ * setup_ioapic_dest() and check_nmi_watchdog().  @max_cpus is unused.
+ */
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+       smp_cleanup_boot();
+       setup_ioapic_dest();
+       check_nmi_watchdog();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Remove @cpu from the topology bookkeeping: drop it from the core map
+ * and thread-sibling map of every CPU that lists it, empty its own two
+ * maps, reset its package/core ids and clear its bit in
+ * cpu_sibling_setup_map.
+ */
+static void remove_siblinginfo(int cpu)
+{
+       int sibling;
+       struct cpuinfo_x86 *c = cpu_data;
+
+       for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+               cpu_clear(cpu, cpu_core_map[sibling]);
+               /*
+                * last thread sibling in this cpu core going down
+                */
+               if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+                       c[sibling].booted_cores--;
+       }
+                       
+       /* cpu_sibling_map[cpu] is only emptied after it has been walked */
+       for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+               cpu_clear(cpu, cpu_sibling_map[sibling]);
+       cpus_clear(cpu_sibling_map[cpu]);
+       cpus_clear(cpu_core_map[cpu]);
+       c[cpu].phys_proc_id = 0;
+       c[cpu].cpu_core_id = 0;
+       cpu_clear(cpu, cpu_sibling_setup_map);
+}
+
+/*
+ * Drop the CPU this runs on from the callout/callin maps, the
+ * cpu_initialized bitmap and its node's cpumask.
+ */
+void remove_cpu_from_maps(void)
+{
+       const int me = smp_processor_id();
+
+       cpu_clear(me, cpu_callout_map);
+       cpu_clear(me, cpu_callin_map);
+       clear_bit(me, &cpu_initialized); /* was set by cpu_init() */
+       clear_node_cpumask(me);
+}
+
+/*
+ * Take the CPU this runs on out of service (CPU hotplug).
+ *
+ * Returns -EBUSY for CPU 0, which is never taken down, and 0 otherwise.
+ * On success the local APIC has been cleared, the CPU is gone from the
+ * sibling, online, callout and callin maps, and fixup_irqs() has been
+ * called with the remaining online map.
+ */
+int __cpu_disable(void)
+{
+       int cpu = smp_processor_id();
+
+       /*
+        * Perhaps use cpufreq to drop frequency, but that could go
+        * into generic code.
+        *
+        * We won't take down the boot processor on i386 due to some
+        * interrupts only being able to be serviced by the BSP.
+        * Especially so if we're not using an IOAPIC   -zwane
+        */
+       if (cpu == 0)
+               return -EBUSY;
+
+       if (nmi_watchdog == NMI_LOCAL_APIC)
+               stop_apic_nmi_watchdog(NULL);
+       clear_local_APIC();
+
+       /*
+        * HACK:
+        * Allow any queued timer interrupts to get serviced
+        * This is only a temporary solution until we cleanup
+        * fixup_irqs as we do for IA64.
+        */
+       local_irq_enable();
+       mdelay(1);
+
+       local_irq_disable();
+       remove_siblinginfo(cpu);
+
+       /* The online map is changed under vector_lock */
+       spin_lock(&vector_lock);
+       /* It's now safe to remove this processor from the online map */
+       cpu_clear(cpu, cpu_online_map);
+       spin_unlock(&vector_lock);
+       remove_cpu_from_maps();
+       fixup_irqs(cpu_online_map);
+       return 0;
+}
+
+/*
+ * Wait for @cpu to report CPU_DEAD in its cpu_state, checking up to ten
+ * times with a 100ms sleep after each miss.  When the CPU is dead and
+ * only one CPU remains online, the SMP alternatives are switched off.
+ * An error is logged if the CPU never reports dead.
+ */
+void __cpu_die(unsigned int cpu)
+{
+       unsigned int tries_left = 10;
+
+       while (tries_left--) {
+               /* They ack this in play_dead by setting CPU_DEAD */
+               if (per_cpu(cpu_state, cpu) != CPU_DEAD) {
+                       msleep(100);
+                       continue;
+               }
+
+               printk ("CPU %d is now offline\n", cpu);
+               if (num_online_cpus() == 1)
+                       alternatives_smp_switch(0);
+               return;
+       }
+       printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+/*
+ * Parse the value of the "additional_cpus" parameter into additional_cpus.
+ * Returns 0 when get_option() reports a parsed value, -EINVAL when @s is
+ * NULL or nothing could be parsed.
+ */
+static __init int setup_additional_cpus(char *s)
+{
+       if (!s || !get_option(&s, &additional_cpus))
+               return -EINVAL;
+       return 0;
+}
+early_param("additional_cpus", setup_additional_cpus);
+
+#else /* ... !CONFIG_HOTPLUG_CPU */
+
+/* Without CONFIG_HOTPLUG_CPU no CPU can be taken down: always -ENOSYS. */
+int __cpu_disable(void)
+{
+       return -ENOSYS;
+}
+
+/* Must never be reached without CONFIG_HOTPLUG_CPU; BUG()s if it is. */
+void __cpu_die(unsigned int cpu)
+{
+       /* We said "no" in __cpu_disable */
+       BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
new file mode 100644 (file)
index 0000000..cb91091
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * arch/x86_64/kernel/stacktrace.c
+ *
+ * Stack trace management functions
+ *
+ *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+#include <linux/module.h>
+#include <asm/stacktrace.h>
+
+/* stacktrace_ops .warning callback: warnings are deliberately ignored. */
+static void save_stack_warning(void *data, char *msg)
+{
+}
+
+/* stacktrace_ops .warning_symbol callback: deliberately ignored as well. */
+static void
+save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+}
+
+/*
+ * stacktrace_ops .stack callback: always returns -1.
+ * NOTE(review): presumably a negative value tells dump_trace() not to
+ * follow the named stack - confirm against dump_trace().
+ */
+static int save_stack_stack(void *data, char *name)
+{
+       return -1;
+}
+
+/*
+ * stacktrace_ops .address callback.  @data is the struct stack_trace
+ * being filled: the first trace->skip addresses are dropped, later ones
+ * are appended while there is room in trace->entries.
+ */
+static void save_stack_address(void *data, unsigned long addr)
+{
+       struct stack_trace *st = data;
+
+       if (st->skip > 0)
+               st->skip--;
+       else if (st->nr_entries < st->max_entries)
+               st->entries[st->nr_entries++] = addr;
+}
+
+/*
+ * Callback table passed to dump_trace() by save_stack_trace(): only the
+ * .address hook records anything, the others are no-ops.
+ */
+static struct stacktrace_ops save_stack_ops = {
+       .warning = save_stack_warning,
+       .warning_symbol = save_stack_warning_symbol,
+       .stack = save_stack_stack,
+       .address = save_stack_address,
+};
+
+/*
+ * Save stack-backtrace addresses of the current task into @trace and,
+ * if there is still room afterwards, terminate the list with ULONG_MAX.
+ */
+void save_stack_trace(struct stack_trace *trace)
+{
+       dump_trace(current, NULL, NULL, &save_stack_ops, trace);
+
+       /* Buffer already full: no room for the end marker */
+       if (trace->nr_entries >= trace->max_entries)
+               return;
+
+       trace->entries[trace->nr_entries] = ULONG_MAX;
+       trace->nr_entries++;
+}
+EXPORT_SYMBOL(save_stack_trace);
diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c
new file mode 100644 (file)
index 0000000..573c0a6
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * Suspend support specific for x86-64.
+ *
+ * Distribute under GPLv2
+ *
+ * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
+ */
+
+#include <linux/smp.h>
+#include <linux/suspend.h>
+#include <asm/proto.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/mtrr.h>
+
+/* References to section boundaries */
+extern const void __nosave_begin, __nosave_end;
+
+/* Processor state filled in by save_processor_state() */
+struct saved_context saved_context;
+
+/*
+ * General purpose registers and flags, stored and reloaded by the
+ * assembly code in suspend_asm_64.S (swsusp_arch_suspend/restore_image).
+ * The names keep the 32-bit register spelling but hold 64-bit values.
+ */
+unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
+unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
+unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
+unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
+unsigned long saved_context_eflags;
+
+/*
+ * Capture the state of the CPU this runs on into @ctxt: descriptor table
+ * registers, task register, segment selectors, the FS/GS base MSRs, EFER
+ * and CR0/CR2/CR3/CR4/CR8.  Also calls mtrr_save_fixed_ranges().
+ *
+ * kernel_fpu_begin() is called first and is not paired here; the matching
+ * kernel_fpu_end() is issued from __restore_processor_state().
+ */
+void __save_processor_state(struct saved_context *ctxt)
+{
+       kernel_fpu_begin();
+
+       /*
+        * descriptor tables
+        */
+       asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
+       asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
+       asm volatile ("str %0"  : "=m" (ctxt->tr));
+
+       /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
+       /*
+        * segment registers
+        */
+       asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
+       asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
+       asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
+       asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
+       asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
+
+       rdmsrl(MSR_FS_BASE, ctxt->fs_base);
+       rdmsrl(MSR_GS_BASE, ctxt->gs_base);
+       rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+       mtrr_save_fixed_ranges(NULL);
+
+       /*
+        * control registers 
+        */
+       rdmsrl(MSR_EFER, ctxt->efer);
+       ctxt->cr0 = read_cr0();
+       ctxt->cr2 = read_cr2();
+       ctxt->cr3 = read_cr3();
+       ctxt->cr4 = read_cr4();
+       ctxt->cr8 = read_cr8();
+}
+
+/* Save this CPU's state into the global saved_context. */
+void save_processor_state(void)
+{
+       __save_processor_state(&saved_context);
+}
+
+/* Closes the kernel_fpu_begin() section opened in __save_processor_state(). */
+static void do_fpu_end(void)
+{
+       /*
+        * Restore FPU regs if necessary
+        */
+       kernel_fpu_end();
+}
+
+/*
+ * Reload the CPU state captured by __save_processor_state() from @ctxt:
+ * EFER and the control registers first, then GDT/IDT, the segment
+ * selectors and the FS/GS base MSRs.  Finishes with
+ * fix_processor_context(), kernel_fpu_end() and mtrr_ap_init().
+ */
+void __restore_processor_state(struct saved_context *ctxt)
+{
+       /*
+        * control registers
+        */
+       wrmsrl(MSR_EFER, ctxt->efer);
+       write_cr8(ctxt->cr8);
+       write_cr4(ctxt->cr4);
+       write_cr3(ctxt->cr3);
+       write_cr2(ctxt->cr2);
+       write_cr0(ctxt->cr0);
+
+       /*
+        * now restore the descriptor tables to their proper values
+        * ltr is done in fix_processor_context().
+        */
+       asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
+       asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
+
+       /*
+        * segment registers
+        * (%gs goes through load_gs_index() instead of a plain movw)
+        */
+       asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
+       asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
+       asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
+       load_gs_index(ctxt->gs);
+       asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
+
+       /* Base MSRs are written after the selector loads above */
+       wrmsrl(MSR_FS_BASE, ctxt->fs_base);
+       wrmsrl(MSR_GS_BASE, ctxt->gs_base);
+       wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
+
+       fix_processor_context();
+
+       do_fpu_end();
+       mtrr_ap_init();
+}
+
+/* Restore this CPU's state from the global saved_context. */
+void restore_processor_state(void)
+{
+       __restore_processor_state(&saved_context);
+}
+
+/*
+ * Re-establish the per-CPU context that the plain register restore in
+ * __restore_processor_state() does not cover: TSS descriptor and task
+ * register, syscall MSRs, LDT and, if in use, the debug registers of the
+ * current task.
+ */
+void fix_processor_context(void)
+{
+       int cpu = smp_processor_id();
+       struct tss_struct *t = &per_cpu(init_tss, cpu);
+
+       /*
+        * This just modifies memory; it should not be necessary. But it is
+        * necessary, because 386 hardware has the concept of a busy TSS or
+        * some similar stupidity.
+        */
+       set_tss_desc(cpu,t);
+
+       /* Descriptor type 9 is an available (not busy) TSS, as ltr requires */
+       cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9;
+
+       syscall_init();                         /* This sets MSR_*STAR and related */
+       load_TR_desc();                         /* This does ltr */
+       load_LDT(&current->active_mm->context); /* This does lldt */
+
+       /*
+        * Now maybe reload the debug registers
+        * (only when the task's saved debugreg7 is non-zero)
+        */
+       if (current->thread.debugreg7){
+                loaddebug(&current->thread, 0);
+                loaddebug(&current->thread, 1);
+                loaddebug(&current->thread, 2);
+                loaddebug(&current->thread, 3);
+                /* no 4 and 5 */
+                loaddebug(&current->thread, 6);
+                loaddebug(&current->thread, 7);
+       }
+
+}
+
+#ifdef CONFIG_HIBERNATION
+/* Defined in arch/x86_64/kernel/suspend_asm.S */
+extern int restore_image(void);
+
+pgd_t *temp_level4_pgt;
+
+/*
+ * Fill the PUD page @pud with mappings for the physical range
+ * [@address, @end).  Each used PUD slot gets a freshly allocated PMD page
+ * whose entries map PMD_SIZE large pages (_PAGE_PSE).
+ *
+ * Returns 0 on success or -ENOMEM when get_safe_page() fails; pages
+ * allocated before the failure are not released here.
+ */
+static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{
+       long i, j;
+
+       i = pud_index(address);
+       pud = pud + i;
+       for (; i < PTRS_PER_PUD; pud++, i++) {
+               unsigned long paddr;
+               pmd_t *pmd;
+
+               /*
+                * NOTE(review): i starts at pud_index(address) and is added
+                * on top of address again, which is only consistent when
+                * pud_index(address) is 0 - confirm callers always pass a
+                * suitably aligned address.
+                */
+               paddr = address + i*PUD_SIZE;
+               if (paddr >= end)
+                       break;
+
+               pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
+               if (!pmd)
+                       return -ENOMEM;
+               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+               for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+                       unsigned long pe;
+
+                       if (paddr >= end)
+                               break;
+                       pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
+                       /* Drop bits that are not in __supported_pte_mask */
+                       pe &= __supported_pte_mask;
+                       set_pmd(pmd, __pmd(pe));
+               }
+       }
+       return 0;
+}
+
+/*
+ * Build the page tables used while the hibernation image is copied back:
+ * a new top-level table (temp_level4_pgt) that reuses the kernel text
+ * mapping of init_level4_pgt and gets a direct mapping of pfn 0..end_pfn
+ * built from get_safe_page() pages.
+ *
+ * Returns 0 on success or -ENOMEM if a page cannot be allocated.
+ */
+static int set_up_temporary_mappings(void)
+{
+       unsigned long start, end, next;
+       int error;
+
+       temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
+       if (!temp_level4_pgt)
+               return -ENOMEM;
+
+       /* It is safe to reuse the original kernel mapping */
+       set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
+               init_level4_pgt[pgd_index(__START_KERNEL_map)]);
+
+       /* Set up the direct mapping from scratch */
+       start = (unsigned long)pfn_to_kaddr(0);
+       end = (unsigned long)pfn_to_kaddr(end_pfn);
+
+       /* One PUD page per PGDIR_SIZE slice of the direct mapping */
+       for (; start < end; start = next) {
+               pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
+               if (!pud)
+                       return -ENOMEM;
+               next = start + PGDIR_SIZE;
+               if (next > end)
+                       next = end;
+               if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
+                       return error;
+               set_pgd(temp_level4_pgt + pgd_index(start),
+                       mk_kernel_pgd(__pa(pud)));
+       }
+       return 0;
+}
+
+/*
+ * Architecture hook for resuming from a hibernation image: sets up the
+ * temporary page tables and then calls restore_image().
+ *
+ * Returns the error from set_up_temporary_mappings() if that fails,
+ * otherwise 0 after restore_image() returns.
+ */
+int swsusp_arch_resume(void)
+{
+       int error;
+
+       /* We have got enough memory and from now on we cannot recover */
+       if ((error = set_up_temporary_mappings()))
+               return error;
+       restore_image();
+       return 0;
+}
+
+/*
+ *     pfn_is_nosave - check if given pfn is in the 'nosave' section
+ *
+ *     Returns 1 when @pfn lies between the page holding __nosave_begin
+ *     and the page-aligned end of __nosave_end (exclusive), 0 otherwise.
+ */
+
+int pfn_is_nosave(unsigned long pfn)
+{
+       unsigned long first_pfn, limit_pfn;
+
+       first_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
+       limit_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
+
+       if (pfn < first_pfn)
+               return 0;
+       return pfn < limit_pfn;
+}
+#endif /* CONFIG_HIBERNATION */
diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S
new file mode 100644 (file)
index 0000000..16d183f
--- /dev/null
@@ -0,0 +1,110 @@
+/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * Distribute under GPLv2.
+ *
+ * swsusp_arch_resume may not use any stack, nor any variable that is
+ * not "NoSave" during copying pages:
+ *
+ * It's rewriting one kernel image with another. What is stack in the "old"
+ * image could very well be a data page in the "new" image, and overwriting
+ * your own stack under you is a bad idea.
+ */
+       
+       .text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+
+/*
+ * swsusp_arch_suspend: store the general purpose registers, the stack
+ * pointer and the flags into the saved_context_* variables, then call
+ * swsusp_save and return to the caller with whatever it left in %rax.
+ */
+ENTRY(swsusp_arch_suspend)
+
+       movq %rsp, saved_context_esp(%rip)
+       movq %rax, saved_context_eax(%rip)
+       movq %rbx, saved_context_ebx(%rip)
+       movq %rcx, saved_context_ecx(%rip)
+       movq %rdx, saved_context_edx(%rip)
+       movq %rbp, saved_context_ebp(%rip)
+       movq %rsi, saved_context_esi(%rip)
+       movq %rdi, saved_context_edi(%rip)
+       movq %r8,  saved_context_r08(%rip)
+       movq %r9,  saved_context_r09(%rip)
+       movq %r10, saved_context_r10(%rip)
+       movq %r11, saved_context_r11(%rip)
+       movq %r12, saved_context_r12(%rip)
+       movq %r13, saved_context_r13(%rip)
+       movq %r14, saved_context_r14(%rip)
+       movq %r15, saved_context_r15(%rip)
+       /* flags go through the stack: push them, pop into the variable */
+       pushfq ; popq saved_context_eflags(%rip)
+
+       call swsusp_save
+       ret
+
+/*
+ * restore_image: switch to the temporary page tables, copy every page on
+ * the restore_pblist back to its original address, switch back to
+ * init_level4_pgt and reload the registers stored by swsusp_arch_suspend.
+ * Because the saved %rsp is reloaded, the final ret returns with %rax = 0
+ * on the stack that was saved at suspend time.
+ */
+ENTRY(restore_image)
+       /* switch to temporary page tables */
+       movq    $__PAGE_OFFSET, %rdx
+       movq    temp_level4_pgt(%rip), %rax
+       subq    %rdx, %rax              /* virtual -> physical address for %cr3 */
+       movq    %rax, %cr3
+       /* Flush TLB */
+       movq    mmu_cr4_features(%rip), %rax
+       movq    %rax, %rdx
+       andq    $~(1<<7), %rdx  # PGE
+       movq    %rdx, %cr4;  # turn off PGE
+       movq    %cr3, %rcx;  # flush TLB
+       movq    %rcx, %cr3;
+       movq    %rax, %cr4;  # turn PGE back on
+
+       /* walk the list until the next pointer in %rdx is NULL */
+       movq    restore_pblist(%rip), %rdx
+loop:
+       testq   %rdx, %rdx
+       jz      done
+
+       /* get addresses from the pbe and copy the page */
+       movq    pbe_address(%rdx), %rsi
+       movq    pbe_orig_address(%rdx), %rdi
+       movq    $512, %rcx              /* 512 quadwords = 4096 bytes */
+       rep
+       movsq
+
+       /* progress to the next pbe */
+       movq    pbe_next(%rdx), %rdx
+       jmp     loop
+done:
+       /* go back to the original page tables */
+       movq    $(init_level4_pgt - __START_KERNEL_map), %rax
+       addq    phys_base(%rip), %rax
+       movq    %rax, %cr3
+
+       /* Flush TLB, including "global" things (vmalloc) */
+       movq    mmu_cr4_features(%rip), %rax
+       movq    %rax, %rdx
+       andq    $~(1<<7), %rdx;  # PGE
+       movq    %rdx, %cr4;  # turn off PGE
+       movq    %cr3, %rcx;  # flush TLB
+       movq    %rcx, %cr3
+       movq    %rax, %cr4;  # turn PGE back on
+
+       /* NOTE(review): selector 24 is presumably the kernel data segment - confirm */
+       movl    $24, %eax
+       movl    %eax, %ds
+
+       movq saved_context_esp(%rip), %rsp
+       movq saved_context_ebp(%rip), %rbp
+       /* Don't restore %rax, it must be 0 anyway */
+       movq saved_context_ebx(%rip), %rbx
+       movq saved_context_ecx(%rip), %rcx
+       movq saved_context_edx(%rip), %rdx
+       movq saved_context_esi(%rip), %rsi
+       movq saved_context_edi(%rip), %rdi
+       movq saved_context_r08(%rip), %r8
+       movq saved_context_r09(%rip), %r9
+       movq saved_context_r10(%rip), %r10
+       movq saved_context_r11(%rip), %r11
+       movq saved_context_r12(%rip), %r12
+       movq saved_context_r13(%rip), %r13
+       movq saved_context_r14(%rip), %r14
+       movq saved_context_r15(%rip), %r15
+       pushq saved_context_eflags(%rip) ; popfq
+
+       /* return value 0 */
+       xorq    %rax, %rax
+
+       ret
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
new file mode 100644 (file)
index 0000000..4770b7a
--- /dev/null
@@ -0,0 +1,159 @@
+/*
+ * linux/arch/x86_64/kernel/sys_x86_64.c
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/personality.h>
+
+#include <asm/uaccess.h>
+#include <asm/ia32.h>
+
+/*
+ * sys_pipe() creates a pipe and hands its two file descriptors back through
+ * the user-space array 'fildes'.  This is the normal C calling standard; it
+ * is not the way Unix traditionally returns the descriptors.
+ *
+ * Returns 0 on success, the error reported by do_pipe(), or -EFAULT when
+ * the descriptors cannot be copied out to 'fildes'.
+ */
+asmlinkage long sys_pipe(int __user *fildes)
+{
+       int fd[2];
+       int error = do_pipe(fd);
+
+       if (error)
+               return error;
+
+       /* both descriptors go out to user space in a single copy */
+       if (copy_to_user(fildes, fd, sizeof(fd)))
+               return -EFAULT;
+
+       return 0;
+}
+
+/*
+ * mmap system call entry.
+ *
+ * 'off' is a byte offset and must have no bits set outside PAGE_MASK;
+ * it is converted to a page offset for do_mmap_pgoff().  MAP_EXECUTABLE
+ * and MAP_DENYWRITE are stripped from 'flags' before use.  Unless
+ * MAP_ANONYMOUS is set, 'fd' is resolved with fget() and released again
+ * once the mapping attempt is done.
+ *
+ * Returns the result of do_mmap_pgoff(), -EINVAL for a bad offset, or
+ * -EBADF when 'fd' cannot be resolved.
+ */
+asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
+       unsigned long fd, unsigned long off)
+{
+       struct file *file = NULL;
+       long error;
+
+       if (off & ~PAGE_MASK)
+               return -EINVAL;
+
+       flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+       if (!(flags & MAP_ANONYMOUS)) {
+               file = fget(fd);
+               if (!file)
+                       return -EBADF;
+       }
+
+       /* the mapping itself is set up with mmap_sem held for writing */
+       down_write(&current->mm->mmap_sem);
+       error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
+       up_write(&current->mm->mmap_sem);
+
+       if (file)
+               fput(file);
+
+       return error;
+}
+
+/*
+ * Report in *begin / *end the address range to search for a new mapping.
+ *
+ * A task without TIF_IA32 that asks for MAP_32BIT gets the fixed window
+ * 0x40000000-0x80000000; every other caller gets
+ * TASK_UNMAPPED_BASE-TASK_SIZE.
+ */
+static void find_start_end(unsigned long flags, unsigned long *begin,
+                          unsigned long *end)
+{
+       if (test_thread_flag(TIF_IA32) || !(flags & MAP_32BIT)) {
+               *begin = TASK_UNMAPPED_BASE;
+               *end = TASK_SIZE;
+               return;
+       }
+
+       /*
+        * MAP_32BIT is usually needed to map code in the small model, so
+        * the mapping has to be in the first 31 bits.  Limit it to that.
+        * This means we need to move the unmapped base down for this case.
+        * This can give conflicts with the heap, but we assume that glibc
+        * malloc knows how to fall back to mmap.  Give it 1GB of playground
+        * for now. -AK
+        */
+       *begin = 0x40000000;
+       *end = 0x80000000;
+}
+
+/*
+ * Pick an address for a new mapping of 'len' bytes in the current mm.
+ *
+ * MAP_FIXED requests get 'addr' back unchanged.  Otherwise the range
+ * [begin, end) supplied by find_start_end() is used: a non-zero 'addr'
+ * hint is page aligned and tried first, then the VMA list is walked
+ * starting at mm->free_area_cache (but not below 'begin').  If that walk
+ * runs out of room and did not start at 'begin', it is repeated once from
+ * 'begin'.
+ *
+ * 'filp' and 'pgoff' are not used here.
+ *
+ * Returns the chosen address, or -ENOMEM when no suitable hole exists.
+ */
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+               unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       unsigned long start_addr;
+       unsigned long begin, end;
+       
+       if (flags & MAP_FIXED)
+               return addr;
+
+       find_start_end(flags, &begin, &end); 
+
+       if (len > end)
+               return -ENOMEM;
+
+       /* Try the caller's hint first: it must fit below 'end' and not overlap a VMA. */
+       if (addr) {
+               addr = PAGE_ALIGN(addr);
+               vma = find_vma(mm, addr);
+               if (end - len >= addr &&
+                   (!vma || addr + len <= vma->vm_start))
+                       return addr;
+       }
+       /*
+        * For MAP_32BIT or TIF_IA32 requests that fit in the largest hole
+        * recorded so far, restart the cached search from 'begin'.
+        */
+       if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
+           && len <= mm->cached_hole_size) {
+               mm->cached_hole_size = 0;
+               mm->free_area_cache = begin;
+       }
+       addr = mm->free_area_cache;
+       if (addr < begin) 
+               addr = begin; 
+       start_addr = addr;
+
+full_search:
+       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+               /* At this point:  (!vma || addr < vma->vm_end). */
+               if (end - len < addr) {
+                       /*
+                        * Start a new search - just in case we missed
+                        * some holes.
+                        */
+                       if (start_addr != begin) {
+                               start_addr = addr = begin;
+                               mm->cached_hole_size = 0;
+                               goto full_search;
+                       }
+                       return -ENOMEM;
+               }
+               if (!vma || addr + len <= vma->vm_start) {
+                       /*
+                        * Remember the place where we stopped the search:
+                        */
+                       mm->free_area_cache = addr + len;
+                       return addr;
+               }
+               /* Hole before this VMA is too small; remember it if it is the largest seen. */
+               if (addr + mm->cached_hole_size < vma->vm_start)
+                       mm->cached_hole_size = vma->vm_start - addr;
+
+               /* Continue the search just past this VMA. */
+               addr = vma->vm_end;
+       }
+}
+
+/*
+ * Copy the current utsname() data to the user buffer 'name'.
+ *
+ * The copy is made with uts_sem held for reading.  For tasks whose
+ * personality is PER_LINUX32 the machine field is then overwritten with
+ * "i686".
+ *
+ * Returns 0 on success or -EFAULT if any copy to user space failed.
+ */
+asmlinkage long sys_uname(struct new_utsname __user * name)
+{
+       int failed;
+
+       down_read(&uts_sem);
+       failed = copy_to_user(name, utsname(), sizeof (*name));
+       up_read(&uts_sem);
+
+       if (personality(current->personality) == PER_LINUX32)
+               failed |= copy_to_user(&name->machine, "i686", 5);
+
+       if (failed)
+               return -EFAULT;
+       return 0;
+}
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
new file mode 100644 (file)
index 0000000..63d592c
--- /dev/null
@@ -0,0 +1,26 @@
+/* System call table for x86-64. */ 
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+
+/* NOTE(review): presumably suppresses stub definitions in unistd.h -- confirm in the header. */
+#define __NO_STUBS
+
+/*
+ * First pass over unistd.h: every __SYSCALL(nr, sym) entry expands to an
+ * extern declaration of the handler.  _ASM_X86_64_UNISTD_H_ (presumably
+ * the header's include guard) is undefined first so that the header is
+ * processed even if it was already included.
+ */
+#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 
+#undef _ASM_X86_64_UNISTD_H_
+#include <asm-x86_64/unistd.h>
+
+/*
+ * Second pass (the #include inside the array initializer below): the same
+ * entries now expand to designated initializers of the form "[nr] = sym,".
+ */
+#undef __SYSCALL
+#define __SYSCALL(nr, sym) [ nr ] = sym, 
+#undef _ASM_X86_64_UNISTD_H_
+
+typedef void (*sys_call_ptr_t)(void); 
+
+extern void sys_ni_syscall(void);
+
+/*
+ * The system call table.  Every slot is first set to sys_ni_syscall; the
+ * entries generated from unistd.h then override the slots they name.
+ */
+const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+       /* Smells like a compiler bug -- it doesn't work when the & below is removed. */ 
+       [0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm-x86_64/unistd.h>
+};
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
new file mode 100644 (file)
index 0000000..e3f2569
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ * This file manages the translation entries for the IBM Calgary IOMMU.
+ *
+ * Derived from arch/powerpc/platforms/pseries/iommu.c
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/bootmem.h>
+#include <asm/tce.h>
+#include <asm/calgary.h>
+#include <asm/proto.h>
+
+/*
+ * Flush the tce at 'tceaddr' to main memory.
+ *
+ * Uses clflush on the entry's address when cpu_has_clflush is set and
+ * falls back to wbinvd otherwise.
+ *
+ * Both asm statements carry a "memory" clobber.  Callers store the entry
+ * immediately before calling this; without the clobber the compiler would
+ * be free to move that store past the flush instruction.
+ */
+static inline void flush_tce(void* tceaddr)
+{
+       /* a single tce can't cross a cache line */
+       if (cpu_has_clflush)
+               asm volatile("clflush (%0)" :: "r" (tceaddr) : "memory");
+       else
+               asm volatile("wbinvd":::"memory");
+}
+
+/*
+ * Write 'npages' consecutive translation entries into 'tbl', starting at
+ * entry 'index', for the pages beginning at kernel virtual address 'uaddr'.
+ *
+ * Every entry gets the read bit; the write bit is added unless 'direction'
+ * is DMA_TO_DEVICE.  Each entry is stored big-endian and flushed with
+ * flush_tce() right after it is written.
+ */
+void tce_build(struct iommu_table *tbl, unsigned long index,
+       unsigned int npages, unsigned long uaddr, int direction)
+{
+       u64* tp;
+       u64 t;
+       u64 rpn;
+
+       /* permission bits are the same for every entry of this mapping */
+       t = (1 << TCE_READ_SHIFT);
+       if (direction != DMA_TO_DEVICE)
+               t |= (1 << TCE_WRITE_SHIFT);
+
+       tp = ((u64*)tbl->it_base) + index;
+
+       while (npages--) {
+               /* replace the page-number field, keep the permission bits */
+               rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
+               t &= ~TCE_RPN_MASK;
+               t |= (rpn << TCE_RPN_SHIFT);
+
+               *tp = cpu_to_be64(t);
+               flush_tce(tp);
+
+               uaddr += PAGE_SIZE;
+               tp++;
+       }
+}
+
+/*
+ * Clear 'npages' consecutive translation entries of 'tbl', starting at
+ * entry 'index'.  Each entry is zeroed and then flushed with flush_tce().
+ */
+void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
+{
+       u64 *entry = ((u64 *)tbl->it_base) + index;
+       unsigned int i;
+
+       for (i = 0; i < npages; i++) {
+               entry[i] = cpu_to_be64(0);
+               flush_tce(&entry[i]);
+       }
+}
+
+/*
+ * Convert a table size order into a number of entries.
+ *
+ * 'size' is the order of the table, 0-7.  The smallest table holds 8K
+ * entries and each order doubles that.
+ */
+static inline unsigned int table_size_to_number_of_entries(unsigned char size)
+{
+       int entries = 1 << size;
+
+       /* shift by 13 to multiply by 8K */
+       entries <<= 13;
+
+       return entries;
+}
+
+/*
+ * Fill in the parameters of 'tbl' for the bus that 'dev' sits on and
+ * allocate its zeroed allocation bitmap (one bit per table entry).
+ *
+ * Returns 0 on success or -ENOMEM if the bitmap pages cannot be allocated.
+ */
+static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
+{
+       unsigned int bitmapsz;
+       unsigned long bmppages;
+
+       tbl->it_busno = dev->bus->number;
+
+       /* table size is measured in entries */
+       tbl->it_size = table_size_to_number_of_entries(specified_table_size);
+
+       /* one bit per entry, so this is the bitmap size in bytes */
+       bitmapsz = tbl->it_size / BITS_PER_BYTE;
+       bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
+       if (!bmppages) {
+               printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
+               return -ENOMEM;
+       }
+
+       tbl->it_map = (unsigned long*)bmppages;
+       memset(tbl->it_map, 0, bitmapsz);
+       tbl->it_hint = 0;
+       spin_lock_init(&tbl->it_lock);
+
+       return 0;
+}
+
+/*
+ * Allocate and initialise the iommu_table for the bus of 'dev' and attach
+ * it to that bus with set_pci_iommu().  'bbar' is stored in the table.
+ *
+ * It is a BUG() if the bus already has an iommu table attached.
+ *
+ * Returns 0 on success, -ENOMEM if the table cannot be allocated, or the
+ * error from tce_table_setparms() (the table is freed again in that case).
+ */
+int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
+{
+       struct iommu_table *tbl;
+       int ret;
+
+       if (pci_iommu(dev->bus)) {
+               printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
+                      dev, pci_iommu(dev->bus));
+               BUG();
+       }
+
+       tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
+       if (!tbl) {
+               printk(KERN_ERR "Calgary: error allocating iommu_table\n");
+               return -ENOMEM;
+       }
+
+       ret = tce_table_setparms(dev, tbl);
+       if (ret) {
+               kfree(tbl);
+               return ret;
+       }
+
+       tbl->bbar = bbar;
+       set_pci_iommu(dev->bus, tbl);
+
+       return 0;
+}
+
+/*
+ * Allocate a tce table from low bootmem.  The table holds the number of
+ * entries selected by specified_table_size, TCE_ENTRY_SIZE bytes each,
+ * and the allocation is aligned to its own size.
+ */
+void * __init alloc_tce_table(void)
+{
+       unsigned int bytes =
+               table_size_to_number_of_entries(specified_table_size) *
+               TCE_ENTRY_SIZE;
+
+       return __alloc_bootmem_low(bytes, bytes, 0);
+}
+
+/*
+ * Give a tce table obtained from alloc_tce_table() back to bootmem.
+ * A NULL 'tbl' is ignored.
+ */
+void __init free_tce_table(void *tbl)
+{
+       unsigned int bytes;
+
+       if (tbl) {
+               bytes = table_size_to_number_of_entries(specified_table_size) *
+                       TCE_ENTRY_SIZE;
+               free_bootmem(__pa(tbl), bytes);
+       }
+}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
new file mode 100644 (file)
index 0000000..6d48a4e
--- /dev/null
@@ -0,0 +1,447 @@
+/*
+ *  linux/arch/x86-64/kernel/time.c
+ *
+ *  "High Precision Event Timer" based timekeeping.
+ *
+ *  Copyright (c) 1991,1992,1995  Linus Torvalds
+ *  Copyright (c) 1994  Alan Modra
+ *  Copyright (c) 1995  Markus Kuhn
+ *  Copyright (c) 1996  Ingo Molnar
+ *  Copyright (c) 1998  Andrea Arcangeli
+ *  Copyright (c) 2002,2006  Vojtech Pavlik
+ *  Copyright (c) 2003  Andi Kleen
+ *  RTC support code taken from arch/i386/kernel/timers/time_hpet.c
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/mc146818rtc.h>
+#include <linux/time.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/sysdev.h>
+#include <linux/bcd.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/kallsyms.h>
+#include <linux/acpi.h>
+#ifdef CONFIG_ACPI
+#include <acpi/achware.h>      /* for PM timer frequency */
+#include <acpi/acpi_bus.h>
+#endif
+#include <asm/8253pit.h>
+#include <asm/i8253.h>
+#include <asm/pgtable.h>
+#include <asm/vsyscall.h>
+#include <asm/timex.h>
+#include <asm/proto.h>
+#include <asm/hpet.h>
+#include <asm/sections.h>
+#include <linux/hpet.h>
+#include <asm/apic.h>
+#include <asm/hpet.h>
+#include <asm/mpspec.h>
+#include <asm/nmi.h>
+#include <asm/vgtod.h>
+
+static char *timename = NULL;
+
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
+DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+
+/*
+ * Return the program counter to charge a profiling sample to.
+ *
+ * Normally this is the instruction pointer from 'regs'.  When the sample
+ * hit kernel code inside a lock function, the first two stack words are
+ * inspected instead and the first one that looks like a kernel address
+ * is returned.
+ */
+unsigned long profile_pc(struct pt_regs *regs)
+{
+       unsigned long pc = instruction_pointer(regs);
+       unsigned long *sp;
+       int slot;
+
+       if (user_mode(regs) || !in_lock_functions(pc))
+               return pc;
+
+       /*
+        * Assume the lock function has either no stack frame or a copy
+        * of eflags from PUSHF.
+        * Eflags always has bits 22 and up cleared unlike kernel addresses.
+        */
+       sp = (unsigned long *)regs->rsp;
+       for (slot = 0; slot < 2; slot++)
+               if (sp[slot] >> 22)
+                       return sp[slot];
+
+       return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+
+/*
+ * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
+ * ms after the second nowtime has started, because when nowtime is written
+ * into the registers of the CMOS clock, it will jump to the next second
+ * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
+ * sheet for details.
+ *
+ * Only the minutes and seconds registers are written.  Returns 0 on success
+ * or -1 when the CMOS minutes differ too much from 'nowtime' to be updated.
+ */
+
+static int set_rtc_mmss(unsigned long nowtime)
+{
+       int retval = 0;
+       int real_seconds, real_minutes, cmos_minutes;
+       unsigned char control, freq_select;
+
+/*
+ * IRQs are disabled when we're called from the timer interrupt,
+ * no need for spin_lock_irqsave()
+ */
+
+       spin_lock(&rtc_lock);
+
+/*
+ * Tell the clock it's being set and stop it.
+ */
+
+       control = CMOS_READ(RTC_CONTROL);
+       CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
+
+       freq_select = CMOS_READ(RTC_FREQ_SELECT);
+       CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
+
+       /* current CMOS minutes, converted in place from BCD */
+       cmos_minutes = CMOS_READ(RTC_MINUTES);
+               BCD_TO_BIN(cmos_minutes);
+
+/*
+ * since we're only adjusting minutes and seconds, don't interfere with hour
+ * overflow. This avoids messing with unknown time zones but requires your RTC
+ * not to be off by more than 15 minutes. Since we're calling it only when
+ * our clock is externally synchronized using NTP, this shouldn't be a problem.
+ */
+
+       real_seconds = nowtime % 60;
+       real_minutes = nowtime / 60;
+       if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
+               real_minutes += 30;             /* correct for half hour time zone */
+       real_minutes %= 60;
+
+       /* refuse the update when the minutes are still 30 or more apart */
+       if (abs(real_minutes - cmos_minutes) >= 30) {
+               printk(KERN_WARNING "time.c: can't update CMOS clock "
+                      "from %d to %d\n", cmos_minutes, real_minutes);
+               retval = -1;
+       } else {
+               BIN_TO_BCD(real_seconds);
+               BIN_TO_BCD(real_minutes);
+               CMOS_WRITE(real_seconds, RTC_SECONDS);
+               CMOS_WRITE(real_minutes, RTC_MINUTES);
+       }
+
+/*
+ * The following flags have to be released exactly in this order, otherwise the
+ * DS12887 (popular MC146818A clone with integrated battery and quartz) will
+ * not reset the oscillator and will not update precisely 500 ms later. You
+ * won't find this mentioned in the Dallas Semiconductor data sheets, but who
+ * believes data sheets anyway ... -- Markus Kuhn
+ */
+
+       CMOS_WRITE(control, RTC_CONTROL);
+       CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
+
+       spin_unlock(&rtc_lock);
+
+       return retval;
+}
+
+/*
+ * Write the seconds value of 'now' to the CMOS clock via set_rtc_mmss()
+ * and return its result.
+ */
+int update_persistent_clock(struct timespec now)
+{
+       unsigned long secs = now.tv_sec;
+
+       return set_rtc_mmss(secs);
+}
+
+/*
+ * Per-tick work of the main timer: advances the kernel time by one tick
+ * via do_timer(1) with xtime_lock held for writing.  On non-SMP builds it
+ * also updates process times, and when the APIC timer is not in use it
+ * calls the local timer interrupt handler itself.
+ */
+void main_timer_handler(void)
+{
+/*
+ * Here we are in the timer irq handler. We have irqs locally disabled (so we
+ * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
+ * on the other CPU, so we need a lock. We also need to lock the vsyscall
+ * variables, because both do_timer() and us change them -arca+vojtech
+ */
+
+       write_seqlock(&xtime_lock);
+
+/*
+ * Do the timer stuff.
+ */
+
+       do_timer(1);
+#ifndef CONFIG_SMP
+       update_process_times(user_mode(get_irq_regs()));
+#endif
+
+/*
+ * In the SMP case we use the local APIC timer interrupt to do the profiling,
+ * except when we simulate SMP mode on a uniprocessor system, in that case we
+ * have to call the local interrupt handler.
+ */
+
+       if (!using_apic_timer)
+               smp_local_timer_interrupt();
+
+       write_sequnlock(&xtime_lock);
+}
+
+/*
+ * Interrupt handler installed for the timer interrupt (see irq0).
+ *
+ * Does nothing when apic_runs_main_timer is greater than 1.  Otherwise it
+ * runs main_timer_handler() and, if the APIC timer is in use, sends the
+ * timer broadcast IPI.  Always returns IRQ_HANDLED.
+ */
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+       if (apic_runs_main_timer <= 1) {
+               main_timer_handler();
+               if (using_apic_timer)
+                       smp_send_timer_broadcast_ipi();
+       }
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Read the date and time from the CMOS RTC and return it as the value
+ * computed by mktime().
+ *
+ * The registers are re-read until the seconds value is the same before and
+ * after the pass.  With CONFIG_ACPI, a century register named by the FADT
+ * is used when present; otherwise the year is taken to be 20xx.
+ */
+unsigned long read_persistent_clock(void)
+{
+       unsigned int year, mon, day, hour, min, sec;
+       unsigned long flags;
+       unsigned century = 0;
+
+       spin_lock_irqsave(&rtc_lock, flags);
+
+       /* repeat the whole read if the seconds register changed meanwhile */
+       do {
+               sec = CMOS_READ(RTC_SECONDS);
+               min = CMOS_READ(RTC_MINUTES);
+               hour = CMOS_READ(RTC_HOURS);
+               day = CMOS_READ(RTC_DAY_OF_MONTH);
+               mon = CMOS_READ(RTC_MONTH);
+               year = CMOS_READ(RTC_YEAR);
+#ifdef CONFIG_ACPI
+               if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+                                       acpi_gbl_FADT.century)
+                       century = CMOS_READ(acpi_gbl_FADT.century);
+#endif
+       } while (sec != CMOS_READ(RTC_SECONDS));
+
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       /*
+        * We know that x86-64 always uses BCD format, no need to check the
+        * config register.
+        */
+
+       BCD_TO_BIN(sec);
+       BCD_TO_BIN(min);
+       BCD_TO_BIN(hour);
+       BCD_TO_BIN(day);
+       BCD_TO_BIN(mon);
+       BCD_TO_BIN(year);
+
+       if (century) {
+               BCD_TO_BIN(century);
+               year += century * 100;
+               /* NOTE(review): 'century' is unsigned but printed with %d. */
+               printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
+       } else {
+               /*
+                * x86-64 systems have only existed since 2002.
+                * This will work up to Dec 31, 2100
+                */
+               year += 2000;
+       }
+
+       return mktime(year, mon, day, hour, min, sec);
+}
+
+/*
+ * tsc_calibrate_cpu_khz is used on systems with fixed rate TSCs to determine
+ * processor frequency.
+ *
+ * A performance counter is programmed to count from 0 while the TSC
+ * advances by at least TICK_COUNT; the result is the counter value scaled
+ * by tsc_khz over the elapsed TSC ticks.  The first of the four counters
+ * reported available is reserved for this; if none is available, counter 3
+ * is borrowed and its event select and count are restored afterwards.
+ */
+#define TICK_COUNT 100000000
+static unsigned int __init tsc_calibrate_cpu_khz(void)
+{
+       /*
+        * NOTE(review): these are plain ints, so TSC readings are truncated
+        * to 32 bits and the subtraction below is signed -- confirm that is
+        * intended.
+        */
+       int tsc_start, tsc_now;
+       int i, no_ctr_free;
+       unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
+       unsigned long flags;
+
+       /* look for a counter that can be reserved */
+       for (i = 0; i < 4; i++)
+               if (avail_to_resrv_perfctr_nmi_bit(i))
+                       break;
+       no_ctr_free = (i == 4);
+       if (no_ctr_free) {
+               /* none free: save counter 3's state and stop it */
+               i = 3;
+               rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
+               wrmsrl(MSR_K7_EVNTSEL3, 0);
+               rdmsrl(MSR_K7_PERFCTR3, pmc3);
+       } else {
+               reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+               reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+       }
+       local_irq_save(flags);
+       /* start measuring cycles, incrementing from 0 */
+       wrmsrl(MSR_K7_PERFCTR0 + i, 0);
+       /* NOTE(review): presumably enable | user+kernel | event 0x76 -- confirm against the CPU manual */
+       wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
+       rdtscl(tsc_start);
+       do {
+               rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
+               tsc_now = get_cycles_sync();
+       } while ((tsc_now - tsc_start) < TICK_COUNT);
+
+       local_irq_restore(flags);
+       if (no_ctr_free) {
+               /* put the borrowed counter back the way it was found */
+               wrmsrl(MSR_K7_EVNTSEL3, 0);
+               wrmsrl(MSR_K7_PERFCTR3, pmc3);
+               wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
+       } else {
+               release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+               release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+       }
+
+       return pmc_now * tsc_khz / (tsc_now - tsc_start);
+}
+
+/*
+ * pit_calibrate_tsc() uses the speaker output (channel 2) of
+ * the PIT. This is better than using the timer interrupt output,
+ * because we can read the value of the speaker with just one inb(),
+ * where we need three i/o operations for the interrupt channel.
+ * We count how many ticks the TSC does in 50 ms.
+ *
+ * Returns the TSC tick count divided by 50, i.e. ticks per millisecond.
+ */
+
+static unsigned int __init pit_calibrate_tsc(void)
+{
+       unsigned long start, end;
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8253_lock, flags);
+
+       /* port 0x61: clear bit 1, set bit 0 (presumably speaker off, channel 2 gate on) */
+       outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+       /* program the counter with the number of PIT ticks in 50 ms, low byte first */
+       outb(0xb0, 0x43);
+       outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
+       outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
+       start = get_cycles_sync();
+       /* busy-wait until bit 5 of port 0x61 is set */
+       while ((inb(0x61) & 0x20) == 0);
+       end = get_cycles_sync();
+
+       spin_unlock_irqrestore(&i8253_lock, flags);
+
+       return (end - start) / 50;
+}
+
+/* I/O ports used by __pit_init(): the mode register and the channel 0 counter. */
+#define PIT_MODE 0x43
+#define PIT_CH0  0x40
+
+/*
+ * Program PIT channel 0: write 'mode' to the mode port, then the 16-bit
+ * count 'val' as low byte followed by high byte, all under i8253_lock.
+ */
+static void __pit_init(int val, u8 mode)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&i8253_lock, flags);
+       outb_p(mode, PIT_MODE);
+       outb_p(val & 0xff, PIT_CH0);    /* LSB */
+       outb_p(val >> 8, PIT_CH0);      /* MSB */
+       spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+/* Start PIT channel 0 with a count of LATCH. */
+void __init pit_init(void)
+{
+       __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */
+}
+
+/* Reprogram PIT channel 0 with mode 0 and a count of 0 to stop its interrupt. */
+void pit_stop_interrupt(void)
+{
+       __pit_init(0, 0x30); /* mode 0 */
+}
+
+/*
+ * Stop the timer interrupt source currently in use -- the HPET when
+ * hpet_address is set, the PIT otherwise -- and log which one was stopped.
+ */
+void stop_timer_interrupt(void)
+{
+       const char *source = "PIT";
+
+       if (hpet_address) {
+               source = "HPET";
+               hpet_timer_stop_set_go(0);
+       } else {
+               pit_stop_interrupt();
+       }
+
+       printk(KERN_INFO "timer: %s interrupt stopped.\n", source);
+}
+
+/* Action for the timer interrupt; installed on IRQ 0 by time_init(). */
+static struct irqaction irq0 = {
+       .handler        = timer_interrupt,
+       .flags          = IRQF_DISABLED | IRQF_IRQPOLL,
+       .mask           = CPU_MASK_NONE,
+       .name           = "timer"
+};
+
+/*
+ * Boot-time timer setup: select HPET or PIT as the tick source, calibrate
+ * tsc_khz against it, derive cpu_khz, choose the vgetcpu mode, register
+ * the TSC clocksource and install the timer interrupt on IRQ 0.
+ */
+void __init time_init(void)
+{
+       /* drop the HPET if it was disabled or fails to initialise */
+       if (nohpet)
+               hpet_address = 0;
+
+       if (hpet_arch_init())
+               hpet_address = 0;
+
+       if (hpet_use_timer) {
+               /* set tick_nsec to use the proper rate for HPET */
+               tick_nsec = TICK_NSEC_HPET;
+               tsc_khz = hpet_calibrate_tsc();
+               timename = "HPET";
+       } else {
+               pit_init();
+               tsc_khz = pit_calibrate_tsc();
+               timename = "PIT";
+       }
+
+       /*
+        * cpu_khz normally equals tsc_khz; on AMD family 16 (0x10) parts
+        * with a constant TSC it is measured separately.
+        */
+       cpu_khz = tsc_khz;
+       if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
+               boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+               boot_cpu_data.x86 == 16)
+               cpu_khz = tsc_calibrate_cpu_khz();
+
+       if (unsynchronized_tsc())
+               mark_tsc_unstable("TSCs unsynchronized");
+
+       if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
+               vgetcpu_mode = VGETCPU_RDTSCP;
+       else
+               vgetcpu_mode = VGETCPU_LSL;
+
+       set_cyc2ns_scale(tsc_khz);
+       /* NOTE(review): the %d conversions assume cpu_khz is a signed int -- confirm its declaration. */
+       printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
+               cpu_khz / 1000, cpu_khz % 1000);
+       init_tsc_clocksource();
+
+       setup_irq(0, &irq0);
+}
+
+/*
+ * sysfs support for the timer.
+ */
+
+/* Suspend hook of the timer sysdev class: does nothing and returns 0. */
+static int timer_suspend(struct sys_device *dev, pm_message_t state)
+{
+       return 0;
+}
+
+/*
+ * Resume hook of the timer sysdev class: re-enables the HPET when
+ * hpet_address is set, otherwise resumes the i8254 timer.  Returns 0.
+ */
+static int timer_resume(struct sys_device *dev)
+{
+       if (!hpet_address)
+               i8254_timer_resume();
+       else
+               hpet_reenable();
+
+       return 0;
+}
+
+/* sysdev class named "timer", wired to the suspend/resume hooks above. */
+static struct sysdev_class timer_sysclass = {
+       .resume = timer_resume,
+       .suspend = timer_suspend,
+       set_kset_name("timer"),
+};
+
+/* XXX this sysfs stuff should probably go elsewhere later -john */
+/* The single timer device (id 0) registered in timer_sysclass. */
+static struct sys_device device_timer = {
+       .id     = 0,
+       .cls    = &timer_sysclass,
+};
+
+/*
+ * Register the timer sysdev class and, if that succeeds, the timer device.
+ * Returns 0 on success or the first registration error.  Run as a device
+ * initcall.
+ */
+static int time_init_device(void)
+{
+       int error = sysdev_class_register(&timer_sysclass);
+
+       if (error)
+               return error;
+
+       return sysdev_register(&device_timer);
+}
+
+device_initcall(time_init_device);
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
new file mode 100644 (file)
index 0000000..607983b
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ *
+ *     Trampoline.S    Derived from Setup.S by Linus Torvalds
+ *
+ *     4 Jan 1997 Michael Chastain: changed to gnu as.
+ *     15 Sept 2005 Eric Biederman: 64bit PIC support
+ *
+ *     Entry: CS:IP point to the start of our code, we are 
+ *     in real mode with no stack, but the rest of the 
+ *     trampoline page to make our stack and everything else
+ *     is a mystery.
+ *
+ *     In fact we don't actually need a stack so we don't
+ *     set one up.
+ *
+ *     On entry to trampoline_data, the processor is in real mode
+ *     with 16-bit addressing and 16-bit data.  CS has some value
+ *     and IP is zero.  Thus, data addresses need to be absolute
+ *     (no relocation) and are taken with regard to r_base.
+ *
+ *     With the addition of trampoline_level4_pgt this code can
+ *     now enter a 64bit kernel that lives at arbitrary 64bit
+ *     physical addresses.
+ *
+ *     If you work on this file, check the object module with objdump
+ *     --full-contents --reloc to make sure there are no relocation
+ *     entries.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/segment.h>
+
+.data
+
+.code16
+
+/*
+ * 16-bit entry point.  Sets the data segments to %cs, writes the
+ * "running" marker, sets up a stack, checks for long mode, relocates the
+ * jump vectors and the GDT pointer by the trampoline's linear address
+ * (%cs << 4, kept in %esi), then switches to protected mode and jumps to
+ * startup_32.
+ */
+ENTRY(trampoline_data)
+r_base = .
+       cli                     # We should be safe anyway
+       wbinvd  
+       mov     %cs, %ax        # Code and data in the same place
+       mov     %ax, %ds
+       mov     %ax, %es
+       mov     %ax, %ss
+
+
+       movl    $0xA5A5A5A5, trampoline_data - r_base
+                               # write marker so the master knows we're running
+
+                                       # Setup stack
+       movw    $(trampoline_stack_end - r_base), %sp
+
+       call    verify_cpu              # Verify the cpu supports long mode
+       testl   %eax, %eax              # Check for return code
+       jnz     no_longmode
+
+       mov     %cs, %ax
+       movzx   %ax, %esi               # Find the 32bit trampoline location
+       shll    $4, %esi
+
+                                       # Fixup the vectors
+       addl    %esi, startup_32_vector - r_base
+       addl    %esi, startup_64_vector - r_base
+       addl    %esi, tgdt + 2 - r_base # Fixup the gdt pointer
+
+       /*
+        * GDT tables in non default location kernel can be beyond 16MB and
+        * lgdt will not be able to load the address as in real mode default
+        * operand size is 16bit. Use lgdtl instead to force operand size
+        * to 32 bit.
+        */
+
+       lidtl   tidt - r_base   # load idt with 0, 0
+       lgdtl   tgdt - r_base   # load gdt with whatever is appropriate
+
+       xor     %ax, %ax
+       inc     %ax             # protected mode (PE) bit
+       lmsw    %ax             # into protected mode
+
+       # flush prefetch and jump to startup_32
+       ljmpl   *(startup_32_vector - r_base)
+
+       /*
+        * 32-bit protected mode stage.  %esi still holds the trampoline's
+        * linear address.  Enables PAE, points %cr3 at
+        * trampoline_level4_pgt, sets EFER.LME, turns on paging and jumps
+        * to startup_64 through startup_64_vector.
+        */
+       .code32
+       .balign 4
+startup_32:
+       movl    $__KERNEL_DS, %eax      # Initialize the %ds segment register
+       movl    %eax, %ds
+
+       xorl    %eax, %eax
+       btsl    $5, %eax                # Enable PAE mode
+       movl    %eax, %cr4
+
+                                       # Setup trampoline 4 level pagetables
+       leal    (trampoline_level4_pgt - r_base)(%esi), %eax
+       movl    %eax, %cr3
+
+       movl    $MSR_EFER, %ecx
+       movl    $(1 << _EFER_LME), %eax # Enable Long Mode
+       xorl    %edx, %edx
+       wrmsr
+
+       xorl    %eax, %eax
+       btsl    $31, %eax               # Enable paging and in turn activate Long Mode
+       btsl    $0, %eax                # Enable protected mode
+       movl    %eax, %cr0
+
+       /*
+        * At this point we're in long mode but in 32bit compatibility mode
+        * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
+        * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
+        * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+        */
+       ljmp    *(startup_64_vector - r_base)(%esi)
+
+       /* 64-bit stage: leave the trampoline for secondary_startup_64. */
+       .code64
+       .balign 4
+startup_64:
+       # Now jump into the kernel using virtual addresses
+       movq    $secondary_startup_64, %rax
+       jmp     *%rax
+
+       /*
+        * Reached when verify_cpu returns non-zero: halt forever.
+        * verify_cpu itself comes from the file included below.
+        */
+       .code16
+no_longmode:
+       hlt
+       jmp no_longmode
+#include "verify_cpu_64.S"
+
+       # Careful these need to be in the same 64K segment as the above;
+tidt:
+       .word   0                       # idt limit = 0
+       .word   0, 0                    # idt base = 0L
+
+       # Duplicate the global descriptor table
+       # so the kernel can live anywhere
+       /*
+        * The first 8 bytes at tgdt hold the limit/base pair loaded by
+        * lgdtl; the base points back at tgdt and is relocated at run
+        * time by the 16-bit code ("tgdt + 2").  The three descriptors
+        * follow.
+        */
+       .balign 4
+tgdt:
+       .short  tgdt_end - tgdt         # gdt limit
+       .long   tgdt - r_base
+       .short 0
+       .quad   0x00cf9b000000ffff      # __KERNEL32_CS
+       .quad   0x00af9b000000ffff      # __KERNEL_CS
+       .quad   0x00cf93000000ffff      # __KERNEL_DS
+tgdt_end:
+
+       /*
+        * Far-jump targets (offset, selector).  The offsets are relative
+        * to r_base here and get the trampoline's linear address added at
+        * run time.
+        */
+       .balign 4
+startup_32_vector:
+       .long   startup_32 - r_base
+       .word   __KERNEL32_CS, 0
+
+       .balign 4
+startup_64_vector:
+       .long   startup_64 - r_base
+       .word   __KERNEL_CS, 0
+
+       /* Stack area: everything from here up to offset 0x1000. */
+trampoline_stack:
+       .org 0x1000
+trampoline_stack_end:
+       /*
+        * Top-level page table used by the trampoline: the first entry
+        * points at level3_ident_pgt, the last at level3_kernel_pgt, and
+        * the 510 entries in between are zero.
+        */
+ENTRY(trampoline_level4_pgt)
+       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+       .fill   510,8,0
+       .quad   level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
+
+ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
new file mode 100644 (file)
index 0000000..0388842
--- /dev/null
@@ -0,0 +1,1138 @@
+/*
+ *  linux/arch/x86-64/traps.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ *
+ *  Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * 'Traps.c' handles hardware traps and faults after we have saved some
+ * state in 'entry.S'.
+ */
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/nmi.h>
+#include <linux/kprobes.h>
+#include <linux/kexec.h>
+#include <linux/unwind.h>
+#include <linux/uaccess.h>
+#include <linux/bug.h>
+#include <linux/kdebug.h>
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/processor.h>
+#include <asm/unwind.h>
+#include <asm/smp.h>
+#include <asm/pgalloc.h>
+#include <asm/pda.h>
+#include <asm/proto.h>
+#include <asm/nmi.h>
+#include <asm/stacktrace.h>
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+asmlinkage void double_fault(void);
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void simd_coprocessor_error(void);
+asmlinkage void reserved(void);
+asmlinkage void alignment_check(void);
+asmlinkage void machine_check(void);
+asmlinkage void spurious_interrupt_bug(void);
+
+/*
+ * Turn interrupts back on, but only when the saved flags show that the
+ * interrupted context was running with them enabled.
+ */
+static inline void conditional_sti(struct pt_regs *regs)
+{
+       if (!(regs->eflags & X86_EFLAGS_IF))
+               return;
+       local_irq_enable();
+}
+
+/*
+ * Disable preemption, then re-enable interrupts if the saved flags had
+ * them enabled. Paired with preempt_conditional_cli().
+ */
+static inline void preempt_conditional_sti(struct pt_regs *regs)
+{
+       const int irqs_were_on = (regs->eflags & X86_EFLAGS_IF) != 0;
+
+       preempt_disable();
+       if (irqs_were_on)
+               local_irq_enable();
+}
+
+/*
+ * Undo preempt_conditional_sti(): mask interrupts again if the saved flags
+ * had them enabled, then drop the preempt count.
+ */
+static inline void preempt_conditional_cli(struct pt_regs *regs)
+{
+       const int irqs_were_on = (regs->eflags & X86_EFLAGS_IF) != 0;
+
+       if (irqs_were_on)
+               local_irq_disable();
+
+       /*
+        * Use the no-resched variant: we could be running on an exception
+        * stack, where scheduling must not happen.
+        */
+       preempt_enable_no_resched();
+}
+
+/* Upper bound on the number of raw stack words _show_stack() prints. */
+int kstack_depth_to_print = 12;
+
+#ifdef CONFIG_KALLSYMS
+/*
+ * Print one backtrace line for 'address': the raw value and, when kallsyms
+ * resolves it, the module name (if any), symbol name, offset and size.
+ */
+void printk_address(unsigned long address)
+{
+       char namebuf[128];
+       char *modname;
+       char *sep = ":";
+       unsigned long symsize;
+       unsigned long offset = 0;
+       const char *symname;
+
+       symname = kallsyms_lookup(address, &symsize, &offset,
+                                 &modname, namebuf);
+       if (symname == NULL) {
+               printk(" [<%016lx>]\n", address);
+               return;
+       }
+
+       /* No module name reported: drop the ':' separators as well. */
+       if (modname == NULL) {
+               sep = "";
+               modname = sep;
+       }
+
+       printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
+               address, sep, modname, sep, symname, offset, symsize);
+}
+#else
+/* Without kallsyms only the raw address can be printed. */
+void printk_address(unsigned long address)
+{
+       printk(" [<%016lx>]\n", address);
+}
+#endif
+
+/*
+ * Determine whether 'stack' lies inside one of this CPU's exception stacks.
+ *
+ * cpu:   CPU whose per-cpu orig_ist entries are searched
+ * stack: address to classify
+ * usedp: bitmask of stacks already visited; the bit of a matching stack is
+ *        set here, and a second hit on the same stack yields NULL
+ * idp:   on a match, receives a short name for the stack (e.g. "#DB")
+ *
+ * Returns the end address of the matching stack (or, for an oversized
+ * debug stack, of the matching EXCEPTION_STKSZ-sized chunk), or NULL when
+ * 'stack' is in none of them or the match was already visited.
+ */
+static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
+                                       unsigned *usedp, char **idp)
+{
+       /* Names indexed by IST slot; "#DB[?]" entries are patched below. */
+       static char ids[][8] = {
+               [DEBUG_STACK - 1] = "#DB",
+               [NMI_STACK - 1] = "NMI",
+               [DOUBLEFAULT_STACK - 1] = "#DF",
+               [STACKFAULT_STACK - 1] = "#SS",
+               [MCE_STACK - 1] = "#MC",
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+               [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+#endif
+       };
+       unsigned k;
+
+       /*
+        * Iterate over all exception stacks, and figure out whether
+        * 'stack' is in one of them:
+        */
+       for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+               unsigned long end = per_cpu(orig_ist, cpu).ist[k];
+               /*
+                * Is 'stack' above this exception frame's end?
+                * If yes then skip to the next frame.
+                */
+               if (stack >= end)
+                       continue;
+               /*
+                * Is 'stack' above this exception frame's start address?
+                * If yes then we found the right frame.
+                */
+               if (stack >= end - EXCEPTION_STKSZ) {
+                       /*
+                        * Make sure we only iterate through an exception
+                        * stack once. If it comes up for the second time
+                        * then there's something wrong going on - just
+                        * break out and return NULL:
+                        */
+                       if (*usedp & (1U << k))
+                               break;
+                       *usedp |= 1U << k;
+                       *idp = ids[k];
+                       return (unsigned long *)end;
+               }
+               /*
+                * If this is a debug stack, and if it has a larger size than
+                * the usual exception stacks, then 'stack' might still
+                * be within the lower portion of the debug stack:
+                */
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+               if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
+                       unsigned j = N_EXCEPTION_STACKS - 1;
+
+                       /*
+                        * Black magic. A large debug stack is composed of
+                        * multiple exception stack entries, which we
+                        * iterate through now. Don't look:
+                        *
+                        * Each step moves 'end' down by one EXCEPTION_STKSZ
+                        * chunk, advances j to the matching extra ids[]
+                        * slot and writes the chunk number into the '?'
+                        * position of its "#DB[?]" name.
+                        */
+                       do {
+                               ++j;
+                               end -= EXCEPTION_STKSZ;
+                               ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
+                       } while (stack < end - EXCEPTION_STKSZ);
+                       if (*usedp & (1U << j))
+                               break;
+                       *usedp |= 1U << j;
+                       *idp = ids[j];
+                       return (unsigned long *)end;
+               }
+#endif
+       }
+       return NULL;
+}
+
+/* Emit 'txt' through the warning callback; expects 'ops' and 'data' in scope. */
+#define MSG(txt) ops->warning(data, txt)
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+/*
+ * Return non-zero when 'p' lies strictly above the thread_info at 'tinfo'
+ * and below tinfo + THREAD_SIZE - 3.
+ */
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+{
+       void *base = (void *)tinfo;
+       void *limit = base + THREAD_SIZE - 3;
+
+       if (p <= base)
+               return 0;
+       return p < limit;
+}
+
+/*
+ * Walk the kernel stack(s) starting at 'stack' and report every word that
+ * looks like a kernel text address through ops->address().
+ *
+ * tsk:   task whose stack is walked; NULL means current
+ * regs:  not used by this function
+ * stack: starting address; when NULL the walk starts at a local variable
+ *        of this function, or at tsk->thread.rsp for a task other than
+ *        current
+ * ops:   callbacks; ->stack() announces a stack switch, ->address()
+ *        receives each candidate return address
+ * data:  opaque cookie handed back to every callback
+ *
+ * Exception stacks and the IRQ stack are handled first, following the
+ * links stored near their ends, and finally the process stack of 'tsk'.
+ * A negative return from ops->stack() stops the exception/IRQ stack walk;
+ * the process stack is still processed afterwards.
+ */
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
+               unsigned long *stack,
+               struct stacktrace_ops *ops, void *data)
+{
+       /* get_cpu() is balanced by the put_cpu() at the end. */
+       const unsigned cpu = get_cpu();
+       unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
+       unsigned used = 0;
+       struct thread_info *tinfo;
+
+       if (!tsk)
+               tsk = current;
+
+       if (!stack) {
+               unsigned long dummy;
+               stack = &dummy;
+               if (tsk && tsk != current)
+                       stack = (unsigned long *)tsk->thread.rsp;
+       }
+
+       /*
+        * Print function call entries within a stack. 'cond' is the
+        * "end of stackframe" condition, that the 'stack++'
+        * iteration will eventually trigger.
+        */
+#define HANDLE_STACK(cond) \
+       do while (cond) { \
+               unsigned long addr = *stack++; \
+               /* Use unlocked access here because except for NMIs     \
+                  we should be already protected against module unloads */ \
+               if (__kernel_text_address(addr)) { \
+                       /* \
+                        * If the address is either in the text segment of the \
+                        * kernel, or in the region which contains vmalloc'ed \
+                        * memory, it *may* be the address of a calling \
+                        * routine; if so, print it so that someone tracing \
+                        * down the cause of the crash will be able to figure \
+                        * out the call path that was taken. \
+                        */ \
+                       ops->address(data, addr);   \
+               } \
+       } while (0)
+
+       /*
+        * Print function call entries in all stacks, starting at the
+        * current stack address. If the stacks consist of nested
+        * exceptions, follow the link stored near the end of each one
+        * to the stack that was interrupted.
+        */
+       for (;;) {
+               char *id;
+               unsigned long *estack_end;
+               estack_end = in_exception_stack(cpu, (unsigned long)stack,
+                                               &used, &id);
+
+               if (estack_end) {
+                       if (ops->stack(data, id) < 0)
+                               break;
+                       HANDLE_STACK (stack < estack_end);
+                       ops->stack(data, "<EOE>");
+                       /*
+                        * We link to the next stack via the
+                        * second-to-last pointer (index -2 to end) in the
+                        * exception stack:
+                        */
+                       stack = (unsigned long *) estack_end[-2];
+                       continue;
+               }
+               if (irqstack_end) {
+                       unsigned long *irqstack;
+                       irqstack = irqstack_end -
+                               (IRQSTACKSIZE - 64) / sizeof(*irqstack);
+
+                       if (stack >= irqstack && stack < irqstack_end) {
+                               if (ops->stack(data, "IRQ") < 0)
+                                       break;
+                               HANDLE_STACK (stack < irqstack_end);
+                               /*
+                                * We link to the next stack (which would be
+                                * the process stack normally) via the last
+                                * pointer (index -1 to end) in the IRQ stack:
+                                */
+                               stack = (unsigned long *) (irqstack_end[-1]);
+                               /* Clearing this ensures the IRQ stack is walked once. */
+                               irqstack_end = NULL;
+                               ops->stack(data, "EOI");
+                               continue;
+                       }
+               }
+               break;
+       }
+
+       /*
+        * This handles the process stack:
+        */
+       tinfo = task_thread_info(tsk);
+       HANDLE_STACK (valid_stack_ptr(tinfo, stack));
+#undef HANDLE_STACK
+       put_cpu();
+}
+EXPORT_SYMBOL(dump_trace);
+
+/* stacktrace_ops callback: print 'msg' via print_symbol() with 'symbol', then a newline. */
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+       print_symbol(msg, symbol);
+       printk("\n");
+}
+
+/* stacktrace_ops callback: print 'msg' on a line of its own. */
+static void print_trace_warning(void *data, char *msg)
+{
+       printk("%s\n", msg);
+}
+
+/* stacktrace_ops callback: print a " <name> " stack marker; always returns 0. */
+static int print_trace_stack(void *data, char *name)
+{
+       printk(" <%s> ", name);
+       return 0;
+}
+
+/* stacktrace_ops callback: print one address, touching the NMI watchdog first. */
+static void print_trace_address(void *data, unsigned long addr)
+{
+       touch_nmi_watchdog();
+       printk_address(addr);
+}
+
+/* Callback set that show_trace() passes to dump_trace() to print via printk. */
+static struct stacktrace_ops print_trace_ops = {
+       .warning = print_trace_warning,
+       .warning_symbol = print_trace_warning_symbol,
+       .stack = print_trace_stack,
+       .address = print_trace_address,
+};
+
+/*
+ * Print a "Call Trace:" header followed by the backtrace that dump_trace()
+ * produces for (tsk, regs, stack) using the printk-based callbacks.
+ */
+void
+show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
+{
+       printk("\nCall Trace:\n");
+       dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
+       printk("\n");
+}
+
+/*
+ * Dump up to kstack_depth_to_print raw stack words starting at 'rsp',
+ * four per line, then print the call trace via show_trace().
+ *
+ * A NULL 'rsp' selects tsk->thread.rsp when a task is given, otherwise the
+ * address of this function's own 'rsp' argument. As a debugging aid,
+ * "show_stack(NULL, NULL);" prints the back trace for this cpu.
+ */
+static void
+_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
+{
+       const int cpu = smp_processor_id();
+       unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+       unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+       unsigned long *stack;
+       int i;
+
+       if (rsp == NULL)
+               rsp = tsk ? (unsigned long *)tsk->thread.rsp
+                         : (unsigned long *)&rsp;
+
+       stack = rsp;
+       for (i = 0; i < kstack_depth_to_print; i++) {
+               const int on_irqstack = stack >= irqstack &&
+                                       stack <= irqstack_end;
+
+               if (!on_irqstack) {
+                       /* Stop at a THREAD_SIZE-aligned address. */
+                       if (((long) stack & (THREAD_SIZE-1)) == 0)
+                               break;
+               } else if (stack == irqstack_end) {
+                       /* End of the IRQ stack: follow its last word. */
+                       stack = (unsigned long *) (irqstack_end[-1]);
+                       printk(" <EOI> ");
+               }
+
+               if (i != 0 && (i % 4) == 0)
+                       printk("\n");
+               printk(" %016lx", *stack);
+               stack++;
+               touch_nmi_watchdog();
+       }
+       show_trace(tsk, regs, rsp);
+}
+
+/* Dump the raw stack words and call trace for 'tsk' from 'rsp', with no register frame. */
+void show_stack(struct task_struct *tsk, unsigned long * rsp)
+{
+       _show_stack(tsk, NULL, rsp);
+}
+
+/*
+ * The architecture-independent dump_stack generator.
+ *
+ * Prints a call trace that starts at the address of a local variable of
+ * this function.
+ */
+void dump_stack(void)
+{
+       unsigned long dummy;
+       show_trace(NULL, NULL, &dummy);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+/*
+ * Print the CPU number, the register frame 'regs' and the identity of the
+ * task recorded in this CPU's PDA. For a frame that is not from user mode,
+ * also dump the stack and up to 20 code bytes at the faulting RIP.
+ */
+void show_registers(struct pt_regs *regs)
+{
+       const int from_user = user_mode(regs);
+       const unsigned long rsp = regs->rsp;
+       const int cpu = smp_processor_id();
+       struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+       int i;
+
+       printk("CPU %d ", cpu);
+       __show_regs(regs);
+       printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
+               cur->comm, cur->pid, task_thread_info(cur), cur);
+
+       /*
+        * When in-kernel, we also print out the stack and code at the
+        * time of the fault..
+        */
+       if (!from_user) {
+               printk("Stack: ");
+               _show_stack(NULL, regs, (unsigned long*)rsp);
+
+               printk("\nCode: ");
+               if (regs->rip < PAGE_OFFSET) {
+                       /* RIP below PAGE_OFFSET: do not try to read it. */
+                       printk(" Bad RIP value.");
+               } else {
+                       for (i = 0; i < 20; i++) {
+                               unsigned char c;
+
+                               if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
+                                       printk(" Bad RIP value.");
+                                       break;
+                               }
+                               printk("%02x ", c);
+                       }
+               }
+       }
+       printk("\n");
+}
+
+/*
+ * Return non-zero when the two bytes at 'rip' can be read and equal
+ * 0x0b0f (named 'ud2' in the original code); return 0 otherwise.
+ */
+int is_valid_bugaddr(unsigned long rip)
+{
+       unsigned short insn;
+
+       if (__copy_from_user(&insn, (const void __user *) rip, sizeof(insn)) != 0)
+               return 0;
+
+       return insn == 0x0b0f ? 1 : 0;
+}
+
+#ifdef CONFIG_BUG
+/* Exported function whose whole body is a single BUG(). */
+void out_of_line_bug(void)
+{ 
+       BUG(); 
+} 
+EXPORT_SYMBOL(out_of_line_bug);
+#endif
+
+/* Serialises oops output; die_owner is the CPU holding it (-1: none). */
+static DEFINE_SPINLOCK(die_lock);
+static int die_owner = -1;
+/* Nesting depth of oops_begin() calls not yet matched by oops_end(). */
+static unsigned int die_nest_count;
+
+/*
+ * Enter oops reporting: disable local interrupts, take die_lock unless this
+ * CPU already owns it (nested oops), and switch the console to verbose.
+ * Returns the saved interrupt flags, to be handed to oops_end().
+ */
+unsigned __kprobes long oops_begin(void)
+{
+       unsigned long flags;
+       int cpu;
+
+       oops_enter();
+
+       /* racy, but better than risking deadlock. */
+       local_irq_save(flags);
+       cpu = smp_processor_id();
+
+       /*
+        * Only wait for the lock when another CPU holds it; a nested oops
+        * on the owning CPU carries on without it and should stop eventually.
+        */
+       if (!spin_trylock(&die_lock) && cpu != die_owner)
+               spin_lock(&die_lock);
+
+       die_nest_count++;
+       die_owner = cpu;
+       console_verbose();
+       bust_spinlocks(1);
+       return flags;
+}
+
+/*
+ * Leave oops reporting: drop one nesting level, releasing die_lock when the
+ * outermost level is left, restore the interrupt flags saved by
+ * oops_begin(), and panic if panic_on_oops is set.
+ */
+void __kprobes oops_end(unsigned long flags)
+{
+       die_owner = -1;
+       bust_spinlocks(0);
+
+       if (--die_nest_count == 0) {
+               /* Nest count reaches zero, release the lock. */
+               spin_unlock_irqrestore(&die_lock, flags);
+       } else {
+               /* We still own the lock */
+               local_irq_restore(flags);
+       }
+
+       if (panic_on_oops)
+               panic("Fatal exception");
+       oops_exit();
+}
+
+/*
+ * Print the oops report proper: a header with 'str', the low 16 bits of
+ * 'err' and a running oops counter, the kernel configuration flags, the
+ * registers, and a short RIP/RSP summary. Notifies the die chain, taints
+ * the kernel and hands over to crash_kexec() when kexec_should_crash()
+ * says so.
+ */
+void __kprobes __die(const char * str, struct pt_regs * regs, long err)
+{
+       static int die_counter;
+
+       ++die_counter;
+       printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, die_counter);
+#ifdef CONFIG_PREEMPT
+       printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+       printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       printk("DEBUG_PAGEALLOC");
+#endif
+       printk("\n");
+
+       notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
+       show_registers(regs);
+       add_taint(TAINT_DIE);
+
+       /* Executive summary in case the oops scrolled away */
+       printk(KERN_ALERT "RIP ");
+       printk_address(regs->rip);
+       printk(" RSP <%016lx>\n", regs->rsp);
+
+       if (kexec_should_crash(current))
+               crash_kexec(regs);
+}
+
+/*
+ * Report a fatal trap and terminate the current task with SIGSEGV.
+ *
+ * str:  description printed in the oops header
+ * regs: register frame of the faulting context
+ * err:  error code printed in the oops header
+ *
+ * For a frame that is not from user mode, report_bug() is called on the
+ * faulting RIP before the oops is printed.
+ */
+void die(const char * str, struct pt_regs * regs, long err)
+{
+       unsigned long flags = oops_begin();
+
+       if (!user_mode(regs))
+               report_bug(regs->rip, regs);
+
+       __die(str, regs, err);
+       oops_end(flags);
+       do_exit(SIGSEGV); 
+}
+
+/*
+ * Report a fatal condition detected from NMI context.
+ *
+ * str:      passed to printk() as the format string, with the current CPU
+ *           number as its only argument
+ * regs:     register frame to print
+ * do_panic: non-zero forces a panic (panic_on_oops does so as well)
+ *
+ * When no panic is requested, NMI context is left and the current task is
+ * terminated with SIGSEGV.
+ *
+ * NOTE(review): because 'str' is used as a format string, callers must
+ * only pass trusted text with at most one integer conversion -- confirm
+ * against the callers.
+ */
+void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+       unsigned long flags = oops_begin();
+
+       /*
+        * We are in trouble anyway, lets at least try
+        * to get a message out.
+        */
+       printk(str, smp_processor_id());
+       show_registers(regs);
+       if (kexec_should_crash(current))
+               crash_kexec(regs);
+       if (do_panic || panic_on_oops)
+               panic("Non maskable interrupt");
+       oops_end(flags);
+       nmi_exit();
+       local_irq_enable();
+       do_exit(SIGSEGV);
+}
+
+/*
+ * Common tail of the simple trap handlers.
+ *
+ * trapnr:     trap number recorded in the task's thread state
+ * signr:      signal sent for a user-mode trap
+ * str:        description used in log messages and in die()
+ * regs:       register frame of the trapping context
+ * error_code: error code recorded in the task's thread state
+ * info:       optional siginfo for the signal; NULL sends a plain signal
+ *
+ * User-mode traps raise 'signr' on the current task. Kernel-mode traps are
+ * redirected to an exception-table fixup when one exists and are fatal
+ * (die()) otherwise.
+ */
+static void __kprobes do_trap(int trapnr, int signr, char *str,
+                             struct pt_regs * regs, long error_code,
+                             siginfo_t *info)
+{
+       struct task_struct *tsk = current;
+       const struct exception_table_entry *fixup;
+
+       if (user_mode(regs)) {
+               /*
+                * We want error_code and trap_no set for userspace
+                * faults and kernelspace faults which result in
+                * die(), but not kernelspace faults which are fixed
+                * up.  die() gives the process no chance to handle
+                * the signal and notice the kernel fault information,
+                * so that won't result in polluting the information
+                * about previously queued, but not yet delivered,
+                * faults.  See also do_general_protection below.
+                */
+               tsk->thread.error_code = error_code;
+               tsk->thread.trap_no = trapnr;
+
+               if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+                   printk_ratelimit())
+                       printk(KERN_INFO
+                              "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
+                              tsk->comm, tsk->pid, str,
+                              regs->rip, regs->rsp, error_code);
+
+               if (info == NULL)
+                       force_sig(signr, tsk);
+               else
+                       force_sig_info(signr, info, tsk);
+               return;
+       }
+
+       /* kernel trap */
+       fixup = search_exception_tables(regs->rip);
+       if (fixup) {
+               regs->rip = fixup->fixup;
+               return;
+       }
+
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = trapnr;
+       die(str, regs, error_code);
+}
+
+/*
+ * Generate do_<name>(): notify the die chain (a NOTIFY_STOP result ends
+ * handling), conditionally re-enable interrupts, then hand the trap to
+ * do_trap() without siginfo.
+ */
+#define DO_ERROR(trapnr, signr, str, name) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                                       == NOTIFY_STOP) \
+               return; \
+       conditional_sti(regs);                                          \
+       do_trap(trapnr, signr, str, regs, error_code, NULL); \
+}
+
+/*
+ * Like DO_ERROR, but the generated handler also fills in a siginfo_t
+ * (si_signo = signr, si_errno = 0, si_code = sicode, si_addr = siaddr)
+ * and passes it to do_trap(). 'siaddr' is evaluated inside the handler,
+ * so it may refer to 'regs'.
+ */
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       siginfo_t info; \
+       info.si_signo = signr; \
+       info.si_errno = 0; \
+       info.si_code = sicode; \
+       info.si_addr = (void __user *)siaddr; \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                                       == NOTIFY_STOP) \
+               return; \
+       conditional_sti(regs);                                          \
+       do_trap(trapnr, signr, str, regs, error_code, &info); \
+}
+
+/*
+ * Handlers generated from the two macros above. Arguments are: trap
+ * number, signal, description, handler name suffix and, for
+ * DO_ERROR_INFO, the si_code and si_addr to report.
+ */
+DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
+DO_ERROR( 4, SIGSEGV, "overflow", overflow)
+DO_ERROR( 5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
+DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
+DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR(18, SIGSEGV, "reserved", reserved)
+
+/*
+ * Stack segment fault (trap 12) handler. Runs on IST stack.
+ *
+ * Same flow as the DO_ERROR handlers, except that preemption is held off
+ * while interrupts are conditionally enabled around do_trap().
+ */
+asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
+{
+       if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
+                       12, SIGBUS) == NOTIFY_STOP)
+               return;
+       preempt_conditional_sti(regs);
+       do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
+       preempt_conditional_cli(regs);
+}
+
+/*
+ * Double fault (trap 8) handler: record the trap in the current task's
+ * thread state and die. Never returns.
+ */
+asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
+{
+       static const char str[] = "double fault";
+       struct task_struct *tsk = current;
+
+       /* Return not checked because a double fault cannot be ignored */
+       notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = 8;
+
+       /* This is always a kernel trap and never fixable (and thus must
+          never return). */
+       for (;;)
+               die(str, regs, error_code);
+}
+
+/*
+ * General protection fault (trap 13) handler.
+ *
+ * A user-mode fault sends SIGSEGV to the current task. A kernel-mode fault
+ * is redirected to an exception-table fixup when one exists; otherwise the
+ * die chain is notified and, unless it answers NOTIFY_STOP, the kernel
+ * dies.
+ */
+asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
+                                               long error_code)
+{
+       struct task_struct *tsk = current;
+       const struct exception_table_entry *fixup;
+
+       conditional_sti(regs);
+
+       if (user_mode(regs)) {
+               tsk->thread.error_code = error_code;
+               tsk->thread.trap_no = 13;
+
+               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+                   printk_ratelimit())
+                       printk(KERN_INFO
+                      "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
+                              tsk->comm, tsk->pid,
+                              regs->rip, regs->rsp, error_code);
+
+               force_sig(SIGSEGV, tsk);
+               return;
+       }
+
+       /* kernel gp */
+       fixup = search_exception_tables(regs->rip);
+       if (fixup) {
+               regs->rip = fixup->fixup;
+               return;
+       }
+
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = 13;
+       if (notify_die(DIE_GPF, "general protection fault", regs,
+                               error_code, 13, SIGSEGV) != NOTIFY_STOP)
+               die("general protection fault", regs, error_code);
+}
+
+/*
+ * Handle an NMI whose reason byte has bit 0x80 set (see default_do_nmi()).
+ *
+ * Logs the event, defers to EDAC when a handler is registered, panics if
+ * panic_on_unrecovered_nmi is set, and otherwise writes the low nibble of
+ * 'reason' with bit 2 set back to port 0x61.
+ */
+static __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs * regs)
+{
+       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+               reason);
+       printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
+
+#if defined(CONFIG_EDAC)
+       if(edac_handler_set()) {
+               edac_atomic_assert_error();
+               return;
+       }
+#endif
+
+       if (panic_on_unrecovered_nmi)
+               panic("NMI: Not continuing");
+
+       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+
+       /* Clear and disable the memory parity error line. */
+       outb((reason & 0xf) | 4, 0x61);
+}
+
+/*
+ * Handle an NMI whose reason byte has bit 0x40 set (see default_do_nmi()).
+ *
+ * Prints the registers, then writes the low bits of 'reason' to port 0x61
+ * twice: first with bit 3 set, and two seconds later with bit 3 clear.
+ */
+static __kprobes void
+io_check_error(unsigned char reason, struct pt_regs * regs)
+{
+       /* Bits 0-2 are written back unchanged; bit 3 is toggled below. */
+       const unsigned char low = reason & 0x7;
+
+       printk("NMI: IOCK error (debug interrupt?)\n");
+       show_registers(regs);
+
+       /* Re-enable the IOCK line, wait for a few seconds */
+       outb(low | 8, 0x61);
+       mdelay(2000);
+       outb(low, 0x61);
+}
+
+/*
+ * Report an NMI that nothing claimed: log the reason byte, then panic if
+ * panic_on_unrecovered_nmi is set, or log again and carry on.
+ * 'regs' is not used.
+ */
+static __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+{
+       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+               reason);
+       printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+
+       if (panic_on_unrecovered_nmi)
+               panic("NMI: Not continuing");
+
+       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+}
+
+/* Runs on IST stack. This code must keep interrupts off all the time.
+   Nested NMIs are prevented by the CPU. */
+/*
+ * Default NMI dispatcher.
+ *
+ * The reason byte is only read on CPU 0. When neither 0x80 nor 0x40 is
+ * set in it, the NMI is offered in turn to the die chain, the NMI watchdog
+ * and the NMI callback, and reported as unknown if none of them takes it.
+ * Otherwise the die chain is notified and the parity / IO-check handlers
+ * run for the bits that are set.
+ */
+asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
+{
+       const int cpu = smp_processor_id();
+       unsigned char reason = 0;
+
+       /* Only the BSP gets external NMIs from the system.  */
+       if (cpu == 0)
+               reason = get_nmi_reason();
+
+       if ((reason & 0xc0) == 0) {
+               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
+                                                               == NOTIFY_STOP)
+                       return;
+               /*
+                * Ok, so this is none of the documented NMI sources,
+                * so it must be the NMI watchdog.
+                */
+               if (nmi_watchdog_tick(regs, reason))
+                       return;
+               if (do_nmi_callback(regs, cpu))
+                       return;
+
+               unknown_nmi_error(reason, regs);
+               return;
+       }
+
+       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+               return;
+
+       /* AK: following checks seem to be broken on modern chipsets. FIXME */
+
+       if (reason & 0x80)
+               mem_parity_error(reason, regs);
+       if (reason & 0x40)
+               io_check_error(reason, regs);
+}
+
+/*
+ * Breakpoint (int3, trap 3) handler. Runs on IST stack.
+ *
+ * The die chain gets the first look; if it does not answer NOTIFY_STOP,
+ * do_trap() is run with SIGTRAP while preemption is held off and
+ * interrupts are conditionally enabled.
+ */
+asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
+{
+       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
+               return;
+       }
+       preempt_conditional_sti(regs);
+       do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
+       preempt_conditional_cli(regs);
+}
+
+/* Help handler running on IST stack to switch back to user stack
+   for scheduling or signal handling. The actual stack switch is done in
+   entry.S.
+
+   Returns where the register frame should live and copies *eregs there
+   when that differs from eregs:
+     - eregs itself, if eregs->rsp already points at it or none of the
+       other cases applies;
+     - task_pt_regs(current) for an exception from user space;
+     - a frame carved out just below eregs->rsp (which is lowered by
+       sizeof(struct pt_regs)) for a kernel exception with interrupts
+       enabled. */
+asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+{
+       struct pt_regs *regs = eregs;
+       /* Did already sync */
+       if (eregs == (struct pt_regs *)eregs->rsp)
+               ;
+       /* Exception from user space */
+       else if (user_mode(eregs))
+               regs = task_pt_regs(current);
+       /* Exception from kernel and interrupts are enabled. Move to
+          kernel process stack. */
+       else if (eregs->eflags & X86_EFLAGS_IF)
+               regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
+       if (eregs != regs)
+               *regs = *eregs;
+       return regs;
+}
+
+/*
+ * Debug exception (trap 1) handler. Runs on IST stack.
+ *
+ * Reads debug register 6 and, unless the die chain answers NOTIFY_STOP:
+ *   - a breakpoint hit (DR_TRAP0..3) while the task has no debugreg7 set
+ *     only clears debug register 7;
+ *   - a single-step trap taken outside user mode clears TF in the saved
+ *     flags and sets TIF_SINGLESTEP on the task;
+ *   - anything else sends SIGTRAP (TRAP_BRKPT) to the current task and
+ *     then clears debug register 7.
+ *
+ * NOTE(review): notify_die() is given 'condition' and 'error_code' in the
+ * positions where the other handlers in this file pass the error code and
+ * the trap number -- confirm this is intended.
+ */
+asmlinkage void __kprobes do_debug(struct pt_regs * regs,
+                                  unsigned long error_code)
+{
+       unsigned long condition;
+       struct task_struct *tsk = current;
+       siginfo_t info;
+
+       get_debugreg(condition, 6);
+
+       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+                                               SIGTRAP) == NOTIFY_STOP)
+               return;
+
+       preempt_conditional_sti(regs);
+
+       /* Mask out spurious debug traps due to lazy DR7 setting */
+       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
+               if (!tsk->thread.debugreg7) { 
+                       goto clear_dr7;
+               }
+       }
+
+       tsk->thread.debugreg6 = condition;
+
+       /* Mask out spurious TF errors due to lazy TF clearing */
+       if (condition & DR_STEP) {
+               /*
+                * The TF error should be masked out only if the current
+                * process is not traced and if the TRAP flag has been set
+                * previously by a tracing process (condition detected by
+                * the PT_DTRACE flag); remember that the i386 TRAP flag
+                * can be modified by the process itself in user mode,
+                * allowing programs to debug themselves without the ptrace()
+                * interface.
+                */
+                if (!user_mode(regs))
+                       goto clear_TF_reenable;
+               /*
+                * Was the TF flag set by a debugger? If so, clear it now,
+                * so that register information is correct.
+                */
+               if (tsk->ptrace & PT_DTRACE) {
+                       regs->eflags &= ~TF_MASK;
+                       tsk->ptrace &= ~PT_DTRACE;
+               }
+       }
+
+       /* Ok, finally something we can handle */
+       tsk->thread.trap_no = 1;
+       tsk->thread.error_code = error_code;
+       info.si_signo = SIGTRAP;
+       info.si_errno = 0;
+       info.si_code = TRAP_BRKPT;
+       info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
+       force_sig_info(SIGTRAP, &info, tsk);
+
+       /* Reached by fall-through after the signal as well as by goto. */
+clear_dr7:
+       set_debugreg(0UL, 7);
+       preempt_conditional_cli(regs);
+       return;
+
+clear_TF_reenable:
+       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+       regs->eflags &= ~TF_MASK;
+       preempt_conditional_cli(regs);
+}
+
+/*
+ * Handle a math exception raised outside user mode.
+ *
+ * Returns 1 after redirecting regs->rip to an exception-table fixup.
+ * Without a fixup the die chain is notified, 'trapnr' is recorded in the
+ * current task and die() is called; the trailing return value is 0.
+ */
+static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
+{
+       const struct exception_table_entry *fixup =
+               search_exception_tables(regs->rip);
+
+       if (!fixup) {
+               notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
+               /* Illegal floating point operation in the kernel */
+               current->thread.trap_no = trapnr;
+               die(str, regs, 0);
+               return 0;
+       }
+
+       regs->rip = fixup->fixup;
+       return 1;
+}
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+/*
+ * x87 floating point error (trap 16) handler.
+ *
+ * A kernel-mode error is passed to kernel_math_error() first. Otherwise
+ * the FPU state is saved, the unmasked pending exception is read from the
+ * status and control words, and SIGFPE is sent to the current task with a
+ * matching si_code (__SI_FAULT when no single exception bit is pending).
+ */
+asmlinkage void do_coprocessor_error(struct pt_regs *regs)
+{
+       void __user *rip = (void __user *)(regs->rip);
+       struct task_struct *task;
+       unsigned short cwd, swd;
+       siginfo_t info;
+
+       conditional_sti(regs);
+       if (!user_mode(regs)) {
+               if (kernel_math_error(regs, "kernel x87 math error", 16))
+                       return;
+       }
+
+       /*
+        * Save the info for the exception handler and clear the error.
+        */
+       task = current;
+       save_init_fpu(task);
+       task->thread.trap_no = 16;
+       task->thread.error_code = 0;
+
+       info.si_signo = SIGFPE;
+       info.si_errno = 0;
+       info.si_code = __SI_FAULT;
+       info.si_addr = rip;
+
+       /*
+        * (~cwd & swd) will mask out exceptions that are not set to unmasked
+        * status.  0x3f is the exception bits in these regs, 0x200 is the
+        * C1 reg you need in case of a stack fault, 0x040 is the stack
+        * fault bit.  We should only be taking one exception at a time,
+        * so if this combination doesn't produce any single exception,
+        * then we have a bad program that isn't synchronizing its FPU usage
+        * and it will suffer the consequences since we won't be able to
+        * fully reproduce the context of the exception
+        */
+       cwd = get_fpu_cwd(task);
+       swd = get_fpu_swd(task);
+       switch (swd & ~cwd & 0x3f) {
+       case 0x001: /* Invalid Op */
+               /*
+                * swd & 0x240 == 0x040: Stack Underflow
+                * swd & 0x240 == 0x240: Stack Overflow
+                * User must clear the SF bit (0x40) if set
+                */
+               info.si_code = FPE_FLTINV;
+               break;
+       case 0x004: /* Zero Divide */
+               info.si_code = FPE_FLTDIV;
+               break;
+       case 0x008: /* Overflow */
+               info.si_code = FPE_FLTOVF;
+               break;
+       case 0x002: /* Denormalize */
+       case 0x010: /* Underflow */
+               info.si_code = FPE_FLTUND;
+               break;
+       case 0x020: /* Precision */
+               info.si_code = FPE_FLTRES;
+               break;
+       case 0x000:
+       default:
+               /* None, or more than one, pending: keep __SI_FAULT. */
+               break;
+       }
+       force_sig_info(SIGFPE, &info, task);
+}
+
+/*
+ * Log that a bad interrupt was seen.
+ *
+ * The message carries an explicit log level and a terminating newline so
+ * that it is emitted as a complete line instead of running into whatever
+ * is printed next.
+ */
+asmlinkage void bad_intr(void)
+{
+       printk(KERN_ERR "bad interrupt\n");
+}
+
+/*
+ * Handler for the SIMD floating point exception (recorded as trap number
+ * 19 in the thread below).
+ *
+ * A fault taken outside user mode is first passed to kernel_math_error();
+ * if that applies a fixup the handler returns at once.  Otherwise the FPU
+ * state is saved, the pending unmasked exception found in MXCSR is
+ * translated into an si_code and SIGFPE is forced on the current task
+ * with si_addr set to the faulting rip.
+ */
+asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
+{
+       void __user *rip = (void __user *)(regs->rip);
+       struct task_struct * task;
+       siginfo_t info;
+       /*
+        * Only the low 16 bits are kept; that covers both the flag bits
+        * (0x3f) and the mask bits (0x1f80) examined below.
+        */
+       unsigned short mxcsr;
+
+       /*
+        * NOTE(review): presumably re-enables interrupts when the
+        * interrupted context had them enabled - helper not visible here.
+        */
+       conditional_sti(regs);
+       if (!user_mode(regs) &&
+               kernel_math_error(regs, "kernel simd math error", 19))
+               return;
+
+       /*
+        * Save the info for the exception handler and clear the error.
+        */
+       task = current;
+       save_init_fpu(task);
+       task->thread.trap_no = 19;
+       task->thread.error_code = 0;
+       info.si_signo = SIGFPE;
+       info.si_errno = 0;
+       /* Default code, kept when no single exception can be identified */
+       info.si_code = __SI_FAULT;
+       info.si_addr = rip;
+       /*
+        * The SIMD FPU exceptions are handled a little differently, as there
+        * is only a single status/control register.  Thus, to determine which
+        * unmasked exception was caught we must mask the exception mask bits
+        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+        */
+       mxcsr = get_fpu_mxcsr(task);
+       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
+               /* Nothing pending, or several bits at once: keep the default */
+               case 0x000:
+               default:
+                       break;
+               case 0x001: /* Invalid Op */
+                       info.si_code = FPE_FLTINV;
+                       break;
+               case 0x002: /* Denormalize */
+               case 0x010: /* Underflow */
+                       info.si_code = FPE_FLTUND;
+                       break;
+               case 0x004: /* Zero Divide */
+                       info.si_code = FPE_FLTDIV;
+                       break;
+               case 0x008: /* Overflow */
+                       info.si_code = FPE_FLTOVF;
+                       break;
+               case 0x020: /* Precision */
+                       info.si_code = FPE_FLTRES;
+                       break;
+       }
+       force_sig_info(SIGFPE, &info, task);
+}
+
+/*
+ * Intentionally empty handler: the event is ignored.
+ * NOTE(review): presumably reached through the spurious_interrupt_bug
+ * entry stub that trap_init() installs on vector 15 - confirm in the
+ * entry code.
+ */
+asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
+{
+}
+
+/*
+ * Do-nothing default.  The definition is weak, so a non-weak
+ * smp_thermal_interrupt() linked in from elsewhere takes precedence.
+ */
+asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+{
+}
+
+/*
+ * Do-nothing default.  The definition is weak, so a non-weak
+ * mce_threshold_interrupt() linked in from elsewhere takes precedence.
+ */
+asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+{
+}
+
+/*
+ * 'math_state_restore()' gets the FPU state of the current task back
+ * from me->thread.i387.fxsave.  It does not save anything itself: if the
+ * task has not used math before, its FPU state is first set up with
+ * init_fpu().  Afterwards the task is flagged TS_USEDFPU and its
+ * fpu_counter is incremented.
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+ * Don't touch unless you *really* know how it works.
+ */
+asmlinkage void math_state_restore(void)
+{
+       struct task_struct *me = current;
+       clts();                 /* Allow maths ops (or we recurse) */
+
+       /* First FPU use by this task: start from an initialised state */
+       if (!used_math())
+               init_fpu(me);
+       restore_fpu_checking(&me->thread.i387.fxsave);
+       task_thread_info(me)->status |= TS_USEDFPU;
+       me->fpu_counter++;
+}
+
+/*
+ * Install the gates for exception vectors 0-19 (plus the IA32 syscall
+ * vector when CONFIG_IA32_EMULATION is set) and then call cpu_init().
+ *
+ * The *_ist variants take an extra stack identifier (DEBUG_STACK,
+ * NMI_STACK, DOUBLEFAULT_STACK, STACKFAULT_STACK, MCE_STACK).
+ * NOTE(review): presumably these select a dedicated IST stack for the
+ * handler - the helpers are not visible here.
+ */
+void __init trap_init(void)
+{
+       set_intr_gate(0,&divide_error);
+       set_intr_gate_ist(1,&debug,DEBUG_STACK);
+       set_intr_gate_ist(2,&nmi,NMI_STACK);
+       set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
+       set_system_gate(4,&overflow);   /* int4 can be called from all */
+       set_intr_gate(5,&bounds);
+       set_intr_gate(6,&invalid_op);
+       set_intr_gate(7,&device_not_available);
+       set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
+       set_intr_gate(9,&coprocessor_segment_overrun);
+       set_intr_gate(10,&invalid_TSS);
+       set_intr_gate(11,&segment_not_present);
+       set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
+       set_intr_gate(13,&general_protection);
+       set_intr_gate(14,&page_fault);
+       set_intr_gate(15,&spurious_interrupt_bug);
+       set_intr_gate(16,&coprocessor_error);
+       set_intr_gate(17,&alignment_check);
+       /* Vector 18 is only populated when machine check support is built in */
+#ifdef CONFIG_X86_MCE
+       set_intr_gate_ist(18,&machine_check, MCE_STACK); 
+#endif
+       set_intr_gate(19,&simd_coprocessor_error);
+
+#ifdef CONFIG_IA32_EMULATION
+       set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+#endif
+       
+       /*
+        * Should be a barrier for any external CPU state.
+        */
+       cpu_init();
+}
+
+
+/*
+ * Early parameter "oops=": the value "panic" turns on panic_on_oops.
+ * Any other value is accepted and ignored; a missing value is rejected
+ * with -EINVAL.
+ */
+static int __init oops_setup(char *s)
+{
+       if (s == NULL)
+               return -EINVAL;
+
+       if (strcmp(s, "panic") == 0)
+               panic_on_oops = 1;
+
+       return 0;
+}
+early_param("oops", oops_setup);
+
+/*
+ * Early parameter "kstack=": sets kstack_depth_to_print from the given
+ * number.  A missing value is rejected with -EINVAL.
+ */
+static int __init kstack_setup(char *s)
+{
+       if (s == NULL)
+               return -EINVAL;
+
+       /* Base 0, no end pointer: the string is not validated further */
+       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
+       return 0;
+}
+early_param("kstack", kstack_setup);
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
new file mode 100644 (file)
index 0000000..2a59bde
--- /dev/null
@@ -0,0 +1,207 @@
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/clocksource.h>
+#include <linux/time.h>
+#include <linux/acpi.h>
+#include <linux/cpufreq.h>
+
+#include <asm/timex.h>
+
+/* Set to 1 by the "notsc" boot option; keeps the TSC clocksource unregistered */
+static int notsc __initdata = 0;
+
+unsigned int cpu_khz;          /* CPU clock in kHz (cycles / msec), not used here */
+EXPORT_SYMBOL(cpu_khz);
+/* TSC rate in kHz (cycles / msec); rescaled by the cpufreq notifier */
+unsigned int tsc_khz;
+EXPORT_SYMBOL(tsc_khz);
+
+/* Cycles -> nanoseconds factor, fixed point with NS_SCALE fractional bits */
+static unsigned int cyc2ns_scale __read_mostly;
+
+/*
+ * Recompute the cycles -> nanoseconds factor for a clock of @khz kHz.
+ * The factor is nanoseconds per cycle, scaled up by 2^NS_SCALE.
+ * @khz must be non-zero.
+ */
+void set_cyc2ns_scale(unsigned long khz)
+{
+       unsigned long nsec_per_msec_scaled = NSEC_PER_MSEC << NS_SCALE;
+
+       cyc2ns_scale = nsec_per_msec_scaled / khz;
+}
+
+/*
+ * Convert a cycle count to nanoseconds using the current fixed point
+ * factor: multiply by cyc2ns_scale, then drop the NS_SCALE fraction bits.
+ */
+static unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+       unsigned long long scaled = cyc * cyc2ns_scale;
+
+       return scaled >> NS_SCALE;
+}
+
+/*
+ * Scheduler clock: the raw TSC reading converted to nanoseconds with
+ * cycles_2_ns().
+ */
+unsigned long long sched_clock(void)
+{
+       unsigned long tsc = 0;
+
+       /*
+        * Could do CPU core sync here. Opteron can execute rdtsc
+        * speculatively, which means it is not completely exact and may
+        * not be monotonic between CPUs. But the errors should be too
+        * small to matter for scheduling purposes.
+        */
+       rdtscll(tsc);
+
+       return cycles_2_ns(tsc);
+}
+
+/* Non-zero once mark_tsc_unstable() has been called; never cleared here */
+static int tsc_unstable;
+
+/* Return non-zero if the TSC has been marked unstable. */
+inline int check_tsc_unstable(void)
+{
+       return tsc_unstable;
+}
+#ifdef CONFIG_CPU_FREQ
+
+/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+ * changes.
+ *
+ * RED-PEN: On SMP we assume all CPUs run with the same frequency.  It's
+ * not that important because current Opteron setups do not support
+ * scaling on SMP anyroads.
+ *
+ * Should fix up last_tsc too. Currently gettimeofday in the
+ * first tick after the change will be slightly wrong.
+ */
+
+static unsigned int  ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long tsc_khz_ref;
+
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+                                void *data)
+{
+       struct cpufreq_freqs *freq = data;
+       unsigned long *lpj, dummy;
+
+       if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
+               return 0;
+
+       lpj = &dummy;
+       if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+#ifdef CONFIG_SMP
+               lpj = &cpu_data[freq->cpu].loops_per_jiffy;
+#else
+               lpj = &boot_cpu_data.loops_per_jiffy;
+#endif
+
+       if (!ref_freq) {
+               ref_freq = freq->old;
+               loops_per_jiffy_ref = *lpj;
+               tsc_khz_ref = tsc_khz;
+       }
+       if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
+               (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+               (val == CPUFREQ_RESUMECHANGE)) {
+               *lpj =
+               cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+
+               tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+               if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+                       mark_tsc_unstable("cpufreq changes");
+       }
+
+       set_cyc2ns_scale(tsc_khz_ref);
+
+       return 0;
+}
+
+/* Notifier block that hooks time_cpufreq_notifier() into cpufreq */
+static struct notifier_block time_cpufreq_notifier_block = {
+       .notifier_call  = time_cpufreq_notifier
+};
+
+/*
+ * Register the notifier for cpufreq transition events.
+ * NOTE(review): the result of cpufreq_register_notifier() is ignored and
+ * 0 is always returned.
+ */
+static int __init cpufreq_tsc(void)
+{
+       cpufreq_register_notifier(&time_cpufreq_notifier_block,
+                                 CPUFREQ_TRANSITION_NOTIFIER);
+       return 0;
+}
+
+core_initcall(cpufreq_tsc);
+
+#endif
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+/*
+ * Returns 1 when the TSC should be treated as unsynchronized, 0 when it
+ * is believed to be usable.  The checks, in order:
+ *  - already marked unstable;
+ *  - (SMP only) the APIC reports a clustered box;
+ *  - non-Intel CPUs: unsynchronized as soon as more than one CPU is
+ *    present;
+ *  - Intel CPUs: (ACPI only) unsynchronized when the FADT is present and
+ *    reports a C3 latency below 1000, otherwise synchronized.
+ */
+__cpuinit int unsynchronized_tsc(void)
+{
+       if (tsc_unstable)
+               return 1;
+
+#ifdef CONFIG_SMP
+       if (apic_is_clustered_box())
+               return 1;
+#endif
+
+       /* Assume multi socket systems are not synchronized */
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return num_present_cpus() > 1;
+
+       /*
+        * Most intel systems have synchronized TSCs except for
+        * multi node systems
+        */
+#ifdef CONFIG_ACPI
+       /* But TSC doesn't tick in C3 so don't use it there */
+       if (acpi_gbl_FADT.header.length > 0 &&
+           acpi_gbl_FADT.C3latency < 1000)
+               return 1;
+#endif
+       return 0;
+}
+
+/*
+ * Handler for the "notsc" boot option: sets the notsc flag, which makes
+ * init_tsc_clocksource() skip registering the TSC clocksource.  The
+ * option string @s is not examined.  Always returns 1.
+ */
+int __init notsc_setup(char *s)
+{
+       notsc = 1;
+       return 1;
+}
+
+__setup("notsc", notsc_setup);
+
+
+/* clock source code: */
+/* Clocksource ->read callback: current value of get_cycles_sync(). */
+static cycle_t read_tsc(void)
+{
+       return (cycle_t)get_cycles_sync();
+}
+
+/*
+ * Clocksource ->vread callback: same readout as read_tsc(), but placed
+ * in the section selected by __vsyscall_fn.
+ */
+static cycle_t __vsyscall_fn vread_tsc(void)
+{
+       return (cycle_t)get_cycles_sync();
+}
+
+/*
+ * The TSC clocksource.  .mult is left zero here and filled in by
+ * init_tsc_clocksource(); mark_tsc_unstable() relies on a zero .mult to
+ * tell that the clocksource has not been registered yet.
+ */
+static struct clocksource clocksource_tsc = {
+       .name                   = "tsc",
+       .rating                 = 300,
+       .read                   = read_tsc,
+       .mask                   = CLOCKSOURCE_MASK(64),
+       .shift                  = 22,
+       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
+                                 CLOCK_SOURCE_MUST_VERIFY,
+       .vread                  = vread_tsc,
+};
+
+/*
+ * Flag the TSC as unstable and drop the rating of its clocksource to 0.
+ * Only the first call has any effect; @reason is printed in the log
+ * message.
+ */
+void mark_tsc_unstable(char *reason)
+{
+       if (tsc_unstable)
+               return;
+
+       tsc_unstable = 1;
+       printk("Marking TSC unstable due to %s\n", reason);
+
+       /* Change only the rating, when not registered */
+       if (!clocksource_tsc.mult) {
+               clocksource_tsc.rating = 0;
+               return;
+       }
+
+       clocksource_change_rating(&clocksource_tsc, 0);
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+/*
+ * Register the TSC clocksource unless "notsc" was given.  The multiplier
+ * is derived from tsc_khz, and the rating is forced to 0 when the TSC
+ * has already been marked unstable.
+ */
+void __init init_tsc_clocksource(void)
+{
+       if (notsc)
+               return;
+
+       clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
+                                                   clocksource_tsc.shift);
+
+       if (check_tsc_unstable())
+               clocksource_tsc.rating = 0;
+
+       clocksource_register(&clocksource_tsc);
+}
index 12424629af87017b87cd8cfe9666388d28c7defa..355f5f506c8133d2a97108ae211f52839199510b 100644 (file)
@@ -1 +1,187 @@
-#include "../../x86_64/kernel/tsc_sync.c"
+/*
+ * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ *   protects against more than 2 CPUs entering this code. )
+ */
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/tsc.h>
+
+/*
+ * Entry/exit counters that make sure that both CPUs
+ * run the measurement code at once:
+ */
+static __cpuinitdata atomic_t start_count;
+static __cpuinitdata atomic_t stop_count;
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+/* Most recent TSC value sampled under sync_lock, by either CPU */
+static __cpuinitdata cycles_t last_tsc;
+/* Largest backwards jump (prev - now) observed so far */
+static __cpuinitdata cycles_t max_warp;
+/* Number of backwards jumps observed so far */
+static __cpuinitdata int nr_warps;
+
+/*
+ * TSC-warp measurement loop running on both CPUs:
+ */
+/*
+ * Repeatedly sample the TSC under sync_lock and compare each sample with
+ * the one taken just before it (which may come from the other CPU).  A
+ * sample smaller than its predecessor counts as a warp: nr_warps is
+ * incremented and max_warp updated.  The loop ends once the TSC has moved
+ * past 'end' or the iteration count passes the safety limit; both are
+ * only checked on every eighth iteration.
+ */
+static __cpuinit void check_tsc_warp(void)
+{
+       cycles_t start, now, prev, end;
+       int i;
+
+       start = get_cycles_sync();
+       /*
+        * The measurement runs for 20 msecs:
+        */
+       end = start + tsc_khz * 20ULL;
+       now = start;
+
+       for (i = 0; ; i++) {
+               /*
+                * We take the global lock, measure TSC, save the
+                * previous TSC that was measured (possibly on
+                * another CPU) and update the previous TSC timestamp.
+                */
+               __raw_spin_lock(&sync_lock);
+               prev = last_tsc;
+               now = get_cycles_sync();
+               last_tsc = now;
+               __raw_spin_unlock(&sync_lock);
+
+               /*
+                * Be nice every now and then (and also check whether
+                * measurement is done [we also insert a 100 million
+                * loops safety exit, so we dont lock up in case the
+                * TSC readout is totally broken]):
+                */
+               if (unlikely(!(i & 7))) {
+                       if (now > end || i > 100000000)
+                               break;
+                       cpu_relax();
+                       touch_nmi_watchdog();
+               }
+               /*
+                * Outside the critical section we can now see whether
+                * we saw a time-warp of the TSC going backwards:
+                */
+               if (unlikely(prev > now)) {
+                       /* Retake the lock: the statistics are shared by both CPUs */
+                       __raw_spin_lock(&sync_lock);
+                       max_warp = max(max_warp, prev - now);
+                       nr_warps++;
+                       __raw_spin_unlock(&sync_lock);
+               }
+
+       }
+}
+
+/*
+ * Source CPU calls into this - it waits for the freshly booted
+ * target CPU to arrive and then starts the measurement:
+ */
+/*
+ * @cpu: number of the target CPU, used only in the log message.
+ *
+ * Handshake with check_tsc_sync_target(): start_count goes 0 -> 1 (target
+ * arrived) -> 2 (source released it); stop_count goes 0 -> 1 (target done
+ * measuring) -> 2 (source done reporting).  On a detected warp the TSC is
+ * marked unstable and the warp statistics are reset.
+ */
+void __cpuinit check_tsc_sync_source(int cpu)
+{
+       int cpus = 2;
+
+       /*
+        * No need to check if we already know that the TSC is not
+        * synchronized:
+        */
+       if (unsynchronized_tsc())
+               return;
+
+       printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
+                         smp_processor_id(), cpu);
+
+       /*
+        * Reset it - in case this is a second bootup:
+        */
+       atomic_set(&stop_count, 0);
+
+       /*
+        * Wait for the target to arrive:
+        */
+       while (atomic_read(&start_count) != cpus-1)
+               cpu_relax();
+       /*
+        * Trigger the target to continue into the measurement too:
+        */
+       atomic_inc(&start_count);
+
+       check_tsc_warp();
+
+       /* Wait until the target has finished its measurement loop too */
+       while (atomic_read(&stop_count) != cpus-1)
+               cpu_relax();
+
+       /*
+        * Reset it - just in case we boot another CPU later:
+        */
+       atomic_set(&start_count, 0);
+
+       if (nr_warps) {
+               printk("\n");
+               printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
+                                   " turning off TSC clock.\n", max_warp);
+               mark_tsc_unstable("check_tsc_sync_source failed");
+               /* Start the next check from clean statistics */
+               nr_warps = 0;
+               max_warp = 0;
+               last_tsc = 0;
+       } else {
+               printk(" passed.\n");
+       }
+
+       /*
+        * Let the target continue with the bootup:
+        */
+       atomic_inc(&stop_count);
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ */
+/*
+ * Counterpart of check_tsc_sync_source(): announce arrival through
+ * start_count, run the same measurement loop, announce completion through
+ * stop_count and then wait for the source to finish reporting.  Returns
+ * immediately when the TSC is already known to be unsynchronized.
+ */
+void __cpuinit check_tsc_sync_target(void)
+{
+       int cpus = 2;
+
+       if (unsynchronized_tsc())
+               return;
+
+       /*
+        * Register this CPU's participation and wait for the
+        * source CPU to start the measurement:
+        */
+       atomic_inc(&start_count);
+       while (atomic_read(&start_count) != cpus)
+               cpu_relax();
+
+       check_tsc_warp();
+
+       /*
+        * Ok, we are done:
+        */
+       atomic_inc(&stop_count);
+
+       /*
+        * Wait for the source CPU to print stuff:
+        */
+       while (atomic_read(&stop_count) != cpus)
+               cpu_relax();
+}
+#undef NR_LOOPS
+
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
new file mode 100644 (file)
index 0000000..45b6f8a
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ *
+ *     verify_cpu.S - Code for cpu long mode and SSE verification. This
+ *     code has been borrowed from boot/setup.S and was introduced by
+ *     Andi Kleen.
+ *
+ *     Copyright (c) 2007  Andi Kleen (ak@suse.de)
+ *     Copyright (c) 2007  Eric Biederman (ebiederm@xmission.com)
+ *     Copyright (c) 2007  Vivek Goyal (vgoyal@in.ibm.com)
+ *
+ *     This source code is licensed under the GNU General Public License,
+ *     Version 2.  See the file COPYING for more details.
+ *
+ *     This is a common code for verification whether CPU supports
+ *     long mode and SSE or not. It is not called directly instead this
+ *     file is included at various places and compiled in that context.
+ *     Following are the current usage.
+ *
+ *     This file is included by both 16bit and 32bit code.
+ *
+ *     arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
+ *     arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
+ *     arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit)
+ *     arch/x86_64/kernel/acpi/wakeup.S: Verification at resume (16bit)
+ *
+ *     verify_cpu, returns the status of cpu check in register %eax.
+ *             0: Success    1: Failure
+ *
+ *     The caller needs to check for the error code and take the action
+ *     appropriately. Either display a message or halt.
+ */
+
+#include <asm/cpufeature.h>
+
+/*
+ * verify_cpu: returns 0 in %eax when every check below passes, 1 when
+ * any of them fails.  The caller's flags are saved on entry and restored
+ * on both exit paths.  %ebx, %ecx, %edx and %di are overwritten.
+ */
+verify_cpu:
+       pushfl                          # Save caller passed flags
+       pushl   $0                      # Kill any dangerous flags
+       popfl
+
+       /*
+        * Flip bit 0x200000 in a copy of the flags, write it back and read
+        * the flags again.  If the value read is unchanged the bit could
+        * not be toggled and cpuid is treated as absent.
+        */
+       pushfl                          # standard way to check for cpuid
+       popl    %eax
+       movl    %eax,%ebx
+       xorl    $0x200000,%eax
+       pushl   %eax
+       popfl
+       pushfl
+       popl    %eax
+       cmpl    %eax,%ebx
+       jz      verify_cpu_no_longmode  # cpu has no cpuid
+
+       movl    $0x0,%eax               # See if cpuid 1 is implemented
+       cpuid
+       cmpl    $0x1,%eax
+       jb      verify_cpu_no_longmode  # no cpuid 1
+
+       /*
+        * %di records whether the vendor string returned by cpuid 0 in
+        * %ebx:%edx:%ecx is "Auth" "enti" "cAMD"; it stays 0 otherwise.
+        */
+       xor     %di,%di
+       cmpl    $0x68747541,%ebx        # AuthenticAMD
+       jnz     verify_cpu_noamd
+       cmpl    $0x69746e65,%edx
+       jnz     verify_cpu_noamd
+       cmpl    $0x444d4163,%ecx
+       jnz     verify_cpu_noamd
+       mov     $1,%di                  # cpu is from AMD
+
+verify_cpu_noamd:
+       /* and+xor leaves zero only if every REQUIRED_MASK0 bit is set */
+       movl    $0x1,%eax               # Does the cpu have what it takes
+       cpuid
+       andl    $REQUIRED_MASK0,%edx
+       xorl    $REQUIRED_MASK0,%edx
+       jnz     verify_cpu_no_longmode
+
+       movl    $0x80000000,%eax        # See if extended cpuid is implemented
+       cpuid
+       cmpl    $0x80000001,%eax
+       jb      verify_cpu_no_longmode  # no extended cpuid
+
+       /* Same all-bits-present test against REQUIRED_MASK1 */
+       movl    $0x80000001,%eax        # Does the cpu have what it takes
+       cpuid
+       andl    $REQUIRED_MASK1,%edx
+       xorl    $REQUIRED_MASK1,%edx
+       jnz     verify_cpu_no_longmode
+
+       /*
+        * SSE check.  If the SSE_MASK bits are missing on an AMD cpu, bit 15
+        * is cleared in the low half of MSR 0xc0010015 and the test is
+        * repeated once; %di is zeroed first so there is no second retry.
+        */
+verify_cpu_sse_test:
+       movl    $1,%eax
+       cpuid
+       andl    $SSE_MASK,%edx
+       cmpl    $SSE_MASK,%edx
+       je      verify_cpu_sse_ok
+       test    %di,%di
+       jz      verify_cpu_no_longmode  # only try to force SSE on AMD
+       movl    $0xc0010015,%ecx        # HWCR
+       rdmsr
+       btr     $15,%eax                # enable SSE
+       wrmsr
+       xor     %di,%di                 # don't loop
+       jmp     verify_cpu_sse_test     # try again
+
+       /* Failure exit: %eax = 1 */
+verify_cpu_no_longmode:
+       popfl                           # Restore caller passed flags
+       movl $1,%eax
+       ret
+       /* Success exit: %eax = 0 */
+verify_cpu_sse_ok:
+       popfl                           # Restore caller passed flags
+       xorl %eax, %eax
+       ret
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
new file mode 100644 (file)
index 0000000..ba8ea97
--- /dev/null
@@ -0,0 +1,235 @@
+/* ld script to make x86-64 Linux kernel
+ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
+ */
+
+#define LOAD_OFFSET __START_KERNEL_map
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/page.h>
+
+#undef i386    /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+_proxy_pda = 1;
+PHDRS {
+       text PT_LOAD FLAGS(5);  /* R_E */
+       data PT_LOAD FLAGS(7);  /* RWE */
+       user PT_LOAD FLAGS(7);  /* RWE */
+       data.init PT_LOAD FLAGS(7);     /* RWE */
+       note PT_NOTE FLAGS(4);  /* R__ */
+}
+SECTIONS
+{
+  . = __START_KERNEL;
+  phys_startup_64 = startup_64 - LOAD_OFFSET;
+  _text = .;                   /* Text and read-only data */
+  .text :  AT(ADDR(.text) - LOAD_OFFSET) {
+       /* First the code that has to be first for bootstrapping */
+       *(.text.head)
+       _stext = .;
+       /* Then the rest */
+       TEXT_TEXT
+       SCHED_TEXT
+       LOCK_TEXT
+       KPROBES_TEXT
+       *(.fixup)
+       *(.gnu.warning)
+       } :text = 0x9090
+                               /* out-of-line lock text */
+  .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
+
+  _etext = .;                  /* End of text section */
+
+  . = ALIGN(16);               /* Exception table */
+  __start___ex_table = .;
+  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
+  __stop___ex_table = .;
+
+  NOTES :text :note
+
+  BUG_TABLE :text
+
+  RODATA
+
+  . = ALIGN(4);
+  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+       __tracedata_start = .;
+       *(.tracedata)
+       __tracedata_end = .;
+  }
+
+  . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
+                               /* Data */
+  .data : AT(ADDR(.data) - LOAD_OFFSET) {
+       DATA_DATA
+       CONSTRUCTORS
+       } :data
+
+  _edata = .;                  /* End of data section */
+
+  . = ALIGN(PAGE_SIZE);
+  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+       *(.data.cacheline_aligned)
+  }
+  . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+       *(.data.read_mostly)
+  }
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+  . = VSYSCALL_ADDR;
+  .vsyscall_0 :         AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
+  __vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
+  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
+               { *(.vsyscall_gtod_data) }
+  vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+  .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
+               { *(.vsyscall_clock) }
+  vsyscall_clock = VVIRT(.vsyscall_clock);
+
+
+  .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
+               { *(.vsyscall_1) }
+  .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
+               { *(.vsyscall_2) }
+
+  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
+  vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
+  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
+  jiffies = VVIRT(.jiffies);
+
+  .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
+               { *(.vsyscall_3) }
+
+  . = VSYSCALL_VIRT_ADDR + 4096;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
+
+  . = ALIGN(8192);             /* init_task */
+  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+       *(.data.init_task)
+  }:data.init
+
+  . = ALIGN(4096);
+  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+       *(.data.page_aligned)
+  }
+
+  /* might get freed after init */
+  . = ALIGN(4096);
+  __smp_alt_begin = .;
+  __smp_locks = .;
+  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+       *(.smp_locks)
+  }
+  __smp_locks_end = .;
+  . = ALIGN(4096);
+  __smp_alt_end = .;
+
+  . = ALIGN(4096);             /* Init code and data */
+  __init_begin = .;
+  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+       _sinittext = .;
+       *(.init.text)
+       _einittext = .;
+  }
+  __initdata_begin = .;
+  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
+  __initdata_end = .;
+  . = ALIGN(16);
+  __setup_start = .;
+  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
+  __setup_end = .;
+  __initcall_start = .;
+  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+       INITCALLS
+  }
+  __initcall_end = .;
+  __con_initcall_start = .;
+  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+       *(.con_initcall.init)
+  }
+  __con_initcall_end = .;
+  SECURITY_INIT
+  . = ALIGN(8);
+  __alt_instructions = .;
+  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+       *(.altinstructions)
+  }
+  __alt_instructions_end = .; 
+  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+       *(.altinstr_replacement)
+  }
+  /* .exit.text is discarded at runtime, not link time, to deal with references
+     from .altinstructions and .eh_frame */
+  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
+  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
+
+/* vdso blob that is mapped into user space */
+  vdso_start = . ;
+  .vdso  : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
+  . = ALIGN(4096);
+  vdso_end = .;
+
+#ifdef CONFIG_BLK_DEV_INITRD
+  . = ALIGN(4096);
+  __initramfs_start = .;
+  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
+  __initramfs_end = .;
+#endif
+
+  PERCPU(4096)
+
+  . = ALIGN(4096);
+  __init_end = .;
+
+  . = ALIGN(4096);
+  __nosave_begin = .;
+  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
+  . = ALIGN(4096);
+  __nosave_end = .;
+
+  __bss_start = .;             /* BSS */
+  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+       *(.bss.page_aligned)
+       *(.bss)
+       }
+  __bss_stop = .;
+
+  _end = . ;
+
+  /* Sections to be discarded */
+  /DISCARD/ : {
+       *(.exitcall.exit)
+       *(.eh_frame)
+       }
+
+  STABS_DEBUG
+
+  DWARF_DEBUG
+}
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
new file mode 100644 (file)
index 0000000..414caf0
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * vSMPowered(tm) systems specific initialization
+ * Copyright (C) 2005 ScaleMP Inc.
+ *
+ * Use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ *
+ * Ravikiran Thirumalai <kiran@scalemp.com>,
+ * Shai Fultheim <shai@scalemp.com>
+ */
+
+#include <linux/init.h>
+#include <linux/pci_ids.h>
+#include <linux/pci_regs.h>
+#include <asm/pci-direct.h>
+#include <asm/io.h>
+
+/*
+ * Detect a ScaleMP vSMP box and, if the feature is offered, switch on its
+ * IRQ fastpath handling.
+ *
+ * The function does nothing when early PCI access is not allowed or when
+ * device 0:1f.0 is not the vSMP control device.  Otherwise the first 8
+ * bytes behind BAR 0 are mapped: the capability word at offset 0 and the
+ * control word at offset 4.  If bit 4 is set in both, it is cleared in
+ * the control word and the result is read back and logged.
+ *
+ * Always returns 0; a failed mapping is logged and otherwise ignored so
+ * that boot continues without the fastpath.
+ */
+static int __init vsmp_init(void)
+{
+       void *address;
+       unsigned int cap, ctl;
+
+       if (!early_pci_allowed())
+               return 0;
+
+       /* Check if we are running on a ScaleMP vSMP box */
+       if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
+           (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
+               return 0;
+
+       /* set vSMP magic bits to indicate vSMP capable kernel */
+       address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
+       if (!address) {
+               /* Do not touch the registers through a failed mapping */
+               printk(KERN_WARNING "vSMP CTL: unable to map control registers\n");
+               return 0;
+       }
+
+       cap = readl(address);
+       ctl = readl(address + 4);
+       printk("vSMP CTL: capabilities:0x%08x  control:0x%08x\n", cap, ctl);
+       if (cap & ctl & (1 << 4)) {
+               /* Turn on vSMP IRQ fastpath handling (see system.h) */
+               ctl &= ~(1 << 4);
+               writel(ctl, address + 4);
+               ctl = readl(address + 4);
+               printk("vSMP CTL: control set to:0x%08x\n", ctl);
+       }
+
+       iounmap(address);
+       return 0;
+}
+
+core_initcall(vsmp_init);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
new file mode 100644 (file)
index 0000000..06c3494
--- /dev/null
@@ -0,0 +1,349 @@
+/*
+ *  linux/arch/x86_64/kernel/vsyscall.c
+ *
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ *  Thanks to hpa@transmeta.com for some useful hint.
+ *  Special thanks to Ingo Molnar for his early experience with
+ *  a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
+ *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
+ *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
+ *  jumping out of line if necessary. We cannot add more with this
+ *  mechanism because older kernels won't return -ENOSYS.
+ *  If we want more than four we need a vDSO.
+ *
+ *  Note: the concept clashes with user mode linux. If you use UML and
+ *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/seqlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysctl.h>
+#include <linux/clocksource.h>
+#include <linux/getcpu.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
+#include <asm/vgtod.h>
+
+/* Place a function in section ".vsyscall_<nr>" (also marked "unused"). */
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+/* Clobber list shared by the "syscall" asm statements in this file. */
+#define __syscall_clobber "r11","rcx","memory"
+/*
+ * Physical address of a symbol that is linked into the vsyscall area.
+ *
+ * The offset of x inside the area is taken relative to VSYSCALL_ADDR(0),
+ * the virtual address of the first vsyscall page, and added to the
+ * physical address of __vsyscall_0.  VSYSCALL_FIRST_PAGE must not be used
+ * as the base: it is a fixmap index (it is handed to __set_fixmap() and
+ * __fix_to_virt() in this file), not a virtual address.
+ *
+ * The empty asm only copies x into v.
+ * NOTE(review): presumably it hides the symbol address from gcc, see the
+ * "__va(__pa())" remark in vsyscall_sysctl_change() -- confirm.
+ */
+#define __pa_vsymbol(x)                        \
+       ({unsigned long v;              \
+       extern char __vsyscall_0;       \
+         asm("" : "=r" (v) : "0" (x)); \
+         ((v - VSYSCALL_ADDR(0)) + __pa_symbol(&__vsyscall_0)); })
+
+/*
+ * vsyscall_gtod_data contains data that is :
+ * - readonly from vsyscalls
+ * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
+ * Try to keep this structure as small as possible to avoid cache line ping pongs
+ */
+/* Selects how vgetcpu() fetches its data; it is compared against
+   VGETCPU_RDTSCP there. */
+int __vgetcpu_mode __section_vgetcpu_mode;
+
+/*
+ * Time keeping data read by the vsyscall functions in this file.  It starts
+ * out with an unlocked seqlock and with sysctl_enabled set to 1.
+ * NOTE(review): update_vsyscall() writes "vsyscall_gtod_data" while the
+ * vsyscalls read "__vsyscall_gtod_data"; presumably both names refer to the
+ * same object through the linker script -- confirm.
+ */
+struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
+{
+       .lock = SEQLOCK_UNLOCKED,
+       .sysctl_enabled = 1,
+};
+
+/*
+ * update_vsyscall - publish new time keeping values to the vsyscall data.
+ * @wall_time: current wall clock time; tv_sec and tv_nsec are copied
+ * @clock:     clocksource whose vread, cycle_last, mask, mult and shift
+ *             fields are copied
+ *
+ * The global sys_tz and wall_to_monotonic are copied as well.  All stores
+ * happen inside the write side of vsyscall_gtod_data.lock with interrupts
+ * disabled, so readers that use the seqlock retry instead of seeing a mix
+ * of old and new values.
+ */
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+{
+       unsigned long flags;
+
+       write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+       /* copy vsyscall data */
+       vsyscall_gtod_data.clock.vread = clock->vread;
+       vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
+       vsyscall_gtod_data.clock.mask = clock->mask;
+       vsyscall_gtod_data.clock.mult = clock->mult;
+       vsyscall_gtod_data.clock.shift = clock->shift;
+       vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
+       vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+       vsyscall_gtod_data.sys_tz = sys_tz;
+       vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+       write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+}
+
+/*
+ * Copy the time zone stored in the vsyscall gtod data to *tz.  The copy is
+ * done without taking the seqlock.
+ *
+ * RED-PEN may want to readd seq locking, but then the variable should be
+ * write-once.
+ */
+static __always_inline void do_get_tz(struct timezone * tz)
+{
+       *tz = __vsyscall_gtod_data.sys_tz;
+}
+
+/*
+ * Fallback: execute the real gettimeofday system call with tv and tz as
+ * arguments and return whatever the kernel leaves in rax.
+ *
+ * The asm label "vsysc2" marks the syscall instruction itself;
+ * vsyscall_sysctl_change() refers to that symbol when it rewrites the
+ * instruction.
+ */
+static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+       int ret;
+       asm volatile("vsysc2: syscall"
+               : "=a" (ret)
+               : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
+               : __syscall_clobber );
+       return ret;
+}
+
+/*
+ * Fallback: execute the real time system call with t as its argument and
+ * return whatever the kernel leaves in rax.
+ *
+ * The asm label "vsysc1" marks the syscall instruction itself;
+ * vsyscall_sysctl_change() refers to that symbol when it rewrites the
+ * instruction.
+ */
+static __always_inline long time_syscall(long *t)
+{
+       long secs;
+       asm volatile("vsysc1: syscall"
+               : "=a" (secs)
+               : "0" (__NR_time),"D" (t) : __syscall_clobber);
+       return secs;
+}
+
+/*
+ * Fill *tv with the current time of day.
+ *
+ * A consistent snapshot of the gtod data is taken under the seqlock and the
+ * clocksource is read through its vread callback inside the same section.
+ * The cycles elapsed since cycle_last are then scaled with mult/shift into
+ * nanoseconds and added to the snapshot of the wall time.
+ *
+ * When the vsyscall64 sysctl is off or the clocksource has no vread
+ * callback, the real gettimeofday system call is used instead.
+ */
+static __always_inline void do_vgettimeofday(struct timeval * tv)
+{
+       cycle_t now, base, mask, cycle_delta;
+       unsigned seq;
+       unsigned long mult, shift, nsec;
+       cycle_t (*vread)(void);
+       do {
+               seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+
+               vread = __vsyscall_gtod_data.clock.vread;
+               if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
+                       /* no fast path available: let the kernel do the work */
+                       gettimeofday(tv,NULL);
+                       return;
+               }
+               now = vread();
+               base = __vsyscall_gtod_data.clock.cycle_last;
+               mask = __vsyscall_gtod_data.clock.mask;
+               mult = __vsyscall_gtod_data.clock.mult;
+               shift = __vsyscall_gtod_data.clock.shift;
+
+               tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
+               nsec = __vsyscall_gtod_data.wall_time_nsec;
+       } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+
+       /* calculate interval: */
+       cycle_delta = (now - base) & mask;
+       /* convert to nsecs: */
+       nsec += (cycle_delta * mult) >> shift;
+
+       /* carry whole seconds out of nsec before converting to usec */
+       while (nsec >= NSEC_PER_SEC) {
+               tv->tv_sec += 1;
+               nsec -= NSEC_PER_SEC;
+       }
+       tv->tv_usec = nsec / NSEC_PER_USEC;
+}
+
+/*
+ * vsyscall slot 0: gettimeofday().
+ * Either pointer may be NULL, in which case that part is skipped.
+ * Always returns 0.
+ */
+int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+{
+       if (tv != NULL)
+               do_vgettimeofday(tv);
+
+       if (tz != NULL)
+               do_get_tz(tz);
+
+       return 0;
+}
+
+/*
+ * vsyscall slot 1: time().
+ * Returns the current seconds and, if t is not NULL, stores them in *t too.
+ * With the vsyscall64 sysctl off the real time system call is used.
+ *
+ * This will break when the xtime seconds get inaccurate, but that is
+ * unlikely.
+ */
+time_t __vsyscall(1) vtime(time_t *t)
+{
+       struct timeval now;
+
+       if (likely(__vsyscall_gtod_data.sysctl_enabled)) {
+               vgettimeofday(&now, 0);
+               if (t)
+                       *t = now.tv_sec;
+               return now.tv_sec;
+       }
+
+       return time_syscall(t);
+}
+
+/* Fast way to get current CPU and node.
+   This helps to do per node and per CPU caches in user space.
+   The result is not guaranteed without CPU affinity, but usually
+   works out because the scheduler tries to keep a thread on the same
+   CPU.
+
+   The packed value p is taken from tcache->blob[1] when tcache->blob[0]
+   equals the current __jiffies, else from RDTSCP when __vgetcpu_mode is
+   VGETCPU_RDTSCP, else from the segment limit of __PER_CPU_SEG.
+   *cpu receives the low 12 bits of p, *node the bits above them.
+   Always returns 0.
+
+   tcache must point to a two element sized long array.
+   All arguments can be NULL. */
+long __vsyscall(2)
+vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+{
+       unsigned int dummy, p;
+       unsigned long j = 0;
+
+       /* Fast cache - only recompute value once per jiffies and avoid
+          relatively costly rdtscp/cpuid otherwise.
+          This works because the scheduler usually keeps the process
+          on the same CPU and this syscall doesn't guarantee its
+          results anyways.
+          We do this here because otherwise user space would do it on
+          its own in a likely inferior way (no access to jiffies).
+          If you don't like it pass NULL. */
+       if (tcache && tcache->blob[0] == (j = __jiffies)) {
+               p = tcache->blob[1];
+       } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
+               /* Load per CPU data from RDTSCP */
+               rdtscp(dummy, dummy, p);
+       } else {
+               /* Load per CPU data from GDT */
+               asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+       }
+       /* (re)fill the caller's cache; j stays 0 when tcache is NULL */
+       if (tcache) {
+               tcache->blob[0] = j;
+               tcache->blob[1] = p;
+       }
+       if (cpu)
+               *cpu = p & 0xfff;
+       if (node)
+               *node = p >> 12;
+       return 0;
+}
+
+/* vsyscall slot 3 has no function assigned: always fail with -ENOSYS. */
+long __vsyscall(3) venosys_1(void)
+{
+       return -ENOSYS;
+}
+
+#ifdef CONFIG_SYSCTL
+
+/* 16-bit store values: "syscall" is the byte pair 0f 05, a nop is 90. */
+#define SYSCALL 0x050f
+#define NOP2    0x9090
+
+/*
+ * proc handler for the vsyscall64 sysctl.
+ *
+ * The integer itself is handled by proc_dointvec().  On a write the two
+ * syscall instructions labelled vsysc1 and vsysc2 are rewritten afterwards
+ * to match the flag: NOP out syscall in vsyscall page when not needed
+ * (sysctl_enabled set), put the syscall opcode back otherwise.
+ *
+ * Returns the proc_dointvec() result, or -ENOMEM when one of the two
+ * instructions cannot be mapped.
+ */
+static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
+                        void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       extern u16 vsysc1, vsysc2;
+       u16 __iomem *time_insn;
+       u16 __iomem *gtod_insn;
+       u16 opcode;
+       int err;
+
+       err = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+       if (!write)
+               return err;
+
+       /* gcc has some trouble with __va(__pa()), so just do it this
+          way. */
+       time_insn = ioremap(__pa_vsymbol(&vsysc1), 2);
+       if (!time_insn)
+               return -ENOMEM;
+
+       gtod_insn = ioremap(__pa_vsymbol(&vsysc2), 2);
+       if (gtod_insn) {
+               opcode = vsyscall_gtod_data.sysctl_enabled ? NOP2 : SYSCALL;
+               writew(opcode, time_insn);
+               writew(opcode, gtod_insn);
+               iounmap(gtod_insn);
+       } else {
+               err = -ENOMEM;
+       }
+
+       iounmap(time_insn);
+       return err;
+}
+
+/*
+ * Strategy handler of the vsyscall64 entry: rejects every request with
+ * -ENOSYS and ignores all of its arguments.
+ * NOTE(review): presumably this keeps the value from being changed without
+ * going through vsyscall_sysctl_change() -- confirm.
+ */
+static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
+                               void __user *oldval, size_t __user *oldlenp,
+                               void __user *newval, size_t newlen)
+{
+       return -ENOSYS;
+}
+
+/* The "vsyscall64" entry, backed by vsyscall_gtod_data.sysctl_enabled. */
+static ctl_table kernel_table2[] = {
+       {
+               .ctl_name       = 99,
+               .procname       = "vsyscall64",
+               .data           = &vsyscall_gtod_data.sysctl_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .strategy       = vsyscall_sysctl_nostrat,
+               .proc_handler   = vsyscall_sysctl_change,
+       },
+       {}
+};
+
+/* Parent "kernel" directory entry that holds kernel_table2. */
+static ctl_table kernel_root_table2[] = {
+       {
+               .ctl_name       = CTL_KERN,
+               .procname       = "kernel",
+               .mode           = 0555,
+               .child          = kernel_table2,
+       },
+       {}
+};
+
+#endif
+
+/* Assume __initcall executes before all user space. Hopefully kmod
+   doesn't violate that. We'll find out if it does. */
+
+/*
+ * Make the cpu and node number of 'cpu' available to vgetcpu().
+ *
+ * The pair is written to the RDTSCP aux value when the CPU has
+ * X86_FEATURE_RDTSCP, and it is always encoded into the GDT entry
+ * GDT_ENTRY_PER_CPU of that cpu.  Without CONFIG_NUMA the node is 0.
+ */
+static void __cpuinit vsyscall_set_cpu(int cpu)
+{
+       unsigned long node = 0;
+       unsigned long desc;
+       unsigned long *gdt_slot;
+
+#ifdef CONFIG_NUMA
+       node = cpu_to_node[cpu];
+#endif
+       if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
+               write_rdtscp_aux((node << 12) | cpu);
+
+       /* Store cpu number in limit so that it can be loaded quickly
+          in user space in vgetcpu.
+          12 bits for the CPU and 8 bits for the node. */
+       desc = 0x0f40000000000ULL;
+       desc |= cpu;
+       desc |= (node & 0xf) << 12;
+       desc |= (node >> 4) << 48;
+
+       gdt_slot = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
+       *gdt_slot = desc;
+}
+
+/*
+ * Per-cpu callback (arg is unused): set up the vgetcpu data of the cpu
+ * this function is running on.
+ */
+static void __cpuinit cpu_vsyscall_init(void *arg)
+{
+       /* preemption should be already off */
+       int this_cpu = raw_smp_processor_id();
+
+       vsyscall_set_cpu(this_cpu);
+}
+
+/*
+ * CPU notifier callback: when a cpu has come online (CPU_ONLINE or
+ * CPU_ONLINE_FROZEN), run cpu_vsyscall_init() on it.  arg carries the cpu
+ * number.  Every action is answered with NOTIFY_DONE.
+ */
+static int __cpuinit
+cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
+{
+       long cpu = (long)arg;
+
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_DONE;
+}
+
+/*
+ * Install the page that starts at __vsyscall_0 in the fixmap slot
+ * VSYSCALL_FIRST_PAGE with PAGE_KERNEL_VSYSCALL protection.
+ */
+static void __init map_vsyscall(void)
+{
+       extern char __vsyscall_0;
+
+       /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
+       __set_fixmap(VSYSCALL_FIRST_PAGE, __pa_symbol(&__vsyscall_0),
+                    PAGE_KERNEL_VSYSCALL);
+}
+
+/*
+ * Boot-time setup of the vsyscall page: check the layout, map the page,
+ * register the sysctl table (CONFIG_SYSCTL only), initialise the vgetcpu
+ * data on every cpu and install the cpu hotplug notifier.
+ * Always returns 0.
+ */
+static int __init vsyscall_init(void)
+{
+       /* Each entry point must sit at its VSYSCALL_ADDR() slot, and slot 0
+          must be the address of the fixmap page. */
+       BUG_ON((unsigned long)&vgettimeofday != VSYSCALL_ADDR(__NR_vgettimeofday));
+       BUG_ON((unsigned long)&vtime != VSYSCALL_ADDR(__NR_vtime));
+       BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+       BUG_ON((unsigned long)&vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
+
+       map_vsyscall();
+#ifdef CONFIG_SYSCTL
+       register_sysctl_table(kernel_root_table2);
+#endif
+       on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
+       hotcpu_notifier(cpu_vsyscall_notifier, 0);
+
+       return 0;
+}
+
+__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
new file mode 100644 (file)
index 0000000..77c25b3
--- /dev/null
@@ -0,0 +1,62 @@
+/* Exports for assembly files.
+   All C exports should go in the respective C files. */
+
+#include <linux/module.h>
+#include <linux/smp.h>
+
+#include <asm/semaphore.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+
+EXPORT_SYMBOL(kernel_thread);
+
+/* NOTE(review): presumably the semaphore helpers behind <asm/semaphore.h>
+   -- confirm. */
+EXPORT_SYMBOL(__down_failed);
+EXPORT_SYMBOL(__down_failed_interruptible);
+EXPORT_SYMBOL(__down_failed_trylock);
+EXPORT_SYMBOL(__up_wakeup);
+
+/* get_user/put_user helpers, one per access size (1, 2, 4 and 8). */
+EXPORT_SYMBOL(__get_user_1);
+EXPORT_SYMBOL(__get_user_2);
+EXPORT_SYMBOL(__get_user_4);
+EXPORT_SYMBOL(__get_user_8);
+EXPORT_SYMBOL(__put_user_1);
+EXPORT_SYMBOL(__put_user_2);
+EXPORT_SYMBOL(__put_user_4);
+EXPORT_SYMBOL(__put_user_8);
+
+/* User space copy routines. */
+EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
+EXPORT_SYMBOL(copy_from_user);
+EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(__copy_from_user_inatomic);
+
+EXPORT_SYMBOL(copy_page);
+EXPORT_SYMBOL(clear_page);
+
+#ifdef CONFIG_SMP
+/* Declared here by hand; exported for CONFIG_SMP builds only. */
+extern void  __write_lock_failed(rwlock_t *rw);
+extern void  __read_lock_failed(rwlock_t *rw);
+EXPORT_SYMBOL(__write_lock_failed);
+EXPORT_SYMBOL(__read_lock_failed);
+#endif
+
+/* Export string functions. We normally rely on gcc builtin for most of these,
+   but gcc sometimes decides not to inline them.
+   NOTE(review): the #undefs presumably drop macro versions from the headers
+   so that the real function names are declared and exported -- confirm.
+   memmove is #undef'ed too, but this file does not export it. */
+#undef memcpy
+#undef memset
+#undef memmove
+
+extern void * memset(void *,int,__kernel_size_t);
+extern void * memcpy(void *,const void *,__kernel_size_t);
+extern void * __memcpy(void *,const void *,__kernel_size_t);
+
+EXPORT_SYMBOL(memset);
+EXPORT_SYMBOL(memcpy);
+EXPORT_SYMBOL(__memcpy);
+
+EXPORT_SYMBOL(empty_zero_page);
+EXPORT_SYMBOL(init_level4_pgt);
+EXPORT_SYMBOL(load_gs_index);
+
+EXPORT_SYMBOL(_proxy_pda);
index 189d80d3a89182a45b8d62e99899fc1d912ea8b4..afaf0f99887831f1c33e5ef47bd0e5bde008dc83 100644 (file)
 #
 # $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $
 
+# Fill in SRCARCH
+SRCARCH        := x86
+
+archprepare:
+       @mkdir -p ${objtree}/arch/x86/kernel
+
+
 LDFLAGS                := -m elf_x86_64
 OBJCOPYFLAGS   := -O binary -R .note -R .comment -S
 LDFLAGS_vmlinux :=
@@ -71,10 +78,10 @@ CFLAGS += $(cflags-y)
 CFLAGS_KERNEL += $(cflags-kernel-y)
 AFLAGS += -m64
 
-head-y := arch/x86_64/kernel/head_64.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task_64.o
+head-y := arch/x86/kernel/head_64.o arch/x86/kernel/head64.o arch/x86/kernel/init_task_64.o
 
 libs-y                                         += arch/x86/lib/
-core-y                                 += arch/x86_64/kernel/ \
+core-y                                 += arch/x86/kernel/ \
                                           arch/x86/mm/ \
                                           arch/x86/crypto/ \
                                           arch/x86/vdso/
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
deleted file mode 100644 (file)
index 577d08f..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/kernel/Makefile_32
-else
-include ${srctree}/arch/x86_64/kernel/Makefile_64
-endif
diff --git a/arch/x86_64/kernel/Makefile_64 b/arch/x86_64/kernel/Makefile_64
deleted file mode 100644 (file)
index 120d4e5..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-extra-y        := head_64.o head64.o init_task_64.o vmlinux.lds
-EXTRA_AFLAGS   := -traditional
-obj-y  := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
-               ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
-               x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
-               setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
-               pci-dma_64.o pci-nommu_64.o alternative.o hpet_64.o tsc_64.o bugs_64.o \
-               perfctr-watchdog.o
-
-obj-$(CONFIG_STACKTRACE)       += stacktrace.o
-obj-$(CONFIG_X86_MCE)          += mce_64.o therm_throt.o
-obj-$(CONFIG_X86_MCE_INTEL)    += mce_intel_64.o
-obj-$(CONFIG_X86_MCE_AMD)      += mce_amd_64.o
-obj-$(CONFIG_MTRR)             += ../../x86/kernel/cpu/mtrr/
-obj-$(CONFIG_ACPI)             += ../../x86/kernel/acpi/
-obj-$(CONFIG_X86_MSR)          += msr.o
-obj-$(CONFIG_MICROCODE)                += microcode.o
-obj-$(CONFIG_X86_CPUID)                += cpuid.o
-obj-$(CONFIG_SMP)              += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
-obj-y                          += apic_64.o  nmi_64.o
-obj-y                          += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
-obj-$(CONFIG_KEXEC)            += machine_kexec_64.o relocate_kernel_64.o crash_64.o
-obj-$(CONFIG_CRASH_DUMP)       += crash_dump_64.o
-obj-$(CONFIG_PM)               += suspend_64.o
-obj-$(CONFIG_HIBERNATION)      += suspend_asm_64.o
-obj-$(CONFIG_CPU_FREQ)         += ../../x86/kernel/cpu/cpufreq/
-obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
-obj-$(CONFIG_IOMMU)            += pci-gart_64.o aperture_64.o
-obj-$(CONFIG_CALGARY_IOMMU)    += pci-calgary_64.o tce_64.o
-obj-$(CONFIG_SWIOTLB)          += pci-swiotlb_64.o
-obj-$(CONFIG_KPROBES)          += kprobes_64.o
-obj-$(CONFIG_X86_PM_TIMER)     += pmtimer_64.o
-obj-$(CONFIG_X86_VSMP)         += vsmp_64.o
-obj-$(CONFIG_K8_NB)            += k8.o
-obj-$(CONFIG_AUDIT)            += audit_64.o
-
-obj-$(CONFIG_MODULES)          += module_64.o
-obj-$(CONFIG_PCI)              += early-quirks_64.o
-
-obj-y                          += topology.o
-obj-y                          += intel_cacheinfo.o
-obj-y                          += addon_cpuid_features.o
-obj-y                          += pcspeaker.o
-
-CFLAGS_vsyscall_64.o           := $(PROFILING) -g0
-
-therm_throt-y                   += ../../x86/kernel/cpu/mcheck/therm_throt.o
-bootflag-y                     += ../../x86/kernel/bootflag.o
-cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../x86/kernel/cpuid.o
-topology-y                     += ../../x86/kernel/topology.o
-microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../x86/kernel/microcode.o
-intel_cacheinfo-y              += ../../x86/kernel/cpu/intel_cacheinfo.o
-addon_cpuid_features-y         += ../../x86/kernel/cpu/addon_cpuid_features.o
-quirks-y                       += ../../x86/kernel/quirks.o
-i8237-y                                += ../../x86/kernel/i8237.o
-msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../x86/kernel/msr.o
-alternative-y                  += ../../x86/kernel/alternative.o
-pcspeaker-y                    += ../../x86/kernel/pcspeaker.o
-perfctr-watchdog-y             += ../../x86/kernel/cpu/perfctr-watchdog.o
diff --git a/arch/x86_64/kernel/aperture_64.c b/arch/x86_64/kernel/aperture_64.c
deleted file mode 100644 (file)
index 8f681ca..0000000
+++ /dev/null
@@ -1,298 +0,0 @@
-/* 
- * Firmware replacement code.
- * 
- * Work around broken BIOSes that don't set an aperture or only set the
- * aperture in the AGP bridge. 
- * If all fails map the aperture over some low memory.  This is cheaper than 
- * doing bounce buffering. The memory is lost. This is done at early boot 
- * because only the bootmem allocator can allocate 32+MB. 
- * 
- * Copyright 2002 Andi Kleen, SuSE Labs.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/mmzone.h>
-#include <linux/pci_ids.h>
-#include <linux/pci.h>
-#include <linux/bitops.h>
-#include <linux/ioport.h>
-#include <asm/e820.h>
-#include <asm/io.h>
-#include <asm/iommu.h>
-#include <asm/pci-direct.h>
-#include <asm/dma.h>
-#include <asm/k8.h>
-
-int iommu_aperture;
-int iommu_aperture_disabled __initdata = 0;
-int iommu_aperture_allowed __initdata = 0;
-
-int fallback_aper_order __initdata = 1; /* 64MB */
-int fallback_aper_force __initdata = 0; 
-
-int fix_aperture __initdata = 1;
-
-static struct resource gart_resource = {
-       .name   = "GART",
-       .flags  = IORESOURCE_MEM,
-};
-
-static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
-{
-       gart_resource.start = aper_base;
-       gart_resource.end = aper_base + aper_size - 1;
-       insert_resource(&iomem_resource, &gart_resource);
-}
-
-/* This code runs before the PCI subsystem is initialized, so just
-   access the northbridge directly. */
-
-static u32 __init allocate_aperture(void) 
-{
-       u32 aper_size;
-       void *p; 
-
-       if (fallback_aper_order > 7) 
-               fallback_aper_order = 7; 
-       aper_size = (32 * 1024 * 1024) << fallback_aper_order; 
-
-       /* 
-        * Aperture has to be naturally aligned. This means an 2GB aperture won't
-        * have much chance of finding a place in the lower 4GB of memory.
-        * Unfortunately we cannot move it up because that would make the
-        * IOMMU useless.
-        */
-       p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
-       if (!p || __pa(p)+aper_size > 0xffffffff) {
-               printk("Cannot allocate aperture memory hole (%p,%uK)\n",
-                      p, aper_size>>10);
-               if (p)
-                       free_bootmem(__pa(p), aper_size);
-               return 0;
-       }
-       printk("Mapping aperture over %d KB of RAM @ %lx\n",
-              aper_size >> 10, __pa(p)); 
-       insert_aperture_resource((u32)__pa(p), aper_size);
-       return (u32)__pa(p); 
-}
-
-static int __init aperture_valid(u64 aper_base, u32 aper_size)
-{ 
-       if (!aper_base) 
-               return 0;
-       if (aper_size < 64*1024*1024) { 
-               printk("Aperture too small (%d MB)\n", aper_size>>20);
-               return 0;
-       }
-       if (aper_base + aper_size > 0x100000000UL) {
-               printk("Aperture beyond 4GB. Ignoring.\n");
-               return 0; 
-       }
-       if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
-               printk("Aperture pointing to e820 RAM. Ignoring.\n");
-               return 0; 
-       } 
-       return 1;
-} 
-
-/* Find a PCI capability */
-static __u32 __init find_cap(int num, int slot, int func, int cap) 
-{ 
-       u8 pos;
-       int bytes;
-       if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
-               return 0;
-       pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
-       for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 
-               u8 id;
-               pos &= ~3; 
-               id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
-               if (id == 0xff)
-                       break;
-               if (id == cap) 
-                       return pos; 
-               pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); 
-       } 
-       return 0;
-} 
-
-/* Read a standard AGPv3 bridge header */
-static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
-{ 
-       u32 apsize;
-       u32 apsizereg;
-       int nbits;
-       u32 aper_low, aper_hi;
-       u64 aper;
-
-       printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
-       apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
-       if (apsizereg == 0xffffffff) {
-               printk("APSIZE in AGP bridge unreadable\n");
-               return 0;
-       }
-
-       apsize = apsizereg & 0xfff;
-       /* Some BIOS use weird encodings not in the AGPv3 table. */
-       if (apsize & 0xff) 
-               apsize |= 0xf00; 
-       nbits = hweight16(apsize);
-       *order = 7 - nbits;
-       if ((int)*order < 0) /* < 32MB */
-               *order = 0;
-       
-       aper_low = read_pci_config(num,slot,func, 0x10);
-       aper_hi = read_pci_config(num,slot,func,0x14);
-       aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
-
-       printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 
-              aper, 32 << *order, apsizereg);
-
-       if (!aperture_valid(aper, (32*1024*1024) << *order))
-           return 0;
-       return (u32)aper; 
-} 
-
-/* Look for an AGP bridge. Windows only expects the aperture in the
-   AGP bridge and some BIOS forget to initialize the Northbridge too.
-   Work around this here. 
-
-   Do an PCI bus scan by hand because we're running before the PCI
-   subsystem. 
-
-   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
-   generically. It's probably overkill to always scan all slots because
-   the AGP bridges should be always an own bus on the HT hierarchy, 
-   but do it here for future safety. */
-static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
-{
-       int num, slot, func;
-
-       /* Poor man's PCI discovery */
-       for (num = 0; num < 256; num++) { 
-               for (slot = 0; slot < 32; slot++) { 
-                       for (func = 0; func < 8; func++) { 
-                               u32 class, cap;
-                               u8 type;
-                               class = read_pci_config(num,slot,func,
-                                                       PCI_CLASS_REVISION);
-                               if (class == 0xffffffff)
-                                       break; 
-                               
-                               switch (class >> 16) { 
-                               case PCI_CLASS_BRIDGE_HOST:
-                               case PCI_CLASS_BRIDGE_OTHER: /* needed? */
-                                       /* AGP bridge? */
-                                       cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
-                                       if (!cap)
-                                               break;
-                                       *valid_agp = 1; 
-                                       return read_agp(num,slot,func,cap,order);
-                               } 
-                               
-                               /* No multi-function device? */
-                               type = read_pci_config_byte(num,slot,func,
-                                                              PCI_HEADER_TYPE);
-                               if (!(type & 0x80))
-                                       break;
-                       } 
-               } 
-       }
-       printk("No AGP bridge found\n"); 
-       return 0;
-}
-
-void __init iommu_hole_init(void) 
-{ 
-       int fix, num; 
-       u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
-       u64 aper_base, last_aper_base = 0;
-       int valid_agp = 0;
-
-       if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
-               return;
-
-       printk(KERN_INFO  "Checking aperture...\n");
-
-       fix = 0;
-       for (num = 24; num < 32; num++) {               
-               if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-                       continue;
-
-               iommu_detected = 1;
-               iommu_aperture = 1; 
-
-               aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 
-               aper_size = (32 * 1024 * 1024) << aper_order; 
-               aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
-               aper_base <<= 25; 
-
-               printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, 
-                      aper_base, aper_size>>20);
-               
-               if (!aperture_valid(aper_base, aper_size)) {
-                       fix = 1; 
-                       break; 
-               }
-
-               if ((last_aper_order && aper_order != last_aper_order) ||
-                   (last_aper_base && aper_base != last_aper_base)) {
-                       fix = 1;
-                       break;
-               }
-               last_aper_order = aper_order;
-               last_aper_base = aper_base;
-       } 
-
-       if (!fix && !fallback_aper_force) {
-               if (last_aper_base) {
-                       unsigned long n = (32 * 1024 * 1024) << last_aper_order;
-                       insert_aperture_resource((u32)last_aper_base, n);
-               }
-               return; 
-       }
-
-       if (!fallback_aper_force)
-               aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 
-               
-       if (aper_alloc) { 
-               /* Got the aperture from the AGP bridge */
-       } else if (swiotlb && !valid_agp) {
-               /* Do nothing */
-       } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
-                  force_iommu ||
-                  valid_agp ||
-                  fallback_aper_force) { 
-               printk("Your BIOS doesn't leave a aperture memory hole\n");
-               printk("Please enable the IOMMU option in the BIOS setup\n");
-               printk("This costs you %d MB of RAM\n",
-                      32 << fallback_aper_order);
-
-               aper_order = fallback_aper_order;
-               aper_alloc = allocate_aperture();
-               if (!aper_alloc) { 
-                       /* Could disable AGP and IOMMU here, but it's probably
-                          not worth it. But the later users cannot deal with
-                          bad apertures and turning on the aperture over memory
-                          causes very strange problems, so it's better to 
-                          panic early. */
-                       panic("Not enough memory for aperture");
-               }
-       } else { 
-               return; 
-       } 
-
-       /* Fix up the north bridges */
-       for (num = 24; num < 32; num++) {               
-               if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-                       continue;       
-
-               /* Don't enable translation yet. That is done later. 
-                  Assume this BIOS didn't initialise the GART so 
-                  just overwrite all previous bits */ 
-               write_pci_config(0, num, 3, 0x90, aper_order<<1); 
-               write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 
-       } 
-} 
diff --git a/arch/x86_64/kernel/apic_64.c b/arch/x86_64/kernel/apic_64.c
deleted file mode 100644 (file)
index 925758d..0000000
+++ /dev/null
@@ -1,1253 +0,0 @@
-/*
- *     Local APIC handling, local APIC timers
- *
- *     (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- *     Fixes
- *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
- *                                     thanks to Eric Gilmore
- *                                     and Rolf G. Tews
- *                                     for testing these extensively.
- *     Maciej W. Rozycki       :       Various updates and fixes.
- *     Mikael Pettersson       :       Power Management for UP-APIC.
- *     Pavel Machek and
- *     Mikael Pettersson       :       PM converted to driver model.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/module.h>
-#include <linux/ioport.h>
-
-#include <asm/atomic.h>
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/pgalloc.h>
-#include <asm/mach_apic.h>
-#include <asm/nmi.h>
-#include <asm/idle.h>
-#include <asm/proto.h>
-#include <asm/timex.h>
-#include <asm/hpet.h>
-#include <asm/apic.h>
-
-int apic_mapped;
-int apic_verbosity;
-int apic_runs_main_timer;
-int apic_calibrate_pmtmr __initdata;
-
-int disable_apic_timer __initdata;
-
-/* Local APIC timer works in C2? */
-int local_apic_timer_c2_ok;
-EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-
-static struct resource *ioapic_resources;
-static struct resource lapic_resource = {
-       .name = "Local APIC",
-       .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
-};
-
-/*
- * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
- * IPIs in place of local APIC timers
- */
-static cpumask_t timer_interrupt_broadcast_ipi_mask;
-
-/* Using APIC to generate smp_local_timer_interrupt? */
-int using_apic_timer __read_mostly = 0;
-
-static void apic_pm_activate(void);
-
-void apic_wait_icr_idle(void)
-{
-       while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
-               cpu_relax();
-}
-
-unsigned int safe_apic_wait_icr_idle(void)
-{
-       unsigned int send_status;
-       int timeout;
-
-       timeout = 0;
-       do {
-               send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-               if (!send_status)
-                       break;
-               udelay(100);
-       } while (timeout++ < 1000);
-
-       return send_status;
-}
-
-void enable_NMI_through_LVT0 (void * dummy)
-{
-       unsigned int v;
-
-       /* unmask and set to NMI */
-       v = APIC_DM_NMI;
-       apic_write(APIC_LVT0, v);
-}
-
-int get_maxlvt(void)
-{
-       unsigned int v, maxlvt;
-
-       v = apic_read(APIC_LVR);
-       maxlvt = GET_APIC_MAXLVT(v);
-       return maxlvt;
-}
-
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
-       printk("unexpected IRQ trap at vector %02x\n", irq);
-       /*
-        * Currently unexpected vectors happen only on SMP and APIC.
-        * We _must_ ack these because every local APIC has only N
-        * irq slots per priority level, and a 'hanging, unacked' IRQ
-        * holds up an irq slot - in excessive cases (when multiple
-        * unexpected vectors occur) that might lock up the APIC
-        * completely.
-        * But don't ack when the APIC is disabled. -AK
-        */
-       if (!disable_apic)
-               ack_APIC_irq();
-}
-
-void clear_local_APIC(void)
-{
-       int maxlvt;
-       unsigned int v;
-
-       maxlvt = get_maxlvt();
-
-       /*
-        * Masking an LVT entry can trigger a local APIC error
-        * if the vector is zero. Mask LVTERR first to prevent this.
-        */
-       if (maxlvt >= 3) {
-               v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
-               apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
-       }
-       /*
-        * Careful: we have to set masks only first to deassert
-        * any level-triggered sources.
-        */
-       v = apic_read(APIC_LVTT);
-       apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
-       v = apic_read(APIC_LVT0);
-       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
-       v = apic_read(APIC_LVT1);
-       apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
-       if (maxlvt >= 4) {
-               v = apic_read(APIC_LVTPC);
-               apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
-       }
-
-       /*
-        * Clean APIC state for other OSs:
-        */
-       apic_write(APIC_LVTT, APIC_LVT_MASKED);
-       apic_write(APIC_LVT0, APIC_LVT_MASKED);
-       apic_write(APIC_LVT1, APIC_LVT_MASKED);
-       if (maxlvt >= 3)
-               apic_write(APIC_LVTERR, APIC_LVT_MASKED);
-       if (maxlvt >= 4)
-               apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-       apic_write(APIC_ESR, 0);
-       apic_read(APIC_ESR);
-}
-
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
-       /* Go back to Virtual Wire compatibility mode */
-       unsigned long value;
-
-       /* For the spurious interrupt use vector F, and enable it */
-       value = apic_read(APIC_SPIV);
-       value &= ~APIC_VECTOR_MASK;
-       value |= APIC_SPIV_APIC_ENABLED;
-       value |= 0xf;
-       apic_write(APIC_SPIV, value);
-
-       if (!virt_wire_setup) {
-               /* For LVT0 make it edge triggered, active high, external and enabled */
-               value = apic_read(APIC_LVT0);
-               value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
-               value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-               value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-               apic_write(APIC_LVT0, value);
-       } else {
-               /* Disable LVT0 */
-               apic_write(APIC_LVT0, APIC_LVT_MASKED);
-       }
-
-       /* For LVT1 make it edge triggered, active high, nmi and enabled */
-       value = apic_read(APIC_LVT1);
-       value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-                       APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-                       APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
-       value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-       value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-       apic_write(APIC_LVT1, value);
-}
-
-void disable_local_APIC(void)
-{
-       unsigned int value;
-
-       clear_local_APIC();
-
-       /*
-        * Disable APIC (implies clearing of registers
-        * for 82489DX!).
-        */
-       value = apic_read(APIC_SPIV);
-       value &= ~APIC_SPIV_APIC_ENABLED;
-       apic_write(APIC_SPIV, value);
-}
-
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
-       unsigned int reg0, reg1;
-
-       /*
-        * The version register is read-only in a real APIC.
-        */
-       reg0 = apic_read(APIC_LVR);
-       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
-       apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
-       reg1 = apic_read(APIC_LVR);
-       apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
-       /*
-        * The two version reads above should print the same
-        * numbers.  If the second one is different, then we
-        * poke at a non-APIC.
-        */
-       if (reg1 != reg0)
-               return 0;
-
-       /*
-        * Check if the version looks reasonably.
-        */
-       reg1 = GET_APIC_VERSION(reg0);
-       if (reg1 == 0x00 || reg1 == 0xff)
-               return 0;
-       reg1 = get_maxlvt();
-       if (reg1 < 0x02 || reg1 == 0xff)
-               return 0;
-
-       /*
-        * The ID register is read/write in a real APIC.
-        */
-       reg0 = apic_read(APIC_ID);
-       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
-       apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
-       reg1 = apic_read(APIC_ID);
-       apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
-       apic_write(APIC_ID, reg0);
-       if (reg1 != (reg0 ^ APIC_ID_MASK))
-               return 0;
-
-       /*
-        * The next two are just to see if we have sane values.
-        * They're only really relevant if we're in Virtual Wire
-        * compatibility mode, but most boxes are anymore.
-        */
-       reg0 = apic_read(APIC_LVT0);
-       apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
-       reg1 = apic_read(APIC_LVT1);
-       apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
-       return 1;
-}
-
-void __init sync_Arb_IDs(void)
-{
-       /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
-       unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-       if (ver >= 0x14)        /* P4 or higher */
-               return;
-
-       /*
-        * Wait for idle.
-        */
-       apic_wait_icr_idle();
-
-       apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
-       apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
-                               | APIC_DM_INIT);
-}
-
-/*
- * An initial setup of the virtual wire mode.
- */
-void __init init_bsp_APIC(void)
-{
-       unsigned int value;
-
-       /*
-        * Don't do the setup now if we have a SMP BIOS as the
-        * through-I/O-APIC virtual wire mode might be active.
-        */
-       if (smp_found_config || !cpu_has_apic)
-               return;
-
-       value = apic_read(APIC_LVR);
-
-       /*
-        * Do not trust the local APIC being empty at bootup.
-        */
-       clear_local_APIC();
-
-       /*
-        * Enable APIC.
-        */
-       value = apic_read(APIC_SPIV);
-       value &= ~APIC_VECTOR_MASK;
-       value |= APIC_SPIV_APIC_ENABLED;
-       value |= APIC_SPIV_FOCUS_DISABLED;
-       value |= SPURIOUS_APIC_VECTOR;
-       apic_write(APIC_SPIV, value);
-
-       /*
-        * Set up the virtual wire mode.
-        */
-       apic_write(APIC_LVT0, APIC_DM_EXTINT);
-       value = APIC_DM_NMI;
-       apic_write(APIC_LVT1, value);
-}
-
-void __cpuinit setup_local_APIC (void)
-{
-       unsigned int value, maxlvt;
-       int i, j;
-
-       value = apic_read(APIC_LVR);
-
-       BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
-
-       /*
-        * Double-check whether this APIC is really registered.
-        * This is meaningless in clustered apic mode, so we skip it.
-        */
-       if (!apic_id_registered())
-               BUG();
-
-       /*
-        * Intel recommends to set DFR, LDR and TPR before enabling
-        * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
-        * document number 292116).  So here it goes...
-        */
-       init_apic_ldr();
-
-       /*
-        * Set Task Priority to 'accept all'. We never change this
-        * later on.
-        */
-       value = apic_read(APIC_TASKPRI);
-       value &= ~APIC_TPRI_MASK;
-       apic_write(APIC_TASKPRI, value);
-
-       /*
-        * After a crash, we no longer service the interrupts and a pending
-        * interrupt from previous kernel might still have ISR bit set.
-        *
-        * Most probably by now CPU has serviced that pending interrupt and
-        * it might not have done the ack_APIC_irq() because it thought,
-        * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
-        * does not clear the ISR bit and cpu thinks it has already serivced
-        * the interrupt. Hence a vector might get locked. It was noticed
-        * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
-        */
-       for (i = APIC_ISR_NR - 1; i >= 0; i--) {
-               value = apic_read(APIC_ISR + i*0x10);
-               for (j = 31; j >= 0; j--) {
-                       if (value & (1<<j))
-                               ack_APIC_irq();
-               }
-       }
-
-       /*
-        * Now that we are all set up, enable the APIC
-        */
-       value = apic_read(APIC_SPIV);
-       value &= ~APIC_VECTOR_MASK;
-       /*
-        * Enable APIC
-        */
-       value |= APIC_SPIV_APIC_ENABLED;
-
-       /* We always use processor focus */
-
-       /*
-        * Set spurious IRQ vector
-        */
-       value |= SPURIOUS_APIC_VECTOR;
-       apic_write(APIC_SPIV, value);
-
-       /*
-        * Set up LVT0, LVT1:
-        *
-        * set up through-local-APIC on the BP's LINT0. This is not
-        * strictly necessary in pure symmetric-IO mode, but sometimes
-        * we delegate interrupts to the 8259A.
-        */
-       /*
-        * TODO: set up through-local-APIC from through-I/O-APIC? --macro
-        */
-       value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
-       if (!smp_processor_id() && !value) {
-               value = APIC_DM_EXTINT;
-               apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
-       } else {
-               value = APIC_DM_EXTINT | APIC_LVT_MASKED;
-               apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
-       }
-       apic_write(APIC_LVT0, value);
-
-       /*
-        * only the BP should see the LINT1 NMI signal, obviously.
-        */
-       if (!smp_processor_id())
-               value = APIC_DM_NMI;
-       else
-               value = APIC_DM_NMI | APIC_LVT_MASKED;
-       apic_write(APIC_LVT1, value);
-
-       {
-               unsigned oldvalue;
-               maxlvt = get_maxlvt();
-               oldvalue = apic_read(APIC_ESR);
-               value = ERROR_APIC_VECTOR;      // enables sending errors
-               apic_write(APIC_LVTERR, value);
-               /*
-                * spec says clear errors after enabling vector.
-                */
-               if (maxlvt > 3)
-                       apic_write(APIC_ESR, 0);
-               value = apic_read(APIC_ESR);
-               if (value != oldvalue)
-                       apic_printk(APIC_VERBOSE,
-                       "ESR value after enabling vector: %08x, after %08x\n",
-                       oldvalue, value);
-       }
-
-       nmi_watchdog_default();
-       setup_apic_nmi_watchdog(NULL);
-       apic_pm_activate();
-}
-
-#ifdef CONFIG_PM
-
-static struct {
-       /* 'active' is true if the local APIC was enabled by us and
-          not the BIOS; this signifies that we are also responsible
-          for disabling it before entering apm/acpi suspend */
-       int active;
-       /* r/w apic fields */
-       unsigned int apic_id;
-       unsigned int apic_taskpri;
-       unsigned int apic_ldr;
-       unsigned int apic_dfr;
-       unsigned int apic_spiv;
-       unsigned int apic_lvtt;
-       unsigned int apic_lvtpc;
-       unsigned int apic_lvt0;
-       unsigned int apic_lvt1;
-       unsigned int apic_lvterr;
-       unsigned int apic_tmict;
-       unsigned int apic_tdcr;
-       unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
-       unsigned long flags;
-       int maxlvt;
-
-       if (!apic_pm_state.active)
-               return 0;
-
-       maxlvt = get_maxlvt();
-
-       apic_pm_state.apic_id = apic_read(APIC_ID);
-       apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
-       apic_pm_state.apic_ldr = apic_read(APIC_LDR);
-       apic_pm_state.apic_dfr = apic_read(APIC_DFR);
-       apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
-       apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-       if (maxlvt >= 4)
-               apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
-       apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
-       apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
-       apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
-       apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
-       apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_INTEL
-       if (maxlvt >= 5)
-               apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-#endif
-       local_irq_save(flags);
-       disable_local_APIC();
-       local_irq_restore(flags);
-       return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
-       unsigned int l, h;
-       unsigned long flags;
-       int maxlvt;
-
-       if (!apic_pm_state.active)
-               return 0;
-
-       maxlvt = get_maxlvt();
-
-       local_irq_save(flags);
-       rdmsr(MSR_IA32_APICBASE, l, h);
-       l &= ~MSR_IA32_APICBASE_BASE;
-       l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
-       wrmsr(MSR_IA32_APICBASE, l, h);
-       apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
-       apic_write(APIC_ID, apic_pm_state.apic_id);
-       apic_write(APIC_DFR, apic_pm_state.apic_dfr);
-       apic_write(APIC_LDR, apic_pm_state.apic_ldr);
-       apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
-       apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
-       apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
-       apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_INTEL
-       if (maxlvt >= 5)
-               apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-#endif
-       if (maxlvt >= 4)
-               apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
-       apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
-       apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
-       apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
-       apic_write(APIC_ESR, 0);
-       apic_read(APIC_ESR);
-       apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
-       apic_write(APIC_ESR, 0);
-       apic_read(APIC_ESR);
-       local_irq_restore(flags);
-       return 0;
-}
-
-static struct sysdev_class lapic_sysclass = {
-       set_kset_name("lapic"),
-       .resume         = lapic_resume,
-       .suspend        = lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
-       .id             = 0,
-       .cls            = &lapic_sysclass,
-};
-
-static void __cpuinit apic_pm_activate(void)
-{
-       apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
-       int error;
-       if (!cpu_has_apic)
-               return 0;
-       /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-       error = sysdev_class_register(&lapic_sysclass);
-       if (!error)
-               error = sysdev_register(&device_lapic);
-       return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else  /* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif /* CONFIG_PM */
-
-static int __init apic_set_verbosity(char *str)
-{
-       if (str == NULL)  {
-               skip_ioapic_setup = 0;
-               ioapic_force = 1;
-               return 0;
-       }
-       if (strcmp("debug", str) == 0)
-               apic_verbosity = APIC_DEBUG;
-       else if (strcmp("verbose", str) == 0)
-               apic_verbosity = APIC_VERBOSE;
-       else {
-               printk(KERN_WARNING "APIC Verbosity level %s not recognised"
-                               " use apic=verbose or apic=debug\n", str);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-early_param("apic", apic_set_verbosity);
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- * On AMD64 we trust the BIOS - if it says no APIC it is likely
- * not correctly set up (usually the APIC timer won't work etc.)
- */
-
-static int __init detect_init_APIC (void)
-{
-       if (!cpu_has_apic) {
-               printk(KERN_INFO "No local APIC present\n");
-               return -1;
-       }
-
-       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-       boot_cpu_id = 0;
-       return 0;
-}
-
-#ifdef CONFIG_X86_IO_APIC
-static struct resource * __init ioapic_setup_resources(void)
-{
-#define IOAPIC_RESOURCE_NAME_SIZE 11
-       unsigned long n;
-       struct resource *res;
-       char *mem;
-       int i;
-
-       if (nr_ioapics <= 0)
-               return NULL;
-
-       n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
-       n *= nr_ioapics;
-
-       mem = alloc_bootmem(n);
-       res = (void *)mem;
-
-       if (mem != NULL) {
-               memset(mem, 0, n);
-               mem += sizeof(struct resource) * nr_ioapics;
-
-               for (i = 0; i < nr_ioapics; i++) {
-                       res[i].name = mem;
-                       res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-                       sprintf(mem,  "IOAPIC %u", i);
-                       mem += IOAPIC_RESOURCE_NAME_SIZE;
-               }
-       }
-
-       ioapic_resources = res;
-
-       return res;
-}
-
-static int __init ioapic_insert_resources(void)
-{
-       int i;
-       struct resource *r = ioapic_resources;
-
-       if (!r) {
-               printk("IO APIC resources could be not be allocated.\n");
-               return -1;
-       }
-
-       for (i = 0; i < nr_ioapics; i++) {
-               insert_resource(&iomem_resource, r);
-               r++;
-       }
-
-       return 0;
-}
-
-/* Insert the IO APIC resources after PCI initialization has occured to handle
- * IO APICS that are mapped in on a BAR in PCI space. */
-late_initcall(ioapic_insert_resources);
-#endif
-
-void __init init_apic_mappings(void)
-{
-       unsigned long apic_phys;
-
-       /*
-        * If no local APIC can be found then set up a fake all
-        * zeroes page to simulate the local APIC and another
-        * one for the IO-APIC.
-        */
-       if (!smp_found_config && detect_init_APIC()) {
-               apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
-               apic_phys = __pa(apic_phys);
-       } else
-               apic_phys = mp_lapic_addr;
-
-       set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-       apic_mapped = 1;
-       apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
-
-       /* Put local APIC into the resource map. */
-       lapic_resource.start = apic_phys;
-       lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
-       insert_resource(&iomem_resource, &lapic_resource);
-
-       /*
-        * Fetch the APIC ID of the BSP in case we have a
-        * default configuration (or the MP table is broken).
-        */
-       boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-
-       {
-               unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
-               int i;
-               struct resource *ioapic_res;
-
-               ioapic_res = ioapic_setup_resources();
-               for (i = 0; i < nr_ioapics; i++) {
-                       if (smp_found_config) {
-                               ioapic_phys = mp_ioapics[i].mpc_apicaddr;
-                       } else {
-                               ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
-                               ioapic_phys = __pa(ioapic_phys);
-                       }
-                       set_fixmap_nocache(idx, ioapic_phys);
-                       apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
-                                       __fix_to_virt(idx), ioapic_phys);
-                       idx++;
-
-                       if (ioapic_res != NULL) {
-                               ioapic_res->start = ioapic_phys;
-                               ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
-                               ioapic_res++;
-                       }
-               }
-       }
-}
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
- */
-
-#define APIC_DIVISOR 16
-
-static void __setup_APIC_LVTT(unsigned int clocks)
-{
-       unsigned int lvtt_value, tmp_value;
-       int cpu = smp_processor_id();
-
-       lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
-
-       if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask))
-               lvtt_value |= APIC_LVT_MASKED;
-
-       apic_write(APIC_LVTT, lvtt_value);
-
-       /*
-        * Divide PICLK by 16
-        */
-       tmp_value = apic_read(APIC_TDCR);
-       apic_write(APIC_TDCR, (tmp_value
-                               & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-                               | APIC_TDR_DIV_16);
-
-       apic_write(APIC_TMICT, clocks/APIC_DIVISOR);
-}
-
-static void setup_APIC_timer(unsigned int clocks)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-
-       /* wait for irq slice */
-       if (hpet_address && hpet_use_timer) {
-               u32 trigger = hpet_readl(HPET_T0_CMP);
-               while (hpet_readl(HPET_T0_CMP) == trigger)
-                       /* do nothing */ ;
-       } else {
-               int c1, c2;
-               outb_p(0x00, 0x43);
-               c2 = inb_p(0x40);
-               c2 |= inb_p(0x40) << 8;
-               do {
-                       c1 = c2;
-                       outb_p(0x00, 0x43);
-                       c2 = inb_p(0x40);
-                       c2 |= inb_p(0x40) << 8;
-               } while (c2 - c1 < 300);
-       }
-       __setup_APIC_LVTT(clocks);
-       /* Turn off PIT interrupt if we use APIC timer as main timer.
-          Only works with the PM timer right now
-          TBD fix it for HPET too. */
-       if ((pmtmr_ioport != 0) &&
-               smp_processor_id() == boot_cpu_id &&
-               apic_runs_main_timer == 1 &&
-               !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) {
-               stop_timer_interrupt();
-               apic_runs_main_timer++;
-       }
-       local_irq_restore(flags);
-}
-
-/*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once since we
- * want to have local timer irqs syncron. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyways, no accidental
- * APIC irq that way.
- */
-
-#define TICK_COUNT 100000000
-
-static int __init calibrate_APIC_clock(void)
-{
-       unsigned apic, apic_start;
-       unsigned long tsc, tsc_start;
-       int result;
-       /*
-        * Put whatever arbitrary (but long enough) timeout
-        * value into the APIC clock, we just want to get the
-        * counter running for calibration.
-        */
-       __setup_APIC_LVTT(4000000000);
-
-       apic_start = apic_read(APIC_TMCCT);
-#ifdef CONFIG_X86_PM_TIMER
-       if (apic_calibrate_pmtmr && pmtmr_ioport) {
-               pmtimer_wait(5000);  /* 5ms wait */
-               apic = apic_read(APIC_TMCCT);
-               result = (apic_start - apic) * 1000L / 5;
-       } else
-#endif
-       {
-               rdtscll(tsc_start);
-
-               do {
-                       apic = apic_read(APIC_TMCCT);
-                       rdtscll(tsc);
-               } while ((tsc - tsc_start) < TICK_COUNT &&
-                               (apic_start - apic) < TICK_COUNT);
-
-               result = (apic_start - apic) * 1000L * tsc_khz /
-                                       (tsc - tsc_start);
-       }
-       printk("result %d\n", result);
-
-
-       printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
-               result / 1000 / 1000, result / 1000 % 1000);
-
-       return result * APIC_DIVISOR / HZ;
-}
-
-static unsigned int calibration_result;
-
-void __init setup_boot_APIC_clock (void)
-{
-       if (disable_apic_timer) {
-               printk(KERN_INFO "Disabling APIC timer\n");
-               return;
-       }
-
-       printk(KERN_INFO "Using local APIC timer interrupts.\n");
-       using_apic_timer = 1;
-
-       local_irq_disable();
-
-       calibration_result = calibrate_APIC_clock();
-       /*
-        * Now set up the timer for real.
-        */
-       setup_APIC_timer(calibration_result);
-
-       local_irq_enable();
-}
-
-void __cpuinit setup_secondary_APIC_clock(void)
-{
-       local_irq_disable(); /* FIXME: Do we need this? --RR */
-       setup_APIC_timer(calibration_result);
-       local_irq_enable();
-}
-
-void disable_APIC_timer(void)
-{
-       if (using_apic_timer) {
-               unsigned long v;
-
-               v = apic_read(APIC_LVTT);
-               /*
-                * When an illegal vector value (0-15) is written to an LVT
-                * entry and delivery mode is Fixed, the APIC may signal an
-                * illegal vector error, with out regard to whether the mask
-                * bit is set or whether an interrupt is actually seen on input.
-                *
-                * Boot sequence might call this function when the LVTT has
-                * '0' vector value. So make sure vector field is set to
-                * valid value.
-                */
-               v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-               apic_write(APIC_LVTT, v);
-       }
-}
-
-void enable_APIC_timer(void)
-{
-       int cpu = smp_processor_id();
-
-       if (using_apic_timer &&
-           !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
-               unsigned long v;
-
-               v = apic_read(APIC_LVTT);
-               apic_write(APIC_LVTT, v & ~APIC_LVT_MASKED);
-       }
-}
-
-void switch_APIC_timer_to_ipi(void *cpumask)
-{
-       cpumask_t mask = *(cpumask_t *)cpumask;
-       int cpu = smp_processor_id();
-
-       if (cpu_isset(cpu, mask) &&
-           !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
-               disable_APIC_timer();
-               cpu_set(cpu, timer_interrupt_broadcast_ipi_mask);
-       }
-}
-EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
-
-void smp_send_timer_broadcast_ipi(void)
-{
-       int cpu = smp_processor_id();
-       cpumask_t mask;
-
-       cpus_and(mask, cpu_online_map, timer_interrupt_broadcast_ipi_mask);
-
-       if (cpu_isset(cpu, mask)) {
-               cpu_clear(cpu, mask);
-               add_pda(apic_timer_irqs, 1);
-               smp_local_timer_interrupt();
-       }
-
-       if (!cpus_empty(mask)) {
-               send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-       }
-}
-
-void switch_ipi_to_APIC_timer(void *cpumask)
-{
-       cpumask_t mask = *(cpumask_t *)cpumask;
-       int cpu = smp_processor_id();
-
-       if (cpu_isset(cpu, mask) &&
-           cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) {
-               cpu_clear(cpu, timer_interrupt_broadcast_ipi_mask);
-               enable_APIC_timer();
-       }
-}
-EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
-
-int setup_profiling_timer(unsigned int multiplier)
-{
-       return -EINVAL;
-}
-
-void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
-                            unsigned char msg_type, unsigned char mask)
-{
-       unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
-       unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
-       apic_write(reg, v);
-}
-
-#undef APIC_DIVISOR
-
-/*
- * Local timer interrupt handler. It does both profiling and
- * process statistics/rescheduling.
- *
- * We do profiling in every local tick, statistics/rescheduling
- * happen only every 'profiling multiplier' ticks. The default
- * multiplier is 1 and it can be changed by writing the new multiplier
- * value into /proc/profile.
- */
-
-void smp_local_timer_interrupt(void)
-{
-       profile_tick(CPU_PROFILING);
-#ifdef CONFIG_SMP
-       update_process_times(user_mode(get_irq_regs()));
-#endif
-       if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id)
-               main_timer_handler();
-       /*
-        * We take the 'long' return path, and there every subsystem
-        * grabs the appropriate locks (kernel lock/ irq lock).
-        *
-        * We might want to decouple profiling from the 'long path',
-        * and do the profiling totally in assembly.
-        *
-        * Currently this isn't too much of an issue (performance wise),
-        * we can take more than 100K local irqs per second on a 100 MHz P5.
-        */
-}
-
-/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- *   interrupt as well. Thus we cannot inline the local irq ... ]
- */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
-{
-       struct pt_regs *old_regs = set_irq_regs(regs);
-
-       /*
-        * the NMI deadlock-detector uses this.
-        */
-       add_pda(apic_timer_irqs, 1);
-
-       /*
-        * NOTE! We'd better ACK the irq immediately,
-        * because timer handling can be slow.
-        */
-       ack_APIC_irq();
-       /*
-        * update_process_times() expects us to have done irq_enter().
-        * Besides, if we don't timer interrupts ignore the global
-        * interrupt lock, which is the WrongThing (tm) to do.
-        */
-       exit_idle();
-       irq_enter();
-       smp_local_timer_interrupt();
-       irq_exit();
-       set_irq_regs(old_regs);
-}
-
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
-{
-       int i, clusters, zeros;
-       unsigned id;
-       DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
-       bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
-       for (i = 0; i < NR_CPUS; i++) {
-               id = bios_cpu_apicid[i];
-               if (id != BAD_APICID)
-                       __set_bit(APIC_CLUSTERID(id), clustermap);
-       }
-
-       /* Problem:  Partially populated chassis may not have CPUs in some of
-        * the APIC clusters they have been allocated.  Only present CPUs have
-        * bios_cpu_apicid entries, thus causing zeroes in the bitmap.  Since
-        * clusters are allocated sequentially, count zeros only if they are
-        * bounded by ones.
-        */
-       clusters = 0;
-       zeros = 0;
-       for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
-               if (test_bit(i, clustermap)) {
-                       clusters += 1 + zeros;
-                       zeros = 0;
-               } else
-                       ++zeros;
-       }
-
-       /*
-        * If clusters > 2, then should be multi-chassis.
-        * May have to revisit this when multi-core + hyperthreaded CPUs come
-        * out, but AFAIK this will work even for them.
-        */
-       return (clusters > 2);
-}
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
- */
-asmlinkage void smp_spurious_interrupt(void)
-{
-       unsigned int v;
-       exit_idle();
-       irq_enter();
-       /*
-        * Check if this really is a spurious interrupt and ACK it
-        * if it is a vectored one.  Just in case...
-        * Spurious interrupts should not be ACKed.
-        */
-       v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
-       if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
-               ack_APIC_irq();
-
-       irq_exit();
-}
-
-/*
- * This interrupt should never happen with our APIC/SMP architecture
- */
-
-asmlinkage void smp_error_interrupt(void)
-{
-       unsigned int v, v1;
-
-       exit_idle();
-       irq_enter();
-       /* First tickle the hardware, only then report what went on. -- REW */
-       v = apic_read(APIC_ESR);
-       apic_write(APIC_ESR, 0);
-       v1 = apic_read(APIC_ESR);
-       ack_APIC_irq();
-       atomic_inc(&irq_err_count);
-
-       /* Here is what the APIC error bits mean:
-          0: Send CS error
-          1: Receive CS error
-          2: Send accept error
-          3: Receive accept error
-          4: Reserved
-          5: Send illegal vector
-          6: Received illegal vector
-          7: Illegal register address
-       */
-       printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
-               smp_processor_id(), v , v1);
-       irq_exit();
-}
-
-int disable_apic;
-
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
-int __init APIC_init_uniprocessor (void)
-{
-       if (disable_apic) {
-               printk(KERN_INFO "Apic disabled\n");
-               return -1;
-       }
-       if (!cpu_has_apic) {
-               disable_apic = 1;
-               printk(KERN_INFO "Apic disabled by BIOS\n");
-               return -1;
-       }
-
-       verify_local_APIC();
-
-       phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
-       apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
-
-       setup_local_APIC();
-
-       if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
-               setup_IO_APIC();
-       else
-               nr_ioapics = 0;
-       setup_boot_APIC_clock();
-       check_nmi_watchdog();
-       return 0;
-}
-
-static __init int setup_disableapic(char *str)
-{
-       disable_apic = 1;
-       clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-       return 0;
-}
-early_param("disableapic", setup_disableapic);
-
-/* same as disableapic, for compatibility */
-static __init int setup_nolapic(char *str)
-{
-       return setup_disableapic(str);
-}
-early_param("nolapic", setup_nolapic);
-
-static int __init parse_lapic_timer_c2_ok(char *arg)
-{
-       local_apic_timer_c2_ok = 1;
-       return 0;
-}
-early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-
-static __init int setup_noapictimer(char *str)
-{
-       if (str[0] != ' ' && str[0] != 0)
-               return 0;
-       disable_apic_timer = 1;
-       return 1;
-}
-
-static __init int setup_apicmaintimer(char *str)
-{
-       apic_runs_main_timer = 1;
-       nohpet = 1;
-       return 1;
-}
-__setup("apicmaintimer", setup_apicmaintimer);
-
-static __init int setup_noapicmaintimer(char *str)
-{
-       apic_runs_main_timer = -1;
-       return 1;
-}
-__setup("noapicmaintimer", setup_noapicmaintimer);
-
-static __init int setup_apicpmtimer(char *s)
-{
-       apic_calibrate_pmtmr = 1;
-       notsc_setup(NULL);
-       return setup_apicmaintimer(NULL);
-}
-__setup("apicpmtimer", setup_apicpmtimer);
-
-__setup("noapictimer", setup_noapictimer);
-
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
deleted file mode 100644 (file)
index cfa82c8..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifdef CONFIG_X86_32
-# include "asm-offsets_32.c"
-#else
-# include "asm-offsets_64.c"
-#endif
diff --git a/arch/x86_64/kernel/asm-offsets_64.c b/arch/x86_64/kernel/asm-offsets_64.c
deleted file mode 100644 (file)
index 778953b..0000000
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed to extract
- * and format the required data.
- */
-
-#include <linux/crypto.h>
-#include <linux/sched.h> 
-#include <linux/stddef.h>
-#include <linux/errno.h> 
-#include <linux/hardirq.h>
-#include <linux/suspend.h>
-#include <asm/pda.h>
-#include <asm/processor.h>
-#include <asm/segment.h>
-#include <asm/thread_info.h>
-#include <asm/ia32.h>
-
-#define DEFINE(sym, val) \
-        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
-
-#define BLANK() asm volatile("\n->" : : )
-
-#define __NO_STUBS 1
-#undef __SYSCALL
-#undef _ASM_X86_64_UNISTD_H_
-#define __SYSCALL(nr, sym) [nr] = 1,
-static char syscalls[] = {
-#include <asm/unistd.h>
-};
-
-int main(void)
-{
-#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
-       ENTRY(state);
-       ENTRY(flags); 
-       ENTRY(thread); 
-       ENTRY(pid);
-       BLANK();
-#undef ENTRY
-#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
-       ENTRY(flags);
-       ENTRY(addr_limit);
-       ENTRY(preempt_count);
-       ENTRY(status);
-       BLANK();
-#undef ENTRY
-#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
-       ENTRY(kernelstack); 
-       ENTRY(oldrsp); 
-       ENTRY(pcurrent); 
-       ENTRY(irqcount);
-       ENTRY(cpunumber);
-       ENTRY(irqstackptr);
-       ENTRY(data_offset);
-       BLANK();
-#undef ENTRY
-#ifdef CONFIG_IA32_EMULATION
-#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
-       ENTRY(eax);
-       ENTRY(ebx);
-       ENTRY(ecx);
-       ENTRY(edx);
-       ENTRY(esi);
-       ENTRY(edi);
-       ENTRY(ebp);
-       ENTRY(esp);
-       ENTRY(eip);
-       BLANK();
-#undef ENTRY
-       DEFINE(IA32_RT_SIGFRAME_sigcontext,
-              offsetof (struct rt_sigframe32, uc.uc_mcontext));
-       BLANK();
-#endif
-       DEFINE(pbe_address, offsetof(struct pbe, address));
-       DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
-       DEFINE(pbe_next, offsetof(struct pbe, next));
-       BLANK();
-       DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
-       BLANK();
-       DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
-       BLANK();
-       DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
-       return 0;
-}
diff --git a/arch/x86_64/kernel/audit_64.c b/arch/x86_64/kernel/audit_64.c
deleted file mode 100644 (file)
index 06d3e5a..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/audit.h>
-#include <asm/unistd.h>
-
-static unsigned dir_class[] = {
-#include <asm-generic/audit_dir_write.h>
-~0U
-};
-
-static unsigned read_class[] = {
-#include <asm-generic/audit_read.h>
-~0U
-};
-
-static unsigned write_class[] = {
-#include <asm-generic/audit_write.h>
-~0U
-};
-
-static unsigned chattr_class[] = {
-#include <asm-generic/audit_change_attr.h>
-~0U
-};
-
-static unsigned signal_class[] = {
-#include <asm-generic/audit_signal.h>
-~0U
-};
-
-int audit_classify_arch(int arch)
-{
-#ifdef CONFIG_IA32_EMULATION
-       if (arch == AUDIT_ARCH_I386)
-               return 1;
-#endif
-       return 0;
-}
-
-int audit_classify_syscall(int abi, unsigned syscall)
-{
-#ifdef CONFIG_IA32_EMULATION
-       extern int ia32_classify_syscall(unsigned);
-       if (abi == AUDIT_ARCH_I386)
-               return ia32_classify_syscall(syscall);
-#endif
-       switch(syscall) {
-       case __NR_open:
-               return 2;
-       case __NR_openat:
-               return 3;
-       case __NR_execve:
-               return 5;
-       default:
-               return 0;
-       }
-}
-
-static int __init audit_classes_init(void)
-{
-#ifdef CONFIG_IA32_EMULATION
-       extern __u32 ia32_dir_class[];
-       extern __u32 ia32_write_class[];
-       extern __u32 ia32_read_class[];
-       extern __u32 ia32_chattr_class[];
-       extern __u32 ia32_signal_class[];
-       audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
-       audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
-       audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
-       audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
-       audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
-#endif
-       audit_register_class(AUDIT_CLASS_WRITE, write_class);
-       audit_register_class(AUDIT_CLASS_READ, read_class);
-       audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
-       audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
-       audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
-       return 0;
-}
-
-__initcall(audit_classes_init);
diff --git a/arch/x86_64/kernel/bugs_64.c b/arch/x86_64/kernel/bugs_64.c
deleted file mode 100644 (file)
index 4e5e9d3..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- *  arch/x86_64/kernel/bugs.c
- *
- *  Copyright (C) 1994  Linus Torvalds
- *  Copyright (C) 2000  SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-
-void __init check_bugs(void)
-{
-       identify_cpu(&boot_cpu_data);
-       mtrr_bp_init();
-#if !defined(CONFIG_SMP)
-       printk("CPU: ");
-       print_cpu_info(&boot_cpu_data);
-#endif
-       alternative_instructions();
-}
diff --git a/arch/x86_64/kernel/crash_64.c b/arch/x86_64/kernel/crash_64.c
deleted file mode 100644 (file)
index 13432a1..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Architecture specific (x86_64) functions for kexec based crash dumps.
- *
- * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
- *
- * Copyright (C) IBM Corporation, 2004. All rights reserved.
- *
- */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/irq.h>
-#include <linux/reboot.h>
-#include <linux/kexec.h>
-#include <linux/delay.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/kdebug.h>
-
-#include <asm/processor.h>
-#include <asm/hardirq.h>
-#include <asm/nmi.h>
-#include <asm/hw_irq.h>
-#include <asm/mach_apic.h>
-
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
-
-#ifdef CONFIG_SMP
-static atomic_t waiting_for_crash_ipi;
-
-static int crash_nmi_callback(struct notifier_block *self,
-                               unsigned long val, void *data)
-{
-       struct pt_regs *regs;
-       int cpu;
-
-       if (val != DIE_NMI_IPI)
-               return NOTIFY_OK;
-
-       regs = ((struct die_args *)data)->regs;
-       cpu = raw_smp_processor_id();
-
-       /*
-        * Don't do anything if this handler is invoked on crashing cpu.
-        * Otherwise, system will completely hang. Crashing cpu can get
-        * an NMI if system was initially booted with nmi_watchdog parameter.
-        */
-       if (cpu == crashing_cpu)
-               return NOTIFY_STOP;
-       local_irq_disable();
-
-       crash_save_cpu(regs, cpu);
-       disable_local_APIC();
-       atomic_dec(&waiting_for_crash_ipi);
-       /* Assume hlt works */
-       for(;;)
-               halt();
-
-       return 1;
-}
-
-static void smp_send_nmi_allbutself(void)
-{
-       send_IPI_allbutself(NMI_VECTOR);
-}
-
-/*
- * This code is a best effort heuristic to get the
- * other cpus to stop executing. So races with
- * cpu hotplug shouldn't matter.
- */
-
-static struct notifier_block crash_nmi_nb = {
-       .notifier_call = crash_nmi_callback,
-};
-
-static void nmi_shootdown_cpus(void)
-{
-       unsigned long msecs;
-
-       atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
-       if (register_die_notifier(&crash_nmi_nb))
-               return;         /* return what? */
-
-       /*
-        * Ensure the new callback function is set before sending
-        * out the NMI
-        */
-       wmb();
-
-       smp_send_nmi_allbutself();
-
-       msecs = 1000; /* Wait at most a second for the other cpus to stop */
-       while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
-               mdelay(1);
-               msecs--;
-       }
-       /* Leave the nmi callback set */
-       disable_local_APIC();
-}
-#else
-static void nmi_shootdown_cpus(void)
-{
-       /* There are no cpus to shootdown */
-}
-#endif
-
-void machine_crash_shutdown(struct pt_regs *regs)
-{
-       /*
-        * This function is only called after the system
-        * has panicked or is otherwise in a critical state.
-        * The minimum amount of code to allow a kexec'd kernel
-        * to run successfully needs to happen here.
-        *
-        * In practice this means shooting down the other cpus in
-        * an SMP system.
-        */
-       /* The kernel is broken so disable interrupts */
-       local_irq_disable();
-
-       /* Make a note of crashing cpu. Will be used in NMI callback.*/
-       crashing_cpu = smp_processor_id();
-       nmi_shootdown_cpus();
-
-       if(cpu_has_apic)
-                disable_local_APIC();
-
-       disable_IO_APIC();
-
-       crash_save_cpu(regs, smp_processor_id());
-}
diff --git a/arch/x86_64/kernel/crash_dump_64.c b/arch/x86_64/kernel/crash_dump_64.c
deleted file mode 100644 (file)
index 942deac..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- *     kernel/crash_dump.c - Memory preserving reboot related code.
- *
- *     Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
- *     Copyright (C) IBM Corporation, 2004. All rights reserved
- */
-
-#include <linux/errno.h>
-#include <linux/crash_dump.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
-
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- *     space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- *     otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-                               size_t csize, unsigned long offset, int userbuf)
-{
-       void  *vaddr;
-
-       if (!csize)
-               return 0;
-
-       vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
-
-       if (userbuf) {
-               if (copy_to_user(buf, (vaddr + offset), csize)) {
-                       iounmap(vaddr);
-                       return -EFAULT;
-               }
-       } else
-       memcpy(buf, (vaddr + offset), csize);
-
-       iounmap(vaddr);
-       return csize;
-}
diff --git a/arch/x86_64/kernel/e820_64.c b/arch/x86_64/kernel/e820_64.c
deleted file mode 100644 (file)
index 0f4d5e2..0000000
+++ /dev/null
@@ -1,725 +0,0 @@
-/* 
- * Handle the memory map.
- * The functions here do the job until bootmem takes over.
- *
- *  Getting sanitize_e820_map() in sync with i386 version by applying change:
- *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
- *     Alex Achenbach <xela@slit.de>, December 2002.
- *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- *
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/suspend.h>
-#include <linux/pfn.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/proto.h>
-#include <asm/bootsetup.h>
-#include <asm/sections.h>
-
-struct e820map e820;
-
-/* 
- * PFN of last memory page.
- */
-unsigned long end_pfn; 
-EXPORT_SYMBOL(end_pfn);
-
-/* 
- * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
- * The direct mapping extends to end_pfn_map, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */ 
-unsigned long end_pfn_map; 
-
-/* 
- * Last pfn which the user wants to use.
- */
-static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
-
-extern struct resource code_resource, data_resource;
-
-/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 
-static inline int bad_addr(unsigned long *addrp, unsigned long size)
-{ 
-       unsigned long addr = *addrp, last = addr + size; 
-
-       /* various gunk below that needed for SMP startup */
-       if (addr < 0x8000) { 
-               *addrp = PAGE_ALIGN(0x8000);
-               return 1; 
-       }
-
-       /* direct mapping tables of the kernel */
-       if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { 
-               *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
-               return 1;
-       } 
-
-       /* initrd */ 
-#ifdef CONFIG_BLK_DEV_INITRD
-       if (LOADER_TYPE && INITRD_START && last >= INITRD_START && 
-           addr < INITRD_START+INITRD_SIZE) { 
-               *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
-               return 1;
-       } 
-#endif
-       /* kernel code */
-       if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
-               *addrp = PAGE_ALIGN(__pa_symbol(&_end));
-               return 1;
-       }
-
-       if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
-               *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
-               return 1;
-       }
-
-#ifdef CONFIG_NUMA
-       /* NUMA memory to node map */
-       if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
-               *addrp = nodemap_addr + nodemap_size;
-               return 1;
-       }
-#endif
-       /* XXX ramdisk image here? */ 
-       return 0;
-} 
-
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
-{ 
-       int i;
-       for (i = 0; i < e820.nr_map; i++) { 
-               struct e820entry *ei = &e820.map[i]; 
-               if (type && ei->type != type) 
-                       continue;
-               if (ei->addr >= end || ei->addr + ei->size <= start)
-                       continue; 
-               return 1; 
-       } 
-       return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
-/*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
-{
-       int i;
-       for (i = 0; i < e820.nr_map; i++) {
-               struct e820entry *ei = &e820.map[i];
-               if (type && ei->type != type)
-                       continue;
-               /* is the region (part) in overlap with the current region ?*/
-               if (ei->addr >= end || ei->addr + ei->size <= start)
-                       continue;
-
-               /* if the region is at the beginning of <start,end> we move
-                * start to the end of the region since it's ok until there
-                */
-               if (ei->addr <= start)
-                       start = ei->addr + ei->size;
-               /* if start is now at or beyond end, we're done, full coverage */
-               if (start >= end)
-                       return 1; /* we're done */
-       }
-       return 0;
-}
-
-/* 
- * Find a free area in a specific range. 
- */ 
-unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 
-{ 
-       int i; 
-       for (i = 0; i < e820.nr_map; i++) { 
-               struct e820entry *ei = &e820.map[i]; 
-               unsigned long addr = ei->addr, last; 
-               if (ei->type != E820_RAM) 
-                       continue; 
-               if (addr < start) 
-                       addr = start;
-               if (addr > ei->addr + ei->size) 
-                       continue; 
-               while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
-                       ;
-               last = PAGE_ALIGN(addr) + size;
-               if (last > ei->addr + ei->size)
-                       continue;
-               if (last > end) 
-                       continue;
-               return addr; 
-       } 
-       return -1UL;            
-} 
-
-/*
- * Find the highest page frame number we have available
- */
-unsigned long __init e820_end_of_ram(void)
-{
-       unsigned long end_pfn = 0;
-       end_pfn = find_max_pfn_with_active_regions();
-       
-       if (end_pfn > end_pfn_map) 
-               end_pfn_map = end_pfn;
-       if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
-               end_pfn_map = MAXMEM>>PAGE_SHIFT;
-       if (end_pfn > end_user_pfn)
-               end_pfn = end_user_pfn;
-       if (end_pfn > end_pfn_map) 
-               end_pfn = end_pfn_map; 
-
-       printk("end_pfn_map = %lu\n", end_pfn_map);
-       return end_pfn; 
-}
-
-/*
- * Mark e820 reserved areas as busy for the resource manager.
- */
-void __init e820_reserve_resources(void)
-{
-       int i;
-       for (i = 0; i < e820.nr_map; i++) {
-               struct resource *res;
-               res = alloc_bootmem_low(sizeof(struct resource));
-               switch (e820.map[i].type) {
-               case E820_RAM:  res->name = "System RAM"; break;
-               case E820_ACPI: res->name = "ACPI Tables"; break;
-               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
-               default:        res->name = "reserved";
-               }
-               res->start = e820.map[i].addr;
-               res->end = res->start + e820.map[i].size - 1;
-               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-               request_resource(&iomem_resource, res);
-               if (e820.map[i].type == E820_RAM) {
-                       /*
-                        *  We don't know which RAM region contains kernel data,
-                        *  so we try it repeatedly and let the resource manager
-                        *  test it.
-                        */
-                       request_resource(res, &code_resource);
-                       request_resource(res, &data_resource);
-#ifdef CONFIG_KEXEC
-                       request_resource(res, &crashk_res);
-#endif
-               }
-       }
-}
-
-/*
- * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for software
- * suspend and suspend to RAM.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
-       int i;
-       unsigned long paddr;
-
-       paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
-       for (i = 1; i < e820.nr_map; i++) {
-               struct e820entry *ei = &e820.map[i];
-
-               if (paddr < ei->addr)
-                       register_nosave_region(PFN_DOWN(paddr),
-                                               PFN_UP(ei->addr));
-
-               paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
-               if (ei->type != E820_RAM)
-                       register_nosave_region(PFN_UP(ei->addr),
-                                               PFN_DOWN(paddr));
-
-               if (paddr >= (end_pfn << PAGE_SHIFT))
-                       break;
-       }
-}
-
-/*
- * Finds an active region in the address range from start_pfn to end_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-static int __init e820_find_active_region(const struct e820entry *ei,
-                                         unsigned long start_pfn,
-                                         unsigned long end_pfn,
-                                         unsigned long *ei_startpfn,
-                                         unsigned long *ei_endpfn)
-{
-       *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
-       *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
-
-       /* Skip map entries smaller than a page */
-       if (*ei_startpfn >= *ei_endpfn)
-               return 0;
-
-       /* Check if end_pfn_map should be updated */
-       if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
-               end_pfn_map = *ei_endpfn;
-
-       /* Skip if map is outside the node */
-       if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
-                                   *ei_startpfn >= end_pfn)
-               return 0;
-
-       /* Check for overlaps */
-       if (*ei_startpfn < start_pfn)
-               *ei_startpfn = start_pfn;
-       if (*ei_endpfn > end_pfn)
-               *ei_endpfn = end_pfn;
-
-       /* Obey end_user_pfn to save on memmap */
-       if (*ei_startpfn >= end_user_pfn)
-               return 0;
-       if (*ei_endpfn > end_user_pfn)
-               *ei_endpfn = end_user_pfn;
-
-       return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init
-e820_register_active_regions(int nid, unsigned long start_pfn,
-                                                       unsigned long end_pfn)
-{
-       unsigned long ei_startpfn;
-       unsigned long ei_endpfn;
-       int i;
-
-       for (i = 0; i < e820.nr_map; i++)
-               if (e820_find_active_region(&e820.map[i],
-                                           start_pfn, end_pfn,
-                                           &ei_startpfn, &ei_endpfn))
-                       add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/* 
- * Add a memory region to the kernel e820 map.
- */ 
-void __init add_memory_region(unsigned long start, unsigned long size, int type)
-{
-       int x = e820.nr_map;
-
-       if (x == E820MAX) {
-               printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
-               return;
-       }
-
-       e820.map[x].addr = start;
-       e820.map[x].size = size;
-       e820.map[x].type = type;
-       e820.nr_map++;
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
-{
-       unsigned long start_pfn = start >> PAGE_SHIFT;
-       unsigned long end_pfn = end >> PAGE_SHIFT;
-       unsigned long ei_startpfn;
-       unsigned long ei_endpfn;
-       unsigned long ram = 0;
-       int i;
-
-       for (i = 0; i < e820.nr_map; i++) {
-               if (e820_find_active_region(&e820.map[i],
-                                           start_pfn, end_pfn,
-                                           &ei_startpfn, &ei_endpfn))
-                       ram += ei_endpfn - ei_startpfn;
-       }
-       return end - start - (ram << PAGE_SHIFT);
-}
-
-void __init e820_print_map(char *who)
-{
-       int i;
-
-       for (i = 0; i < e820.nr_map; i++) {
-               printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
-                       (unsigned long long) e820.map[i].addr,
-                       (unsigned long long) (e820.map[i].addr + e820.map[i].size));
-               switch (e820.map[i].type) {
-               case E820_RAM:  printk("(usable)\n");
-                               break;
-               case E820_RESERVED:
-                               printk("(reserved)\n");
-                               break;
-               case E820_ACPI:
-                               printk("(ACPI data)\n");
-                               break;
-               case E820_NVS:
-                               printk("(ACPI NVS)\n");
-                               break;
-               default:        printk("type %u\n", e820.map[i].type);
-                               break;
-               }
-       }
-}
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries.  The following
- * replaces the original e820 map with a new one, removing overlaps.
- * Where entries overlap, the numerically largest type wins.
- *
- * biosmap: map to rewrite in place.
- * pnr_map: in: number of entries in biosmap; out: number of entries
- *          left after sanitizing.
- *
- * Returns -1, leaving the map untouched, if there are fewer than two
- * entries or if the end address of any entry wraps around.  Returns 0
- * once the map has been rewritten.
- */
-static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
-       struct change_member {
-               struct e820entry *pbios; /* pointer to original bios entry */
-               unsigned long long addr; /* address for this change point */
-       };
-       static struct change_member change_point_list[2*E820MAX] __initdata;
-       static struct change_member *change_point[2*E820MAX] __initdata;
-       static struct e820entry *overlap_list[E820MAX] __initdata;
-       static struct e820entry new_bios[E820MAX] __initdata;
-       struct change_member *change_tmp;
-       unsigned long current_type, last_type;
-       unsigned long long last_addr;
-       int chgidx, still_changing;
-       int overlap_entries;
-       int new_bios_entry;
-       int old_nr, new_nr, chg_nr;
-       int i;
-
-       /*
-               Visually we're performing the following (1,2,3,4 = memory types)...
-
-               Sample memory map (w/overlaps):
-                  ____22__________________
-                  ______________________4_
-                  ____1111________________
-                  _44_____________________
-                  11111111________________
-                  ____________________33__
-                  ___________44___________
-                  __________33333_________
-                  ______________22________
-                  ___________________2222_
-                  _________111111111______
-                  _____________________11_
-                  _________________4______
-
-               Sanitized equivalent (no overlap):
-                  1_______________________
-                  _44_____________________
-                  ___1____________________
-                  ____22__________________
-                  ______11________________
-                  _________1______________
-                  __________3_____________
-                  ___________44___________
-                  _____________33_________
-                  _______________2________
-                  ________________1_______
-                  _________________4______
-                  ___________________2____
-                  ____________________33__
-                  ______________________4_
-       */
-
-       /* if there's only one memory region, don't bother */
-       if (*pnr_map < 2)
-               return -1;
-
-       old_nr = *pnr_map;
-
-       /* bail out if we find any unreasonable addresses in bios map
-          (an entry whose addr + size wraps around) */
-       for (i=0; i<old_nr; i++)
-               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
-                       return -1;
-
-       /* create pointers for initial change-point information (for sorting) */
-       for (i=0; i < 2*old_nr; i++)
-               change_point[i] = &change_point_list[i];
-
-       /* record all known change-points (starting and ending addresses),
-          omitting those that are for empty memory regions; every non-empty
-          entry contributes two change-points */
-       chgidx = 0;
-       for (i=0; i < old_nr; i++)      {
-               if (biosmap[i].size != 0) {
-                       change_point[chgidx]->addr = biosmap[i].addr;
-                       change_point[chgidx++]->pbios = &biosmap[i];
-                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
-                       change_point[chgidx++]->pbios = &biosmap[i];
-               }
-       }
-       chg_nr = chgidx;
-
-       /* sort change-point list by memory addresses (low -> high);
-          this is a bubble sort that swaps the pointers in change_point[] */
-       still_changing = 1;
-       while (still_changing)  {
-               still_changing = 0;
-               for (i=1; i < chg_nr; i++)  {
-                       /* if <current_addr> > <last_addr>, swap */
-                       /* or, if current=<start_addr> & last=<end_addr>, swap */
-                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
-                               ((change_point[i]->addr == change_point[i-1]->addr) &&
-                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
-                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
-                          )
-                       {
-                               change_tmp = change_point[i];
-                               change_point[i] = change_point[i-1];
-                               change_point[i-1] = change_tmp;
-                               still_changing=1;
-                       }
-               }
-       }
-
-       /* create a new bios memory map, removing overlaps */
-       overlap_entries=0;       /* number of entries in the overlap table */
-       new_bios_entry=0;        /* index for creating new bios map entries */
-       last_type = 0;           /* start with undefined memory type */
-       last_addr = 0;           /* start with 0 as last starting address */
-       /* loop through change-points, determining effect on the new bios map */
-       for (chgidx=0; chgidx < chg_nr; chgidx++)
-       {
-               /* keep track of all overlapping bios entries; a change-point whose
-                  address equals its entry's start address opens that entry,
-                  any other change-point closes it */
-               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
-               {
-                       /* add map entry to overlap list (> 1 entry implies an overlap) */
-                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
-               }
-               else
-               {
-                       /* remove entry from list (order independent, so swap with last) */
-                       for (i=0; i<overlap_entries; i++)
-                       {
-                               if (overlap_list[i] == change_point[chgidx]->pbios)
-                                       overlap_list[i] = overlap_list[overlap_entries-1];
-                       }
-                       overlap_entries--;
-               }
-               /* if there are overlapping entries, decide which "type" to use */
-               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
-               current_type = 0;
-               for (i=0; i<overlap_entries; i++)
-                       if (overlap_list[i]->type > current_type)
-                               current_type = overlap_list[i]->type;
-               /* continue building up new bios map based on this information:
-                  a type change closes the entry being built (if any) and
-                  opens a new one (if the new type is not 0) */
-               if (current_type != last_type)  {
-                       if (last_type != 0)      {
-                               new_bios[new_bios_entry].size =
-                                       change_point[chgidx]->addr - last_addr;
-                               /* move forward only if the new size was non-zero */
-                               if (new_bios[new_bios_entry].size != 0)
-                                       if (++new_bios_entry >= E820MAX)
-                                               break;  /* no more space left for new bios entries */
-                       }
-                       if (current_type != 0)  {
-                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
-                               new_bios[new_bios_entry].type = current_type;
-                               last_addr=change_point[chgidx]->addr;
-                       }
-                       last_type = current_type;
-               }
-       }
-       new_nr = new_bios_entry;   /* retain count for new bios entries */
-
-       /* copy new bios mapping into original location */
-       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
-       *pnr_map = new_nr;
-
-       return 0;
-}
-
-/*
- * Hand every entry of the BIOS-supplied e820 map to add_memory_region().
- *
- * Returns -1 without adding anything when the map holds fewer than two
- * entries.  Also returns -1 as soon as an entry is met whose end address
- * wraps around; entries preceding it have already been added by then.
- * Returns 0 once all entries have been passed on.
- */
-static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
-{
-       int idx;
-
-       /* A single region (or a non-positive count) is not accepted */
-       if (nr_map < 2)
-               return -1;
-
-       for (idx = 0; idx < nr_map; idx++) {
-               unsigned long base = biosmap[idx].addr;
-               unsigned long len = biosmap[idx].size;
-               unsigned long limit = base + len;
-               unsigned long kind = biosmap[idx].type;
-
-               /* base + len wrapped around: give up on the whole map */
-               if (base > limit)
-                       return -1;
-
-               add_memory_region(base, len, kind);
-       }
-       return 0;
-}
-
-/*
- * Report a fatal early-boot error: print msg through early_printk(),
- * then panic with the same text.
- *
- * msg is handed over as the argument of a "%s" format instead of being
- * used as the format string itself, so a '%' inside the message is
- * printed literally and never interpreted as a conversion.
- */
-void early_panic(char *msg)
-{
-       early_printk("%s", msg);
-       panic("%s", msg);
-}
-
-void __init setup_memory_region(void)
-{
-       int rc;
-
-       /*
-        * Clean up the BIOS-supplied E820 map in place, then copy it into
-        * the kernel's own table.  The result of sanitize_e820_map() is
-        * not checked here; if the copy fails the boot is stopped through
-        * early_panic().
-        */
-       sanitize_e820_map(E820_MAP, &E820_MAP_NR);
-
-       rc = copy_e820_map(E820_MAP, E820_MAP_NR);
-       if (rc < 0)
-               early_panic("Cannot find a valid memory map");
-
-       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-       e820_print_map("BIOS-e820");
-}
-
-/*
- * Handle the "mem=" early parameter: parse the size with memparse() and
- * store it, converted from bytes to page frames, in end_user_pfn.
- * Returns -EINVAL when the parameter carries no value, 0 otherwise.
- */
-static int __init parse_memopt(char *p)
-{
-       if (p == NULL)
-               return -EINVAL;
-
-       /* first the size in bytes, then scaled down to a page frame number */
-       end_user_pfn = memparse(p, &p);
-       end_user_pfn >>= PAGE_SHIFT;
-
-       return 0;
-}
-early_param("mem", parse_memopt);
-
-static int userdef __initdata;
-
-/*
- * Parse one "memmap=" early parameter.
- *
- *   exactmap   drop the e820 map collected so far (e820.nr_map = 0) and
- *              mark the map as user-defined
- *   nn@ss      add nn bytes at ss as E820_RAM
- *   nn#ss      add nn bytes at ss as E820_ACPI
- *   nn$ss      add nn bytes at ss as E820_RESERVED
- *   nn         set end_user_pfn to nn >> PAGE_SHIFT
- *
- * Returns 0 on success and -EINVAL when no value was given, when the
- * size cannot be parsed, or when unparsed characters remain at the end.
- */
-static int __init parse_memmap_opt(char *p)
-{
-       char *oldp;
-       unsigned long long start_at, mem_size;
-
-       /* "memmap" given without a value: never hand NULL to strcmp() */
-       if (!p)
-               return -EINVAL;
-
-       if (!strcmp(p, "exactmap")) {
-#ifdef CONFIG_CRASH_DUMP
-               /* If we are doing a crash dump, we
-                * still need to know the real mem
-                * size before original memory map is
-                * reset.
-                */
-               e820_register_active_regions(0, 0, -1UL);
-               saved_max_pfn = e820_end_of_ram();
-               remove_all_active_ranges();
-#endif
-               end_pfn_map = 0;
-               e820.nr_map = 0;
-               userdef = 1;
-               return 0;
-       }
-
-       oldp = p;
-       mem_size = memparse(p, &p);
-       /* memparse() consumed nothing: there was no size to read */
-       if (p == oldp)
-               return -EINVAL;
-       if (*p == '@') {
-               start_at = memparse(p+1, &p);
-               add_memory_region(start_at, mem_size, E820_RAM);
-       } else if (*p == '#') {
-               start_at = memparse(p+1, &p);
-               add_memory_region(start_at, mem_size, E820_ACPI);
-       } else if (*p == '$') {
-               start_at = memparse(p+1, &p);
-               add_memory_region(start_at, mem_size, E820_RESERVED);
-       } else {
-               end_user_pfn = (mem_size >> PAGE_SHIFT);
-       }
-       /* anything left over after the parsed value is an error */
-       return *p == '\0' ? 0 : -EINVAL;
-}
-early_param("memmap", parse_memmap_opt);
-
-/*
- * Print the e820 map once more, labelled "user", if the command line
- * replaced it (userdef is set by "memmap=exactmap").
- */
-void __init finish_e820_parsing(void)
-{
-       if (!userdef)
-               return;
-
-       printk(KERN_INFO "user-defined physical RAM map:\n");
-       e820_print_map("user");
-}
-
-unsigned long pci_mem_start = 0xaeedbabe;
-EXPORT_SYMBOL(pci_mem_start);
-
-/*
- * Look for the biggest hole in the low 32 bits of the e820 memory space
- * and let PCI assign MMIO resources for hotplugged or unconfigured
- * devices from there.  This relies on the BIOS having left enough
- * address space free.
- *
- * The map is walked from its last entry down to its first; "last" holds
- * the lowest start address seen so far, so the space between the end of
- * an entry and "last" is a candidate gap.  Only gaps larger than the
- * initial gapsize of 0x400000 are taken.  The result ends up in
- * pci_mem_start.
- */
-__init void e820_setup_gap(void)
-{
-       unsigned long gapstart = 0x10000000;
-       unsigned long gapsize = 0x400000;
-       unsigned long last = 0x100000000ull;
-       unsigned long round;
-       int found = 0;
-       int i;
-
-       for (i = e820.nr_map - 1; i >= 0; i--) {
-               unsigned long long start = e820.map[i].addr;
-               unsigned long long end = start + e820.map[i].size;
-
-               /*
-                * "last" never exceeds 4GB, so a gap that lies
-                * below it is known to fit in 32 bits
-                */
-               if (end < last) {
-                       unsigned long gap = last - end;
-
-                       if (gap > gapsize) {
-                               found = 1;
-                               gapstart = end;
-                               gapsize = gap;
-                       }
-               }
-               if (start < last)
-                       last = start;
-       }
-
-       if (!found) {
-               /* no gap found: fall back to 1MB past end_pfn */
-               gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
-               printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
-                      KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
-       }
-
-       /*
-        * Pick the rounding granularity: begin at 1MB and keep
-        * doubling while it is smaller than 1/16th of the gap.
-        */
-       round = 0x100000;
-       while (round < (gapsize >> 4))
-               round <<= 1;
-       /* round is a power of two, so this masks to a multiple of round */
-       pci_mem_start = (gapstart + round) & ~(round - 1);
-
-       printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-               pci_mem_start, gapstart, gapsize);
-}
diff --git a/arch/x86_64/kernel/early-quirks_64.c b/arch/x86_64/kernel/early-quirks_64.c
deleted file mode 100644 (file)
index 13aa4fd..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Various workarounds for chipset bugs.
-   This code runs very early and can't use the regular PCI subsystem
-   The entries are keyed to PCI bridges which usually identify chipsets
-   uniquely.
-   This is only for whole classes of chipsets with specific problems which
-   need early invasive action (e.g. before the timers are initialized).
-   Most PCI device specific workarounds can be done later and should be
-   in standard PCI quirks
-   Mainboard specific bugs should be handled by DMI entries.
-   CPU specific bugs in setup.c */
-
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/pci_ids.h>
-#include <asm/pci-direct.h>
-#include <asm/proto.h>
-#include <asm/iommu.h>
-#include <asm/dma.h>
-
-/*
- * VIA quirk: with CONFIG_IOMMU, disable the IOMMU aperture when memory
- * reaches above MAX_DMA32_PFN or the IOMMU was forced, unless the
- * aperture was explicitly allowed.
- */
-static void __init via_bugs(void)
-{
-#ifdef CONFIG_IOMMU
-       /* explicitly allowed: leave everything alone */
-       if (iommu_aperture_allowed)
-               return;
-       /* the IOMMU would not be used anyway */
-       if (end_pfn <= MAX_DMA32_PFN && !force_iommu)
-               return;
-
-       printk(KERN_INFO
-  "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
-       iommu_aperture_disabled = 1;
-#endif
-}
-
-#ifdef CONFIG_ACPI
-
-static int __init nvidia_hpet_check(struct acpi_table_header *header)
-{
-       return 0;
-}
-#endif
-
-/*
- * Nvidia quirk: with CONFIG_ACPI, tell ACPI to skip the timer override
- * when acpi_table_parse() for the HPET table returns non-zero, unless
- * the user asked for the override via acpi_use_timer_override.
- */
-static void __init nvidia_bugs(void)
-{
-#ifdef CONFIG_ACPI
-       /*
-        * Timer overrides on Nvidia boards are all wrong unless HPET
-        * is enabled.  Many Asus boards are an exception to that, and
-        * there is no known way to detect them automatically yet, so
-        * a command line override is honoured first.
-        */
-       if (!acpi_use_timer_override &&
-           acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) {
-               acpi_skip_timer_override = 1;
-               printk(KERN_INFO "Nvidia board "
-                      "detected. Ignoring ACPI "
-                      "timer override.\n");
-               printk(KERN_INFO "If you got timer trouble "
-                       "try acpi_use_timer_override\n");
-       }
-#endif
-       /* RED-PEN skip them on mptables too? */
-
-}
-
-/*
- * ATI quirk: if timer routing over the 8254 is set to 1, switch it off
- * and say so in the log.
- */
-static void __init ati_bugs(void)
-{
-       if (timer_over_8254 != 1)
-               return;
-
-       timer_over_8254 = 0;
-       printk(KERN_INFO
-       "ATI board detected. Disabling timer routing over 8254.\n");
-}
-
-struct chipset {
-       u16 vendor;
-       void (*f)(void);
-};
-
-static struct chipset early_qrk[] __initdata = {
-       { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
-       { PCI_VENDOR_ID_VIA, via_bugs },
-       { PCI_VENDOR_ID_ATI, ati_bugs },
-       {}
-};
-
-/*
- * Scan PCI configuration space directly (buses 0-31, 32 slots each, up
- * to 8 functions) for PCI bridges and run the early_qrk[] handler that
- * matches the vendor of the first bridge found with a known vendor ID.
- * The scan stops after that one handler has run.  Nothing is done when
- * early_pci_allowed() says no.
- */
-void __init early_quirks(void)
-{
-       int bus, dev, fn;
-
-       if (!early_pci_allowed())
-               return;
-
-       /* Poor man's PCI discovery */
-       for (bus = 0; bus < 32; bus++) {
-               for (dev = 0; dev < 32; dev++) {
-                       for (fn = 0; fn < 8; fn++) {
-                               u32 class_rev;
-                               u32 vendor_id;
-                               u8 hdr_type;
-                               int q;
-
-                               class_rev = read_pci_config(bus, dev, fn,
-                                                           PCI_CLASS_REVISION);
-                               /* all ones: stop probing this slot */
-                               if (class_rev == 0xffffffff)
-                                       break;
-
-                               /* only PCI bridges are of interest */
-                               if ((class_rev >> 16) != PCI_CLASS_BRIDGE_PCI)
-                                       continue;
-
-                               vendor_id = read_pci_config(bus, dev, fn,
-                                                           PCI_VENDOR_ID) & 0xffff;
-
-                               for (q = 0; early_qrk[q].f; q++) {
-                                       if (early_qrk[q].vendor != vendor_id)
-                                               continue;
-                                       early_qrk[q].f();
-                                       return;
-                               }
-
-                               /* bit 7 of the header type clear: skip the remaining functions */
-                               hdr_type = read_pci_config_byte(bus, dev, fn,
-                                                               PCI_HEADER_TYPE);
-                               if (!(hdr_type & 0x80))
-                                       break;
-                       }
-               }
-       }
-}
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
deleted file mode 100644 (file)
index fd9aff3..0000000
+++ /dev/null
@@ -1,259 +0,0 @@
-#include <linux/console.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/screen_info.h>
-#include <asm/io.h>
-#include <asm/processor.h>
-#include <asm/fcntl.h>
-#include <xen/hvc-console.h>
-
-/* Simple VGA output */
-
-#ifdef __i386__
-#include <asm/setup.h>
-#else
-#include <asm/bootsetup.h>
-#endif
-#define VGABASE                (__ISA_IO_base + 0xb8000)
-
-static int max_ypos = 25, max_xpos = 80;
-static int current_ypos = 25, current_xpos = 0;
-
-/*
- * Console write hook for the early VGA console: put up to n characters
- * of str into the text buffer at VGABASE, stopping early at a NUL byte.
- * '\n' moves to the start of the next line and '\r' is dropped.  Once
- * the cursor row has run past max_ypos the contents are scrolled up by
- * one line before the next character is handled.
- */
-static void early_vga_write(struct console *con, const char *str, unsigned n)
-{
-       int col, src_row, dst_row;
-       char ch;
-
-       while (*str != '\0' && n-- > 0) {
-               ch = *str++;
-
-               if (current_ypos >= max_ypos) {
-                       /* scroll 1 line up: copy each row onto the one above it */
-                       dst_row = 0;
-                       for (src_row = 1; src_row < max_ypos; src_row++) {
-                               for (col = 0; col < max_xpos; col++) {
-                                       writew(readw(VGABASE + 2*(max_xpos*src_row + col)),
-                                              VGABASE + 2*(max_xpos*dst_row + col));
-                               }
-                               dst_row++;
-                       }
-                       /* fill the row left over at the bottom with 0x720 cells */
-                       for (col = 0; col < max_xpos; col++)
-                               writew(0x720, VGABASE + 2*(max_xpos*dst_row + col));
-                       current_ypos = max_ypos-1;
-               }
-
-               if (ch == '\n') {
-                       current_xpos = 0;
-                       current_ypos++;
-               } else if (ch != '\r')  {
-                       writew(((0x7 << 8) | (unsigned short) ch),
-                              VGABASE + 2*(max_xpos*current_ypos + current_xpos));
-                       current_xpos++;
-                       /* wrap to the next line at the right edge */
-                       if (current_xpos >= max_xpos) {
-                               current_xpos = 0;
-                               current_ypos++;
-                       }
-               }
-       }
-}
-
-static struct console early_vga_console = {
-       .name =         "earlyvga",
-       .write =        early_vga_write,
-       .flags =        CON_PRINTBUFFER,
-       .index =        -1,
-};
-
-/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
-
-static int early_serial_base = 0x3f8;  /* ttyS0 */
-
-#define XMTRDY          0x20
-
-#define DLAB           0x80
-
-#define TXR             0       /*  Transmit register (WRITE) */
-#define RXR             0       /*  Receive register  (READ)  */
-#define IER             1       /*  Interrupt Enable          */
-#define IIR             2       /*  Interrupt ID              */
-#define FCR             2       /*  FIFO control              */
-#define LCR             3       /*  Line control              */
-#define MCR             4       /*  Modem control             */
-#define LSR             5       /*  Line Status               */
-#define MSR             6       /*  Modem Status              */
-#define DLL             0       /*  Divisor Latch Low         */
-#define DLH             1       /*  Divisor latch High        */
-
-/*
- * Send one byte on the early serial port.  Polls the line status
- * register for XMTRDY for a bounded number of tries, then writes the
- * byte to the transmit register regardless of the outcome.
- * Returns 0 if the transmitter became ready, -1 if the wait timed out.
- */
-static int early_serial_putc(unsigned char ch)
-{
-       unsigned tries_left = 0xffff;
-
-       for (;;) {
-               if (inb(early_serial_base + LSR) & XMTRDY)
-                       break;
-               if (--tries_left == 0)
-                       break;
-               cpu_relax();
-       }
-       outb(ch, early_serial_base + TXR);
-
-       if (tries_left)
-               return 0;
-       return -1;
-}
-
-/*
- * Console write hook for the early serial console: send up to n
- * characters of s, stopping early at a NUL byte.  Every '\n' is
- * preceded by a '\r' on the wire.
- */
-static void early_serial_write(struct console *con, const char *s, unsigned n)
-{
-       for (; *s && n-- > 0; s++) {
-               if (*s == '\n')
-                       early_serial_putc('\r');
-               early_serial_putc(*s);
-       }
-}
-
-#define DEFAULT_BAUD 9600
-
-/*
- * Set up the early serial port from the option string s, which is
- * parsed as "[,]<port>[,<baud>]":
- *   <port>  "0x..." is taken as the I/O base in hex; otherwise an
- *           optional "ttyS" prefix followed by 0 or 1 selects 0x3f8 or
- *           0x2f8 (anything else falls back to port 0)
- *   <baud>  baud rate; DEFAULT_BAUD if missing, zero or unparsable
- * The UART is programmed for 8n1, no interrupts, no FIFO, DTR + RTS.
- */
-static __init void early_serial_init(char *s)
-{
-       unsigned baud = DEFAULT_BAUD;
-       unsigned divisor;
-       unsigned char lcr;
-       char *end;
-
-       if (*s == ',')
-               s++;
-
-       if (*s != '\0') {
-               if (strncmp(s, "0x", 2) == 0) {
-                       early_serial_base = simple_strtoul(s, &end, 16);
-               } else {
-                       static int bases[] = { 0x3f8, 0x2f8 };
-                       unsigned port;
-
-                       if (strncmp(s, "ttyS", 4) == 0)
-                               s += 4;
-                       port = simple_strtoul(s, &end, 10);
-                       /* out of range or not a number: use the first port */
-                       if (port > 1 || s == end)
-                               port = 0;
-                       early_serial_base = bases[port];
-               }
-               /* skip ahead to whatever follows the next comma */
-               s += strcspn(s, ",");
-               if (*s == ',')
-                       s++;
-       }
-
-       outb(0x3, early_serial_base + LCR);     /* 8n1 */
-       outb(0, early_serial_base + IER);       /* no interrupt */
-       outb(0, early_serial_base + FCR);       /* no fifo */
-       outb(0x3, early_serial_base + MCR);     /* DTR + RTS */
-
-       if (*s != '\0') {
-               baud = simple_strtoul(s, &end, 0);
-               if (baud == 0 || s == end)
-                       baud = DEFAULT_BAUD;
-       }
-
-       /* set DLAB to reach the divisor latches, then clear it again */
-       divisor = 115200 / baud;
-       lcr = inb(early_serial_base + LCR);
-       outb(lcr | DLAB, early_serial_base + LCR);
-       outb(divisor & 0xff, early_serial_base + DLL);
-       outb((divisor >> 8) & 0xff, early_serial_base + DLH);
-       outb(lcr & ~DLAB, early_serial_base + LCR);
-}
-
-static struct console early_serial_console = {
-       .name =         "earlyser",
-       .write =        early_serial_write,
-       .flags =        CON_PRINTBUFFER,
-       .index =        -1,
-};
-
-/* Console interface to a host file on AMD's SimNow! */
-
-static int simnow_fd;
-
-enum {
-       MAGIC1 = 0xBACCD00A,
-       MAGIC2 = 0xCA110000,
-       XOPEN = 5,
-       XWRITE = 4,
-};
-
-static noinline long simnow(long cmd, long a, long b, long c)
-{
-       long ret;
-       asm volatile("cpuid" :
-                    "=a" (ret) :
-                    "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
-       return ret;
-}
-
-/*
- * Open the host-side log file for the SimNow! console.  The file name
- * is whatever follows a leading '=' in str, or "klog" if there is none.
- * The descriptor returned by the XOPEN call is kept in simnow_fd.
- */
-static void __init simnow_init(char *str)
-{
-       char *fn = (*str == '=') ? str + 1 : "klog";
-
-       /* error ignored */
-       simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
-}
-
-static void simnow_write(struct console *con, const char *s, unsigned n)
-{
-       simnow(XWRITE, simnow_fd, (unsigned long)s, n);
-}
-
-static struct console simnow_console = {
-       .name =         "simnow",
-       .write =        simnow_write,
-       .flags =        CON_PRINTBUFFER,
-       .index =        -1,
-};
-
-/* Direct interface for emergencies */
-struct console *early_console = &early_vga_console;
-static int early_console_initialized = 0;
-
-/*
- * printf-style output sent straight to the current early console.
- * The text is formatted into a 512-byte buffer on the stack and then
- * passed to the console's write hook.
- */
-void early_printk(const char *fmt, ...)
-{
-       char text[512];
-       va_list args;
-       int len;
-
-       va_start(args, fmt);
-       len = vscnprintf(text, sizeof(text), fmt, args);
-       va_end(args);
-
-       early_console->write(early_console, text, len);
-}
-
-static int __initdata keep_early;
-
-static int __init setup_early_printk(char *buf)
-{
-       if (!buf)
-               return 0;
-
-       if (early_console_initialized)
-               return 0;
-       early_console_initialized = 1;
-
-       if (strstr(buf, "keep"))
-               keep_early = 1;
-
-       if (!strncmp(buf, "serial", 6)) {
-               early_serial_init(buf + 6);
-               early_console = &early_serial_console;
-       } else if (!strncmp(buf, "ttyS", 4)) {
-               early_serial_init(buf);
-               early_console = &early_serial_console;
-       } else if (!strncmp(buf, "vga", 3)
-                  && SCREEN_INFO.orig_video_isVGA == 1) {
-               max_xpos = SCREEN_INFO.orig_video_cols;
-               max_ypos = SCREEN_INFO.orig_video_lines;
-               current_ypos = SCREEN_INFO.orig_y;
-               early_console = &early_vga_console;
-       } else if (!strncmp(buf, "simnow", 6)) {
-               simnow_init(buf + 6);
-               early_console = &simnow_console;
-               keep_early = 1;
-#ifdef CONFIG_HVC_XEN
-       } else if (!strncmp(buf, "xen", 3)) {
-               early_console = &xenboot_console;
-#endif
-       }
-
-       if (keep_early)
-               early_console->flags &= ~CON_BOOT;
-       else
-               early_console->flags |= CON_BOOT;
-       register_console(early_console);
-       return 0;
-}
-early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86_64/kernel/entry_64.S b/arch/x86_64/kernel/entry_64.S
deleted file mode 100644 (file)
index 1d232e5..0000000
+++ /dev/null
@@ -1,1172 +0,0 @@
-/*
- *  linux/arch/x86_64/entry.S
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
- *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after an interrupt and after each system call.
- * 
- * Normal syscalls and interrupts don't save a full stack frame, this is 
- * only done for syscall tracing, signals or fork/exec et.al.
- * 
- * A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
- * - partial stack frame: partially saved registers up to R11.
- * - full stack frame: Like partial stack frame, but all registers saved.
- *
- * Some macro usage:
- * - CFI macros are used to generate dwarf2 unwind information for better
- * backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers are
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
- * - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
- * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - errorentry/paranoidentry/zeroentry - Define exception entry points.
- */
-
-#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/cache.h>
-#include <asm/errno.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
-#include <asm/asm-offsets.h>
-#include <asm/msr.h>
-#include <asm/unistd.h>
-#include <asm/thread_info.h>
-#include <asm/hw_irq.h>
-#include <asm/page.h>
-#include <asm/irqflags.h>
-
-       .code64
-
-#ifndef CONFIG_PREEMPT
-#define retint_kernel retint_restore_args
-#endif 
-
-
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
-#ifdef CONFIG_TRACE_IRQFLAGS
-       bt   $9,EFLAGS-\offset(%rsp)    /* interrupts off? */
-       jnc  1f
-       TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-/*
- * C code is not supposed to know about undefined top of stack. Every time 
- * a C function with an pt_regs argument is called from the SYSCALL based 
- * fast path FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */            
-               
-       /* %rsp:at FRAMEEND */ 
-       .macro FIXUP_TOP_OF_STACK tmp
-       movq    %gs:pda_oldrsp,\tmp
-       movq    \tmp,RSP(%rsp)
-       movq    $__USER_DS,SS(%rsp)
-       movq    $__USER_CS,CS(%rsp)
-       movq    $-1,RCX(%rsp)
-       movq    R11(%rsp),\tmp  /* get eflags */
-       movq    \tmp,EFLAGS(%rsp)
-       .endm
-
-       .macro RESTORE_TOP_OF_STACK tmp,offset=0
-       movq   RSP-\offset(%rsp),\tmp
-       movq   \tmp,%gs:pda_oldrsp
-       movq   EFLAGS-\offset(%rsp),\tmp
-       movq   \tmp,R11-\offset(%rsp)
-       .endm
-
-       .macro FAKE_STACK_FRAME child_rip
-       /* push in order ss, rsp, eflags, cs, rip */
-       xorl %eax, %eax
-       pushq %rax /* ss */
-       CFI_ADJUST_CFA_OFFSET   8
-       /*CFI_REL_OFFSET        ss,0*/
-       pushq %rax /* rsp */
-       CFI_ADJUST_CFA_OFFSET   8
-       CFI_REL_OFFSET  rsp,0
-       pushq $(1<<9) /* eflags - interrupts on */
-       CFI_ADJUST_CFA_OFFSET   8
-       /*CFI_REL_OFFSET        rflags,0*/
-       pushq $__KERNEL_CS /* cs */
-       CFI_ADJUST_CFA_OFFSET   8
-       /*CFI_REL_OFFSET        cs,0*/
-       pushq \child_rip /* rip */
-       CFI_ADJUST_CFA_OFFSET   8
-       CFI_REL_OFFSET  rip,0
-       pushq   %rax /* orig rax */
-       CFI_ADJUST_CFA_OFFSET   8
-       .endm
-
-       .macro UNFAKE_STACK_FRAME
-       addq $8*6, %rsp
-       CFI_ADJUST_CFA_OFFSET   -(6*8)
-       .endm
-
-       .macro  CFI_DEFAULT_STACK start=1
-       .if \start
-       CFI_STARTPROC   simple
-       CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,SS+8
-       .else
-       CFI_DEF_CFA_OFFSET SS+8
-       .endif
-       CFI_REL_OFFSET  r15,R15
-       CFI_REL_OFFSET  r14,R14
-       CFI_REL_OFFSET  r13,R13
-       CFI_REL_OFFSET  r12,R12
-       CFI_REL_OFFSET  rbp,RBP
-       CFI_REL_OFFSET  rbx,RBX
-       CFI_REL_OFFSET  r11,R11
-       CFI_REL_OFFSET  r10,R10
-       CFI_REL_OFFSET  r9,R9
-       CFI_REL_OFFSET  r8,R8
-       CFI_REL_OFFSET  rax,RAX
-       CFI_REL_OFFSET  rcx,RCX
-       CFI_REL_OFFSET  rdx,RDX
-       CFI_REL_OFFSET  rsi,RSI
-       CFI_REL_OFFSET  rdi,RDI
-       CFI_REL_OFFSET  rip,RIP
-       /*CFI_REL_OFFSET        cs,CS*/
-       /*CFI_REL_OFFSET        rflags,EFLAGS*/
-       CFI_REL_OFFSET  rsp,RSP
-       /*CFI_REL_OFFSET        ss,SS*/
-       .endm
-/*
- * A newly forked process directly context switches into this.
- *
- * Resets the flags register from kernel_eflags, calls schedule_tail and
- * then leaves through one of the syscall return paths; when syscall
- * tracing or auditing is flagged, syscall_trace_leave is called first.
- */
-/* rdi:        prev */
-ENTRY(ret_from_fork)
-       CFI_DEFAULT_STACK
-       /*
-        * In 64-bit code this push/popf pair moves %rsp by 8 bytes, so
-        * the CFA is adjusted by 8, like for the pushq instructions in
-        * FAKE_STACK_FRAME.
-        */
-       push kernel_eflags(%rip)
-       CFI_ADJUST_CFA_OFFSET 8
-       popf                            # reset kernel eflags
-       CFI_ADJUST_CFA_OFFSET -8
-       call schedule_tail
-       GET_THREAD_INFO(%rcx)
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
-       jnz rff_trace
-rff_action:
-       RESTORE_REST
-       testl $3,CS-ARGOFFSET(%rsp)     # from kernel_thread?
-       je   int_ret_from_sys_call
-       testl $_TIF_IA32,threadinfo_flags(%rcx)
-       jnz  int_ret_from_sys_call
-       RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
-       jmp ret_from_sys_call
-rff_trace:
-       movq %rsp,%rdi
-       call syscall_trace_leave
-       GET_THREAD_INFO(%rcx)
-       jmp rff_action
-       CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * System call entry. Up to 6 arguments in registers are supported.
- *
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer.
- */
-               
-/*
- * Register setup:     
- * rax  system call number
- * rdi  arg0
- * rcx  return address for syscall/sysret, C arg3 
- * rsi  arg1
- * rdx  arg2   
- * r10  arg3   (--> moved to rcx for C)
- * r8   arg4
- * r9   arg5
- * r11  eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.               
- * 
- * Interrupts are off on entry.
- * Only called from user space.
- *
- * XXX if we had a free scratch register we could save the RSP into the stack frame
- *      and report it properly in ps. Unfortunately we do not have one.
- *
- * When the user can change the frames, always force IRET, because
- * it deals with non-canonical addresses better. SYSRET has trouble
- * with them due to bugs in both AMD and Intel CPUs.
- */                                    
-
-ENTRY(system_call)
-       CFI_STARTPROC   simple
-       CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,PDA_STACKOFFSET
-       CFI_REGISTER    rip,rcx
-       /*CFI_REGISTER  rflags,r11*/
-       swapgs
-       movq    %rsp,%gs:pda_oldrsp 
-       movq    %gs:pda_kernelstack,%rsp
-       /*
-        * No need to follow this irqs off/on section - it's straight
-        * and short:
-        */
-       sti                                     
-       SAVE_ARGS 8,1
-       movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
-       movq  %rcx,RIP-ARGOFFSET(%rsp)
-       CFI_REL_OFFSET rip,RIP-ARGOFFSET
-       GET_THREAD_INFO(%rcx)
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
-       jnz tracesys
-       cmpq $__NR_syscall_max,%rax
-       ja badsys
-       movq %r10,%rcx
-       call *sys_call_table(,%rax,8)  # XXX:    rip relative
-       movq %rax,RAX-ARGOFFSET(%rsp)
-/*
- * Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack. 
- */            
-ret_from_sys_call:
-       movl $_TIF_ALLWORK_MASK,%edi
-       /* edi: flagmask */
-sysret_check:          
-       GET_THREAD_INFO(%rcx)
-       cli
-       TRACE_IRQS_OFF
-       movl threadinfo_flags(%rcx),%edx
-       andl %edi,%edx
-       jnz  sysret_careful 
-       CFI_REMEMBER_STATE
-       /*
-        * sysretq will re-enable interrupts:
-        */
-       TRACE_IRQS_ON
-       movq RIP-ARGOFFSET(%rsp),%rcx
-       CFI_REGISTER    rip,rcx
-       RESTORE_ARGS 0,-ARG_SKIP,1
-       /*CFI_REGISTER  rflags,r11*/
-       movq    %gs:pda_oldrsp,%rsp
-       swapgs
-       sysretq
-
-       CFI_RESTORE_STATE
-       /* Handle reschedules */
-       /* edx: work, edi: workmask */  
-sysret_careful:
-       bt $TIF_NEED_RESCHED,%edx
-       jnc sysret_signal
-       TRACE_IRQS_ON
-       sti
-       pushq %rdi
-       CFI_ADJUST_CFA_OFFSET 8
-       call schedule
-       popq  %rdi
-       CFI_ADJUST_CFA_OFFSET -8
-       jmp sysret_check
-
-       /* Handle a signal */ 
-sysret_signal:
-       TRACE_IRQS_ON
-       sti
-       testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
-       jz    1f
-
-       /* Really a signal */
-       /* edx: work flags (arg3) */
-       leaq do_notify_resume(%rip),%rax
-       leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
-       xorl %esi,%esi # oldset -> arg2
-       call ptregscall_common
-1:     movl $_TIF_NEED_RESCHED,%edi
-       /* Use IRET because user could have changed frame. This
-          works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
-       cli
-       TRACE_IRQS_OFF
-       jmp int_with_check
-       
-badsys:
-       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-       jmp ret_from_sys_call
-
-       /* Do syscall tracing */
-tracesys:                       
-       SAVE_REST
-       movq $-ENOSYS,RAX(%rsp)
-       FIXUP_TOP_OF_STACK %rdi
-       movq %rsp,%rdi
-       call syscall_trace_enter
-       LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
-       RESTORE_REST
-       cmpq $__NR_syscall_max,%rax
-       movq $-ENOSYS,%rcx
-       cmova %rcx,%rax
-       ja  1f
-       movq %r10,%rcx  /* fixup for C */
-       call *sys_call_table(,%rax,8)
-1:     movq %rax,RAX-ARGOFFSET(%rsp)
-       /* Use IRET because user could have changed frame */
-               
-/* 
- * Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
- */
-       .globl int_ret_from_sys_call
-int_ret_from_sys_call:
-       cli
-       TRACE_IRQS_OFF
-       testl $3,CS-ARGOFFSET(%rsp)
-       je retint_restore_args
-       movl $_TIF_ALLWORK_MASK,%edi
-       /* edi: mask to check */
-int_with_check:
-       GET_THREAD_INFO(%rcx)
-       movl threadinfo_flags(%rcx),%edx
-       andl %edi,%edx
-       jnz   int_careful
-       andl    $~TS_COMPAT,threadinfo_status(%rcx)
-       jmp   retint_swapgs
-
-       /* Either reschedule or signal or syscall exit tracking needed. */
-       /* First do a reschedule test. */
-       /* edx: work, edi: workmask */
-int_careful:
-       bt $TIF_NEED_RESCHED,%edx
-       jnc  int_very_careful
-       TRACE_IRQS_ON
-       sti
-       pushq %rdi
-       CFI_ADJUST_CFA_OFFSET 8
-       call schedule
-       popq %rdi
-       CFI_ADJUST_CFA_OFFSET -8
-       cli
-       TRACE_IRQS_OFF
-       jmp int_with_check
-
-       /* handle signals and tracing -- both require a full stack frame */
-int_very_careful:
-       TRACE_IRQS_ON
-       sti
-       SAVE_REST
-       /* Check for syscall exit trace */      
-       testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
-       jz int_signal
-       pushq %rdi
-       CFI_ADJUST_CFA_OFFSET 8
-       leaq 8(%rsp),%rdi       # &ptregs -> arg1       
-       call syscall_trace_leave
-       popq %rdi
-       CFI_ADJUST_CFA_OFFSET -8
-       andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
-       jmp int_restore_rest
-       
-int_signal:
-       testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
-       jz 1f
-       movq %rsp,%rdi          # &ptregs -> arg1
-       xorl %esi,%esi          # oldset -> arg2
-       call do_notify_resume
-1:     movl $_TIF_NEED_RESCHED,%edi    
-int_restore_rest:
-       RESTORE_REST
-       cli
-       TRACE_IRQS_OFF
-       jmp int_with_check
-       CFI_ENDPROC
-END(system_call)
-               
-/* 
- * Certain special system calls need to save a complete stack frame.
- */                                                            
-       
-       .macro PTREGSCALL label,func,arg
-       .globl \label
-\label:
-       leaq    \func(%rip),%rax
-       leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
-       jmp     ptregscall_common
-END(\label)
-       .endm
-
-       CFI_STARTPROC
-
-       PTREGSCALL stub_clone, sys_clone, %r8
-       PTREGSCALL stub_fork, sys_fork, %rdi
-       PTREGSCALL stub_vfork, sys_vfork, %rdi
-       PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
-       PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
-       PTREGSCALL stub_iopl, sys_iopl, %rsi
-
-ENTRY(ptregscall_common)
-       popq %r11
-       CFI_ADJUST_CFA_OFFSET -8
-       CFI_REGISTER rip, r11
-       SAVE_REST
-       movq %r11, %r15
-       CFI_REGISTER rip, r15
-       FIXUP_TOP_OF_STACK %r11
-       call *%rax
-       RESTORE_TOP_OF_STACK %r11
-       movq %r15, %r11
-       CFI_REGISTER rip, r11
-       RESTORE_REST
-       pushq %r11
-       CFI_ADJUST_CFA_OFFSET 8
-       CFI_REL_OFFSET rip, 0
-       ret
-       CFI_ENDPROC
-END(ptregscall_common)
-       
-ENTRY(stub_execve)
-       CFI_STARTPROC
-       popq %r11
-       CFI_ADJUST_CFA_OFFSET -8
-       CFI_REGISTER rip, r11
-       SAVE_REST
-       FIXUP_TOP_OF_STACK %r11
-       call sys_execve
-       RESTORE_TOP_OF_STACK %r11
-       movq %rax,RAX(%rsp)
-       RESTORE_REST
-       jmp int_ret_from_sys_call
-       CFI_ENDPROC
-END(stub_execve)
-       
-/*
- * sigreturn is special because it needs to restore all registers on return.
- * This cannot be done with SYSRET, so use the IRET return path instead.
- */                
-ENTRY(stub_rt_sigreturn)
-       CFI_STARTPROC
-       addq $8, %rsp
-       CFI_ADJUST_CFA_OFFSET   -8
-       SAVE_REST
-       movq %rsp,%rdi
-       FIXUP_TOP_OF_STACK %r11
-       call sys_rt_sigreturn
-       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-       RESTORE_REST
-       jmp int_ret_from_sys_call
-       CFI_ENDPROC
-END(stub_rt_sigreturn)
-
-/*
- * initial frame state for interrupts and exceptions
- */
-       .macro _frame ref
-       CFI_STARTPROC simple
-       CFI_SIGNAL_FRAME
-       CFI_DEF_CFA rsp,SS+8-\ref
-       /*CFI_REL_OFFSET ss,SS-\ref*/
-       CFI_REL_OFFSET rsp,RSP-\ref
-       /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
-       /*CFI_REL_OFFSET cs,CS-\ref*/
-       CFI_REL_OFFSET rip,RIP-\ref
-       .endm
-
-/* initial frame state for interrupts (and exceptions without error code) */
-#define INTR_FRAME _frame RIP
-/* initial frame state for exceptions with error code (and interrupts with
-   vector already pushed) */
-#define XCPT_FRAME _frame ORIG_RAX
-
-/* 
- * Interrupt entry/exit.
- *
- * Interrupt entry points save only callee clobbered registers in fast path.
- *     
- * Entry runs with interrupts off.     
- */ 
-
-/* 0(%rsp): interrupt number */ 
-       .macro interrupt func
-       cld
-       SAVE_ARGS
-       leaq -ARGOFFSET(%rsp),%rdi      # arg1 for handler
-       pushq %rbp
-       CFI_ADJUST_CFA_OFFSET   8
-       CFI_REL_OFFSET          rbp, 0
-       movq %rsp,%rbp
-       CFI_DEF_CFA_REGISTER    rbp
-       testl $3,CS(%rdi)
-       je 1f
-       swapgs  
-       /* irqcount is used to check if a CPU is already on an interrupt
-          stack or not. While this is essentially redundant with preempt_count
-          it is a little cheaper to use a separate counter in the PDA
-          (short of moving irq_enter into assembly, which would be too
-           much work) */
-1:     incl    %gs:pda_irqcount
-       cmoveq %gs:pda_irqstackptr,%rsp
-       push    %rbp                    # backlink for old unwinder
-       /*
-        * We entered an interrupt context - irqs are off:
-        */
-       TRACE_IRQS_OFF
-       call \func
-       .endm
-
-ENTRY(common_interrupt)
-       XCPT_FRAME
-       interrupt do_IRQ
-       /* 0(%rsp): oldrsp-ARGOFFSET */
-ret_from_intr:
-       cli     
-       TRACE_IRQS_OFF
-       decl %gs:pda_irqcount
-       leaveq
-       CFI_DEF_CFA_REGISTER    rsp
-       CFI_ADJUST_CFA_OFFSET   -8
-exit_intr:
-       GET_THREAD_INFO(%rcx)
-       testl $3,CS-ARGOFFSET(%rsp)
-       je retint_kernel
-       
-       /* Interrupt came from user space */
-       /*
-        * Has a correct top of stack, but a partial stack frame
-        * %rcx: thread info. Interrupts off.
-        */             
-retint_with_reschedule:
-       movl $_TIF_WORK_MASK,%edi
-retint_check:
-       movl threadinfo_flags(%rcx),%edx
-       andl %edi,%edx
-       CFI_REMEMBER_STATE
-       jnz  retint_careful
-retint_swapgs:         
-       /*
-        * The iretq could re-enable interrupts:
-        */
-       cli
-       TRACE_IRQS_IRETQ
-       swapgs 
-       jmp restore_args
-
-retint_restore_args:                           
-       cli
-       /*
-        * The iretq could re-enable interrupts:
-        */
-       TRACE_IRQS_IRETQ
-restore_args:
-       RESTORE_ARGS 0,8,0                                              
-iret_label:    
-       iretq
-
-       .section __ex_table,"a"
-       .quad iret_label,bad_iret       
-       .previous
-       .section .fixup,"ax"
-       /* force a signal here? this matches i386 behaviour */
-       /* running with kernel gs */
-bad_iret:
-       movq $11,%rdi   /* SIGSEGV */
-       TRACE_IRQS_ON
-       sti
-       jmp do_exit                     
-       .previous       
-       
-       /* edi: workmask, edx: work */
-retint_careful:
-       CFI_RESTORE_STATE
-       bt    $TIF_NEED_RESCHED,%edx
-       jnc   retint_signal
-       TRACE_IRQS_ON
-       sti
-       pushq %rdi
-       CFI_ADJUST_CFA_OFFSET   8
-       call  schedule
-       popq %rdi               
-       CFI_ADJUST_CFA_OFFSET   -8
-       GET_THREAD_INFO(%rcx)
-       cli
-       TRACE_IRQS_OFF
-       jmp retint_check
-       
-retint_signal:
-       testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
-       jz    retint_swapgs
-       TRACE_IRQS_ON
-       sti
-       SAVE_REST
-       movq $-1,ORIG_RAX(%rsp)                         
-       xorl %esi,%esi          # oldset
-       movq %rsp,%rdi          # &pt_regs
-       call do_notify_resume
-       RESTORE_REST
-       cli
-       TRACE_IRQS_OFF
-       movl $_TIF_NEED_RESCHED,%edi
-       GET_THREAD_INFO(%rcx)
-       jmp retint_check
-
-#ifdef CONFIG_PREEMPT
-       /* Returning to kernel space. Check if we need preemption */
-       /* rcx:  threadinfo. interrupts off. */
-ENTRY(retint_kernel)
-       cmpl $0,threadinfo_preempt_count(%rcx)
-       jnz  retint_restore_args
-       bt  $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
-       jnc  retint_restore_args
-       bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
-       jnc  retint_restore_args
-       call preempt_schedule_irq
-       jmp exit_intr
-#endif 
-
-       CFI_ENDPROC
-END(common_interrupt)
-       
-/*
- * APIC interrupts.
- */            
-       .macro apicinterrupt num,func
-       INTR_FRAME
-       pushq $~(\num)
-       CFI_ADJUST_CFA_OFFSET 8
-       interrupt \func
-       jmp ret_from_intr
-       CFI_ENDPROC
-       .endm
-
-ENTRY(thermal_interrupt)
-       apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
-END(thermal_interrupt)
-
-ENTRY(threshold_interrupt)
-       apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
-END(threshold_interrupt)
-
-#ifdef CONFIG_SMP      
-ENTRY(reschedule_interrupt)
-       apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
-END(reschedule_interrupt)
-
-       .macro INVALIDATE_ENTRY num
-ENTRY(invalidate_interrupt\num)
-       apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt 
-END(invalidate_interrupt\num)
-       .endm
-
-       INVALIDATE_ENTRY 0
-       INVALIDATE_ENTRY 1
-       INVALIDATE_ENTRY 2
-       INVALIDATE_ENTRY 3
-       INVALIDATE_ENTRY 4
-       INVALIDATE_ENTRY 5
-       INVALIDATE_ENTRY 6
-       INVALIDATE_ENTRY 7
-
-ENTRY(call_function_interrupt)
-       apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
-END(call_function_interrupt)
-ENTRY(irq_move_cleanup_interrupt)
-       apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
-END(irq_move_cleanup_interrupt)
-#endif
-
-ENTRY(apic_timer_interrupt)
-       apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
-END(apic_timer_interrupt)
-
-ENTRY(error_interrupt)
-       apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
-END(error_interrupt)
-
-ENTRY(spurious_interrupt)
-       apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
-END(spurious_interrupt)
-                               
-/*
- * Exception entry points.
- */            
-       .macro zeroentry sym
-       INTR_FRAME
-       pushq $0        /* push error code/oldrax */ 
-       CFI_ADJUST_CFA_OFFSET 8
-       pushq %rax      /* push real oldrax to the rdi slot */ 
-       CFI_ADJUST_CFA_OFFSET 8
-       CFI_REL_OFFSET rax,0
-       leaq  \sym(%rip),%rax
-       jmp error_entry
-       CFI_ENDPROC
-       .endm   
-
-       .macro errorentry sym
-       XCPT_FRAME
-       pushq %rax
-       CFI_ADJUST_CFA_OFFSET 8
-       CFI_REL_OFFSET rax,0
-       leaq  \sym(%rip),%rax
-       jmp error_entry
-       CFI_ENDPROC
-       .endm
-
-       /* error code is on the stack already */
-       /* handle NMI-like exceptions that can happen anywhere */
-       .macro paranoidentry sym, ist=0, irqtrace=1
-       SAVE_ALL
-       cld
-       movl $1,%ebx
-       movl  $MSR_GS_BASE,%ecx
-       rdmsr
-       testl %edx,%edx
-       js    1f
-       swapgs
-       xorl  %ebx,%ebx
-1:
-       .if \ist
-       movq    %gs:pda_data_offset, %rbp
-       .endif
-       movq %rsp,%rdi
-       movq ORIG_RAX(%rsp),%rsi
-       movq $-1,ORIG_RAX(%rsp)
-       .if \ist
-       subq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
-       .endif
-       call \sym
-       .if \ist
-       addq    $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
-       .endif
-       cli
-       .if \irqtrace
-       TRACE_IRQS_OFF
-       .endif
-       .endm
-
-       /*
-        * "Paranoid" exit path from exception stack.
-        * Paranoid because this is used by NMIs and cannot take
-        * any kernel state for granted.
-        * We don't do kernel preemption checks here, because only
-        * NMI should be common and it does not enable IRQs and
-        * cannot get reschedule ticks.
-        *
-        * "trace" is 0 for the NMI handler only, because irq-tracing
-        * is fundamentally NMI-unsafe. (we cannot change the soft and
-        * hard flags at once, atomically)
-        */
-       .macro paranoidexit trace=1
-       /* ebx: no swapgs flag */
-paranoid_exit\trace:
-       testl %ebx,%ebx                         /* swapgs needed? */
-       jnz paranoid_restore\trace
-       testl $3,CS(%rsp)
-       jnz   paranoid_userspace\trace
-paranoid_swapgs\trace:
-       .if \trace
-       TRACE_IRQS_IRETQ 0
-       .endif
-       swapgs
-paranoid_restore\trace:
-       RESTORE_ALL 8
-       iretq
-paranoid_userspace\trace:
-       GET_THREAD_INFO(%rcx)
-       movl threadinfo_flags(%rcx),%ebx
-       andl $_TIF_WORK_MASK,%ebx
-       jz paranoid_swapgs\trace
-       movq %rsp,%rdi                  /* &pt_regs */
-       call sync_regs
-       movq %rax,%rsp                  /* switch stack for scheduling */
-       testl $_TIF_NEED_RESCHED,%ebx
-       jnz paranoid_schedule\trace
-       movl %ebx,%edx                  /* arg3: thread flags */
-       .if \trace
-       TRACE_IRQS_ON
-       .endif
-       sti
-       xorl %esi,%esi                  /* arg2: oldset */
-       movq %rsp,%rdi                  /* arg1: &pt_regs */
-       call do_notify_resume
-       cli
-       .if \trace
-       TRACE_IRQS_OFF
-       .endif
-       jmp paranoid_userspace\trace
-paranoid_schedule\trace:
-       .if \trace
-       TRACE_IRQS_ON
-       .endif
-       sti
-       call schedule
-       cli
-       .if \trace
-       TRACE_IRQS_OFF
-       .endif
-       jmp paranoid_userspace\trace
-       CFI_ENDPROC
-       .endm
-
-/*
- * Exception entry point. This expects an error code/orig_rax on the stack
- * and the exception handler in %rax.  
- */                                            
-KPROBE_ENTRY(error_entry)
-       _frame RDI
-       CFI_REL_OFFSET rax,0
-       /* rdi slot contains rax, oldrax contains error code */
-       cld     
-       subq  $14*8,%rsp
-       CFI_ADJUST_CFA_OFFSET   (14*8)
-       movq %rsi,13*8(%rsp)
-       CFI_REL_OFFSET  rsi,RSI
-       movq 14*8(%rsp),%rsi    /* load rax from rdi slot */
-       CFI_REGISTER    rax,rsi
-       movq %rdx,12*8(%rsp)
-       CFI_REL_OFFSET  rdx,RDX
-       movq %rcx,11*8(%rsp)
-       CFI_REL_OFFSET  rcx,RCX
-       movq %rsi,10*8(%rsp)    /* store rax */ 
-       CFI_REL_OFFSET  rax,RAX
-       movq %r8, 9*8(%rsp)
-       CFI_REL_OFFSET  r8,R8
-       movq %r9, 8*8(%rsp)
-       CFI_REL_OFFSET  r9,R9
-       movq %r10,7*8(%rsp)
-       CFI_REL_OFFSET  r10,R10
-       movq %r11,6*8(%rsp)
-       CFI_REL_OFFSET  r11,R11
-       movq %rbx,5*8(%rsp) 
-       CFI_REL_OFFSET  rbx,RBX
-       movq %rbp,4*8(%rsp) 
-       CFI_REL_OFFSET  rbp,RBP
-       movq %r12,3*8(%rsp) 
-       CFI_REL_OFFSET  r12,R12
-       movq %r13,2*8(%rsp) 
-       CFI_REL_OFFSET  r13,R13
-       movq %r14,1*8(%rsp) 
-       CFI_REL_OFFSET  r14,R14
-       movq %r15,(%rsp) 
-       CFI_REL_OFFSET  r15,R15
-       xorl %ebx,%ebx  
-       testl $3,CS(%rsp)
-       je  error_kernelspace
-error_swapgs:  
-       swapgs
-error_sti:     
-       movq %rdi,RDI(%rsp)     
-       CFI_REL_OFFSET  rdi,RDI
-       movq %rsp,%rdi
-       movq ORIG_RAX(%rsp),%rsi        /* get error code */ 
-       movq $-1,ORIG_RAX(%rsp)
-       call *%rax
-       /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */     
-error_exit:            
-       movl %ebx,%eax          
-       RESTORE_REST
-       cli
-       TRACE_IRQS_OFF
-       GET_THREAD_INFO(%rcx)   
-       testl %eax,%eax
-       jne  retint_kernel
-       movl  threadinfo_flags(%rcx),%edx
-       movl  $_TIF_WORK_MASK,%edi
-       andl  %edi,%edx
-       jnz  retint_careful
-       /*
-        * The iret might restore flags:
-        */
-       TRACE_IRQS_IRETQ
-       swapgs 
-       RESTORE_ARGS 0,8,0                                              
-       jmp iret_label
-       CFI_ENDPROC
-
-error_kernelspace:
-       incl %ebx
-       /* There are two places in the kernel that can potentially fault with
-          usergs. Handle them here. The exception handlers after
-          iret run with kernel gs again, so don't set the user space flag.
-          B stepping K8s sometimes report a truncated RIP for IRET
-          exceptions returning to compat mode. Check for these here too. */
-       leaq iret_label(%rip),%rbp
-       cmpq %rbp,RIP(%rsp) 
-       je   error_swapgs
-       movl %ebp,%ebp  /* zero extend */
-       cmpq %rbp,RIP(%rsp) 
-       je   error_swapgs
-       cmpq $gs_change,RIP(%rsp)
-        je   error_swapgs
-       jmp  error_sti
-KPROBE_END(error_entry)
-       
-       /* Reload gs selector with exception handling */
-       /* edi:  new selector */ 
-ENTRY(load_gs_index)
-       CFI_STARTPROC
-       pushf
-       CFI_ADJUST_CFA_OFFSET 8
-       cli
-        swapgs
-gs_change:     
-        movl %edi,%gs   
-2:     mfence          /* workaround */
-       swapgs
-        popf
-       CFI_ADJUST_CFA_OFFSET -8
-        ret
-       CFI_ENDPROC
-ENDPROC(load_gs_index)
-       
-        .section __ex_table,"a"
-        .align 8
-        .quad gs_change,bad_gs
-        .previous
-        .section .fixup,"ax"
-       /* running with kernelgs */
-bad_gs: 
-       swapgs                  /* switch back to user gs */
-       xorl %eax,%eax
-        movl %eax,%gs
-        jmp  2b
-        .previous       
-       
-/*
- * Create a kernel thread.
- *
- * C extern interface:
- *     extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
- *
- * asm input arguments:
- *     rdi: fn, rsi: arg, rdx: flags
- */
-ENTRY(kernel_thread)
-       CFI_STARTPROC
-       FAKE_STACK_FRAME $child_rip
-       SAVE_ALL
-
-       # rdi: flags, rsi: usp, rdx: will be &pt_regs
-       movq %rdx,%rdi
-       orq  kernel_thread_flags(%rip),%rdi
-       movq $-1, %rsi
-       movq %rsp, %rdx
-
-       xorl %r8d,%r8d
-       xorl %r9d,%r9d
-       
-       # clone now
-       call do_fork
-       movq %rax,RAX(%rsp)
-       xorl %edi,%edi
-
-       /*
-        * It isn't worth checking for a reschedule here,
-        * so internally to the x86_64 port you can rely on kernel_thread()
-        * not to reschedule the child before returning, this avoids the need
-        * of hacks for example to fork off the per-CPU idle tasks.
-         * [Hopefully no generic code relies on the reschedule -AK]    
-        */
-       RESTORE_ALL
-       UNFAKE_STACK_FRAME
-       ret
-       CFI_ENDPROC
-ENDPROC(kernel_thread)
-       
-child_rip:
-       pushq $0                # fake return address
-       CFI_STARTPROC
-       /*
-        * Here we are in the child and the registers are set as they were
-        * at kernel_thread() invocation in the parent.
-        */
-       movq %rdi, %rax
-       movq %rsi, %rdi
-       call *%rax
-       # exit
-       xorl %edi, %edi
-       call do_exit
-       CFI_ENDPROC
-ENDPROC(child_rip)
-
-/*
- * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
- *
- * C extern interface:
- *      extern long execve(char *name, char **argv, char **envp)
- *
- * asm input arguments:
- *     rdi: name, rsi: argv, rdx: envp
- *
- * We want to fall back to:
- *     extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
- *
- * do_sys_execve asm fallback arguments:
- *     rdi: name, rsi: argv, rdx: envp, fake frame on the stack
- */
-ENTRY(kernel_execve)
-       CFI_STARTPROC
-       FAKE_STACK_FRAME $0
-       SAVE_ALL        
-       call sys_execve
-       movq %rax, RAX(%rsp)    
-       RESTORE_REST
-       testq %rax,%rax
-       je int_ret_from_sys_call
-       RESTORE_ARGS
-       UNFAKE_STACK_FRAME
-       ret
-       CFI_ENDPROC
-ENDPROC(kernel_execve)
-
-KPROBE_ENTRY(page_fault)
-       errorentry do_page_fault
-KPROBE_END(page_fault)
-
-ENTRY(coprocessor_error)
-       zeroentry do_coprocessor_error
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
-       zeroentry do_simd_coprocessor_error     
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
-       zeroentry math_state_restore
-END(device_not_available)
-
-       /* runs on exception stack */
-KPROBE_ENTRY(debug)
-       INTR_FRAME
-       pushq $0
-       CFI_ADJUST_CFA_OFFSET 8         
-       paranoidentry do_debug, DEBUG_STACK
-       paranoidexit
-KPROBE_END(debug)
-
-       /* runs on exception stack */   
-KPROBE_ENTRY(nmi)
-       INTR_FRAME
-       pushq $-1
-       CFI_ADJUST_CFA_OFFSET 8
-       paranoidentry do_nmi, 0, 0
-#ifdef CONFIG_TRACE_IRQFLAGS
-       paranoidexit 0
-#else
-       jmp paranoid_exit1
-       CFI_ENDPROC
-#endif
-KPROBE_END(nmi)
-
-KPROBE_ENTRY(int3)
-       INTR_FRAME
-       pushq $0
-       CFI_ADJUST_CFA_OFFSET 8
-       paranoidentry do_int3, DEBUG_STACK
-       jmp paranoid_exit1
-       CFI_ENDPROC
-KPROBE_END(int3)
-
-ENTRY(overflow)
-       zeroentry do_overflow
-END(overflow)
-
-ENTRY(bounds)
-       zeroentry do_bounds
-END(bounds)
-
-ENTRY(invalid_op)
-       zeroentry do_invalid_op 
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
-       zeroentry do_coprocessor_segment_overrun
-END(coprocessor_segment_overrun)
-
-ENTRY(reserved)
-       zeroentry do_reserved
-END(reserved)
-
-       /* runs on exception stack */
-ENTRY(double_fault)
-       XCPT_FRAME
-       paranoidentry do_double_fault
-       jmp paranoid_exit1
-       CFI_ENDPROC
-END(double_fault)
-
-ENTRY(invalid_TSS)
-       errorentry do_invalid_TSS
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
-       errorentry do_segment_not_present
-END(segment_not_present)
-
-       /* runs on exception stack */
-ENTRY(stack_segment)
-       XCPT_FRAME
-       paranoidentry do_stack_segment
-       jmp paranoid_exit1
-       CFI_ENDPROC
-END(stack_segment)
-
-KPROBE_ENTRY(general_protection)
-       errorentry do_general_protection
-KPROBE_END(general_protection)
-
-ENTRY(alignment_check)
-       errorentry do_alignment_check
-END(alignment_check)
-
-ENTRY(divide_error)
-       zeroentry do_divide_error
-END(divide_error)
-
-ENTRY(spurious_interrupt_bug)
-       zeroentry do_spurious_interrupt_bug
-END(spurious_interrupt_bug)
-
-#ifdef CONFIG_X86_MCE
-       /* runs on exception stack */
-ENTRY(machine_check)
-       INTR_FRAME
-       pushq $0
-       CFI_ADJUST_CFA_OFFSET 8 
-       paranoidentry do_machine_check
-       jmp paranoid_exit1
-       CFI_ENDPROC
-END(machine_check)
-#endif
-
-/* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(call_softirq)
-       CFI_STARTPROC
-       push %rbp
-       CFI_ADJUST_CFA_OFFSET   8
-       CFI_REL_OFFSET rbp,0
-       mov  %rsp,%rbp
-       CFI_DEF_CFA_REGISTER rbp
-       incl %gs:pda_irqcount
-       cmove %gs:pda_irqstackptr,%rsp
-       push  %rbp                      # backlink for old unwinder
-       call __do_softirq
-       leaveq
-       CFI_DEF_CFA_REGISTER    rsp
-       CFI_ADJUST_CFA_OFFSET   -8
-       decl %gs:pda_irqcount
-       ret
-       CFI_ENDPROC
-ENDPROC(call_softirq)
-
-KPROBE_ENTRY(ignore_sysret)
-       CFI_STARTPROC
-       mov $-ENOSYS,%eax
-       sysret
-       CFI_ENDPROC
-ENDPROC(ignore_sysret)
diff --git a/arch/x86_64/kernel/genapic_64.c b/arch/x86_64/kernel/genapic_64.c
deleted file mode 100644 (file)
index 47496a4..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
- *
- * Generic APIC sub-arch probe layer.
- *
- * Hacked for x86-64 by James Cleverdon from i386 architecture code by
- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
- * James Cleverdon.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-
-#include <asm/smp.h>
-#include <asm/ipi.h>
-#include <asm/genapic.h>
-
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-
-/* which logical CPU number maps to which CPU (physical APIC ID) */
-u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
-                                       = { [0 ... NR_CPUS-1] = BAD_APICID };
-EXPORT_SYMBOL(x86_cpu_to_apicid);
-
-u8 x86_cpu_to_log_apicid[NR_CPUS]      = { [0 ... NR_CPUS-1] = BAD_APICID };
-
-struct genapic __read_mostly *genapic = &apic_flat;
-
-/*
- * Select genapic from the ACPI FADT flags and the possible-CPU count.
- */
-void __init setup_apic_routing(void)
-{
-#ifdef CONFIG_ACPI
-       /*
-        * Quirk: some x86_64 machines can only use physical APIC mode
-        * regardless of how many processors are present (x86_64 ES7000
-        * is an example).
-        */
-       if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
-                       (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
-               genapic = &apic_physflat;
-       else
-#endif
-
-       if (cpus_weight(cpu_possible_map) <= 8)
-               genapic = &apic_flat;
-       else
-               genapic = &apic_physflat;
-
-       printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
-}
-
-/* Send @vector using the APIC_DEST_SELF shortcut. Same for both flat and physical. */
-
-void send_IPI_self(int vector)
-{
-       __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
-}
diff --git a/arch/x86_64/kernel/genapic_flat_64.c b/arch/x86_64/kernel/genapic_flat_64.c
deleted file mode 100644 (file)
index ecb01ee..0000000
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * Copyright 2004 James Cleverdon, IBM.
- * Subject to the GNU Public License, v.2
- *
- * Flat APIC subarch code.
- *
- * Hacked for x86-64 by James Cleverdon from i386 architecture code by
- * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
- * James Cleverdon.
- */
-#include <linux/errno.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/init.h>
-#include <asm/smp.h>
-#include <asm/ipi.h>
-#include <asm/genapic.h>
-
-static cpumask_t flat_target_cpus(void)
-{
-       return cpu_online_map;
-}
-
-static cpumask_t flat_vector_allocation_domain(int cpu)
-{
-       /* Careful. Some cpus do not strictly honor the set of cpus
-        * specified in the interrupt destination when using lowest
-        * priority interrupt delivery mode.
-        *
-        * In particular there was a hyperthreading cpu observed to
-        * deliver interrupts to the wrong hyperthread when only one
-        * hyperthread was specified in the interrupt desitination.
-        */
-       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-       return domain;
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116).  So here it goes...
- */
-static void flat_init_apic_ldr(void)
-{
-       unsigned long val;
-       unsigned long num, id;
-
-       num = smp_processor_id();
-       id = 1UL << num;
-       x86_cpu_to_log_apicid[num] = id;
-       apic_write(APIC_DFR, APIC_DFR_FLAT);
-       val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
-       val |= SET_APIC_LOGICAL_ID(id);
-       apic_write(APIC_LDR, val);
-}
-
-static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
-{
-       unsigned long mask = cpus_addr(cpumask)[0];
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
-       local_irq_restore(flags);
-}
-
-static void flat_send_IPI_allbutself(int vector)
-{
-#ifdef CONFIG_HOTPLUG_CPU
-       int hotplug = 1;
-#else
-       int hotplug = 0;
-#endif
-       if (hotplug || vector == NMI_VECTOR) {
-               cpumask_t allbutme = cpu_online_map;
-
-               cpu_clear(smp_processor_id(), allbutme);
-
-               if (!cpus_empty(allbutme))
-                       flat_send_IPI_mask(allbutme, vector);
-       } else if (num_online_cpus() > 1) {
-               __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
-       }
-}
-
-static void flat_send_IPI_all(int vector)
-{
-       if (vector == NMI_VECTOR)
-               flat_send_IPI_mask(cpu_online_map, vector);
-       else
-               __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
-}
-
-static int flat_apic_id_registered(void)
-{
-       return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
-}
-
-static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
-{
-       return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
-}
-
-static unsigned int phys_pkg_id(int index_msb)
-{
-       return hard_smp_processor_id() >> index_msb;
-}
-
-struct genapic apic_flat =  {
-       .name = "flat",
-       .int_delivery_mode = dest_LowestPrio,
-       .int_dest_mode = (APIC_DEST_LOGICAL != 0),
-       .target_cpus = flat_target_cpus,
-       .vector_allocation_domain = flat_vector_allocation_domain,
-       .apic_id_registered = flat_apic_id_registered,
-       .init_apic_ldr = flat_init_apic_ldr,
-       .send_IPI_all = flat_send_IPI_all,
-       .send_IPI_allbutself = flat_send_IPI_allbutself,
-       .send_IPI_mask = flat_send_IPI_mask,
-       .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
-       .phys_pkg_id = phys_pkg_id,
-};
-
-/*
- * Physflat mode is used when there are more than 8 CPUs on a AMD system.
- * We cannot use logical delivery in this case because the mask
- * overflows, so use physical mode.
- */
-
-static cpumask_t physflat_target_cpus(void)
-{
-       return cpu_online_map;
-}
-
-static cpumask_t physflat_vector_allocation_domain(int cpu)
-{
-       cpumask_t domain = CPU_MASK_NONE;
-       cpu_set(cpu, domain);
-       return domain;
-}
-
-
-static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
-{
-       send_IPI_mask_sequence(cpumask, vector);
-}
-
-static void physflat_send_IPI_allbutself(int vector)
-{
-       cpumask_t allbutme = cpu_online_map;
-
-       cpu_clear(smp_processor_id(), allbutme);
-       physflat_send_IPI_mask(allbutme, vector);
-}
-
-static void physflat_send_IPI_all(int vector)
-{
-       physflat_send_IPI_mask(cpu_online_map, vector);
-}
-
-static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
-{
-       int cpu;
-
-       /*
-        * We're using fixed IRQ delivery, can only return one phys APIC ID.
-        * May as well be the first.
-        */
-       cpu = first_cpu(cpumask);
-       if ((unsigned)cpu < NR_CPUS)
-               return x86_cpu_to_apicid[cpu];
-       else
-               return BAD_APICID;
-}
-
-struct genapic apic_physflat =  {
-       .name = "physical flat",
-       .int_delivery_mode = dest_Fixed,
-       .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
-       .target_cpus = physflat_target_cpus,
-       .vector_allocation_domain = physflat_vector_allocation_domain,
-       .apic_id_registered = flat_apic_id_registered,
-       .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
-       .send_IPI_all = physflat_send_IPI_all,
-       .send_IPI_allbutself = physflat_send_IPI_allbutself,
-       .send_IPI_mask = physflat_send_IPI_mask,
-       .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
-       .phys_pkg_id = phys_pkg_id,
-};
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
deleted file mode 100644 (file)
index 6c34bdd..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- *  linux/arch/x86_64/kernel/head64.c -- prepare to run common code
- *
- *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
- */
-
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/percpu.h>
-
-#include <asm/processor.h>
-#include <asm/proto.h>
-#include <asm/smp.h>
-#include <asm/bootsetup.h>
-#include <asm/setup.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/sections.h>
-
-static void __init zap_identity_mappings(void)
-{
-       pgd_t *pgd = pgd_offset_k(0UL);
-       pgd_clear(pgd);
-       __flush_tlb();
-}
-
-/* Don't add a printk in there. printk relies on the PDA which is not initialized 
-   yet. */
-static void __init clear_bss(void)
-{
-       memset(__bss_start, 0,
-              (unsigned long) __bss_stop - (unsigned long) __bss_start);
-}
-
-#define NEW_CL_POINTER         0x228   /* Relative to real mode data */
-#define OLD_CL_MAGIC_ADDR      0x20
-#define OLD_CL_MAGIC            0xA33F
-#define OLD_CL_OFFSET           0x22
-
-static void __init copy_bootdata(char *real_mode_data)
-{
-       unsigned long new_data;
-       char * command_line;
-
-       memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
-       new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
-       if (!new_data) {
-               if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
-                       return;
-               }
-               new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
-       }
-       command_line = __va(new_data);
-       memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
-}
-
-void __init x86_64_start_kernel(char * real_mode_data)
-{
-       int i;
-
-       /* clear bss before set_intr_gate with early_idt_handler */
-       clear_bss();
-
-       /* Make NULL pointers segfault */
-       zap_identity_mappings();
-
-       for (i = 0; i < IDT_ENTRIES; i++)
-               set_intr_gate(i, early_idt_handler);
-       asm volatile("lidt %0" :: "m" (idt_descr));
-
-       early_printk("Kernel alive\n");
-
-       for (i = 0; i < NR_CPUS; i++)
-               cpu_pda(i) = &boot_cpu_pda[i];
-
-       pda_init(0);
-       copy_bootdata(__va(real_mode_data));
-#ifdef CONFIG_SMP
-       cpu_set(0, cpu_online_map);
-#endif
-       start_kernel();
-}
diff --git a/arch/x86_64/kernel/head_64.S b/arch/x86_64/kernel/head_64.S
deleted file mode 100644 (file)
index b6167fe..0000000
+++ /dev/null
@@ -1,416 +0,0 @@
-/*
- *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
- *
- *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
- *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
- *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
- *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
- *  Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
- */
-
-
-#include <linux/linkage.h>
-#include <linux/threads.h>
-#include <linux/init.h>
-#include <asm/desc.h>
-#include <asm/segment.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/msr.h>
-#include <asm/cache.h>
-
-/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
- * because we need identity-mapped pages.
- *
- */
-
-       .text
-       .section .text.head
-       .code64
-       .globl startup_64
-startup_64:
-
-       /*
-        * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
-        * and someone has loaded an identity mapped page table
-        * for us.  These identity mapped page tables map all of the
-        * kernel pages and possibly all of memory.
-        *
-        * %esi holds a physical pointer to real_mode_data.
-        *
-        * We come here either directly from a 64bit bootloader, or from
-        * arch/x86_64/boot/compressed/head.S.
-        *
-        * We only come here initially at boot nothing else comes here.
-        *
-        * Since we may be loaded at an address different from what we were
-        * compiled to run at we first fixup the physical addresses in our page
-        * tables and then reload them.
-        */
-
-       /* Compute the delta between the address I am compiled to run at and the
-        * address I am actually running at.
-        */
-       leaq    _text(%rip), %rbp
-       subq    $_text - __START_KERNEL_map, %rbp
-
-       /* Is the address not 2M aligned? */
-       movq    %rbp, %rax
-       andl    $~LARGE_PAGE_MASK, %eax
-       testl   %eax, %eax
-       jnz     bad_address
-
-       /* Is the address too large? */
-       leaq    _text(%rip), %rdx
-       movq    $PGDIR_SIZE, %rax
-       cmpq    %rax, %rdx
-       jae     bad_address
-
-       /* Fixup the physical addresses in the page table
-        */
-       addq    %rbp, init_level4_pgt + 0(%rip)
-       addq    %rbp, init_level4_pgt + (258*8)(%rip)
-       addq    %rbp, init_level4_pgt + (511*8)(%rip)
-
-       addq    %rbp, level3_ident_pgt + 0(%rip)
-
-       addq    %rbp, level3_kernel_pgt + (510*8)(%rip)
-       addq    %rbp, level3_kernel_pgt + (511*8)(%rip)
-
-       addq    %rbp, level2_fixmap_pgt + (506*8)(%rip)
-
-       /* Add an Identity mapping if I am above 1G */
-       leaq    _text(%rip), %rdi
-       andq    $LARGE_PAGE_MASK, %rdi
-
-       movq    %rdi, %rax
-       shrq    $PUD_SHIFT, %rax
-       andq    $(PTRS_PER_PUD - 1), %rax
-       jz      ident_complete
-
-       leaq    (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
-       leaq    level3_ident_pgt(%rip), %rbx
-       movq    %rdx, 0(%rbx, %rax, 8)
-
-       movq    %rdi, %rax
-       shrq    $PMD_SHIFT, %rax
-       andq    $(PTRS_PER_PMD - 1), %rax
-       leaq    __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
-       leaq    level2_spare_pgt(%rip), %rbx
-       movq    %rdx, 0(%rbx, %rax, 8)
-ident_complete:
-
-       /* Fixup the kernel text+data virtual addresses
-        */
-       leaq    level2_kernel_pgt(%rip), %rdi
-       leaq    4096(%rdi), %r8
-       /* See if it is a valid page table entry */
-1:     testq   $1, 0(%rdi)
-       jz      2f
-       addq    %rbp, 0(%rdi)
-       /* Go to the next page */
-2:     addq    $8, %rdi
-       cmp     %r8, %rdi
-       jne     1b
-
-       /* Fixup phys_base */
-       addq    %rbp, phys_base(%rip)
-
-#ifdef CONFIG_SMP
-       addq    %rbp, trampoline_level4_pgt + 0(%rip)
-       addq    %rbp, trampoline_level4_pgt + (511*8)(%rip)
-#endif
-#ifdef CONFIG_ACPI_SLEEP
-       addq    %rbp, wakeup_level4_pgt + 0(%rip)
-       addq    %rbp, wakeup_level4_pgt + (511*8)(%rip)
-#endif
-
-       /* Due to ENTRY(), sometimes the empty space gets filled with
-        * zeros. Better take a jmp than relying on empty space being
-        * filled with 0x90 (nop)
-        */
-       jmp secondary_startup_64
-ENTRY(secondary_startup_64)
-       /*
-        * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
-        * and someone has loaded a mapped page table.
-        *
-        * %esi holds a physical pointer to real_mode_data.
-        *
-        * We come here either from startup_64 (using physical addresses)
-        * or from trampoline.S (using virtual addresses).
-        *
-        * Using virtual addresses from trampoline.S removes the need
-        * to have any identity mapped pages in the kernel page table
-        * after the boot processor executes this code.
-        */
-
-       /* Enable PAE mode and PGE */
-       xorq    %rax, %rax
-       btsq    $5, %rax
-       btsq    $7, %rax
-       movq    %rax, %cr4
-
-       /* Setup early boot stage 4 level pagetables. */
-       movq    $(init_level4_pgt - __START_KERNEL_map), %rax
-       addq    phys_base(%rip), %rax
-       movq    %rax, %cr3
-
-       /* Ensure I am executing from virtual addresses */
-       movq    $1f, %rax
-       jmp     *%rax
-1:
-
-       /* Check if nx is implemented */
-       movl    $0x80000001, %eax
-       cpuid
-       movl    %edx,%edi
-
-       /* Setup EFER (Extended Feature Enable Register) */
-       movl    $MSR_EFER, %ecx
-       rdmsr
-       btsl    $_EFER_SCE, %eax        /* Enable System Call */
-       btl     $20,%edi                /* No Execute supported? */
-       jnc     1f
-       btsl    $_EFER_NX, %eax
-1:     wrmsr                           /* Make changes effective */
-
-       /* Setup cr0 */
-#define CR0_PM                         1               /* protected mode */
-#define CR0_MP                         (1<<1)
-#define CR0_ET                         (1<<4)
-#define CR0_NE                         (1<<5)
-#define CR0_WP                         (1<<16)
-#define CR0_AM                         (1<<18)
-#define CR0_PAGING                     (1<<31)
-       movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
-       /* Make changes effective */
-       movq    %rax, %cr0
-
-       /* Setup a boot time stack */
-       movq init_rsp(%rip),%rsp
-
-       /* zero EFLAGS after setting rsp */
-       pushq $0
-       popfq
-
-       /*
-        * We must switch to a new descriptor in kernel space for the GDT
-        * because soon the kernel won't have access anymore to the userspace
-        * addresses where we're currently running on. We have to do that here
-        * because in 32bit we couldn't load a 64bit linear address.
-        */
-       lgdt    cpu_gdt_descr(%rip)
-
-       /* set up data segments. actually 0 would do too */
-       movl $__KERNEL_DS,%eax
-       movl %eax,%ds
-       movl %eax,%ss
-       movl %eax,%es
-
-       /*
-        * We don't really need to load %fs or %gs, but load them anyway
-        * to kill any stale realmode selectors.  This allows execution
-        * under VT hardware.
-        */
-       movl %eax,%fs
-       movl %eax,%gs
-
-       /* 
-        * Setup up a dummy PDA. this is just for some early bootup code
-        * that does in_interrupt() 
-        */ 
-       movl    $MSR_GS_BASE,%ecx
-       movq    $empty_zero_page,%rax
-       movq    %rax,%rdx
-       shrq    $32,%rdx
-       wrmsr   
-
-       /* esi is pointer to real mode structure with interesting info.
-          pass it to C */
-       movl    %esi, %edi
-       
-       /* Finally jump to run C code and to be on real kernel address
-        * Since we are running on identity-mapped space we have to jump
-        * to the full 64bit address, this is only possible as indirect
-        * jump.  In addition we need to ensure %cs is set so we make this
-        * a far return.
-        */
-       movq    initial_code(%rip),%rax
-       pushq   $0              # fake return address to stop unwinder
-       pushq   $__KERNEL_CS    # set correct cs
-       pushq   %rax            # target address in negative space
-       lretq
-
-       /* SMP bootup changes these two */
-#ifndef CONFIG_HOTPLUG_CPU
-       .pushsection .init.data
-#endif
-       .align  8
-       .globl  initial_code
-initial_code:
-       .quad   x86_64_start_kernel
-#ifndef CONFIG_HOTPLUG_CPU
-       .popsection
-#endif
-       .globl init_rsp
-init_rsp:
-       .quad  init_thread_union+THREAD_SIZE-8
-
-bad_address:
-       jmp bad_address
-
-ENTRY(early_idt_handler)
-       cmpl $2,early_recursion_flag(%rip)
-       jz  1f
-       incl early_recursion_flag(%rip)
-       xorl %eax,%eax
-       movq 8(%rsp),%rsi       # get rip
-       movq (%rsp),%rdx
-       movq %cr2,%rcx
-       leaq early_idt_msg(%rip),%rdi
-       call early_printk
-       cmpl $2,early_recursion_flag(%rip)
-       jz  1f
-       call dump_stack
-#ifdef CONFIG_KALLSYMS 
-       leaq early_idt_ripmsg(%rip),%rdi
-       movq 8(%rsp),%rsi       # get rip again
-       call __print_symbol
-#endif
-1:     hlt
-       jmp 1b
-early_recursion_flag:
-       .long 0
-
-early_idt_msg:
-       .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
-early_idt_ripmsg:
-       .asciz "RIP %s\n"
-
-.balign PAGE_SIZE
-
-#define NEXT_PAGE(name) \
-       .balign PAGE_SIZE; \
-ENTRY(name)
-
-/* Automate the creation of 1 to 1 mapping pmd entries */
-#define PMDS(START, PERM, COUNT)               \
-       i = 0 ;                                 \
-       .rept (COUNT) ;                         \
-       .quad   (START) + (i << 21) + (PERM) ;  \
-       i = i + 1 ;                             \
-       .endr
-
-       /*
-        * This default setting generates an ident mapping at address 0x100000
-        * and a mapping for the kernel that precisely maps virtual address
-        * 0xffffffff80000000 to physical address 0x000000. (always using
-        * 2Mbyte large pages provided by PAE mode)
-        */
-NEXT_PAGE(init_level4_pgt)
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .fill   257,8,0
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .fill   252,8,0
-       /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
-
-NEXT_PAGE(level3_ident_pgt)
-       .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .fill   511,8,0
-
-NEXT_PAGE(level3_kernel_pgt)
-       .fill   510,8,0
-       /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-       .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-
-NEXT_PAGE(level2_fixmap_pgt)
-       .fill   506,8,0
-       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-       /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
-       .fill   5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
-       .fill   512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
-       /* Since I easily can, map the first 1G.
-        * Don't set NX because code runs from these pages.
-        */
-       PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
-
-NEXT_PAGE(level2_kernel_pgt)
-       /* 40MB kernel mapping. The kernel code cannot be bigger than that.
-          When you change this change KERNEL_TEXT_SIZE in page.h too. */
-       /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
-       PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE)
-       /* Module mapping starts here */
-       .fill   (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
-
-NEXT_PAGE(level2_spare_pgt)
-       .fill   512,8,0
-
-#undef PMDS
-#undef NEXT_PAGE
-
-       .data
-       .align 16
-       .globl cpu_gdt_descr
-cpu_gdt_descr:
-       .word   gdt_end-cpu_gdt_table-1
-gdt:
-       .quad   cpu_gdt_table
-#ifdef CONFIG_SMP
-       .rept   NR_CPUS-1
-       .word   0
-       .quad   0
-       .endr
-#endif
-
-ENTRY(phys_base)
-       /* This must match the first entry in level2_kernel_pgt */
-       .quad   0x0000000000000000
-
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types  kkeil 2000/10/28
- * Also sysret mandates a special GDT layout 
- */
-                               
-       .section .data.page_aligned, "aw"
-       .align PAGE_SIZE
-
-/* The TLS descriptors are currently at a different place compared to i386.
-   Hopefully nobody expects them at a fixed place (Wine?) */
-       
-ENTRY(cpu_gdt_table)
-       .quad   0x0000000000000000      /* NULL descriptor */
-       .quad   0x00cf9b000000ffff      /* __KERNEL32_CS */
-       .quad   0x00af9b000000ffff      /* __KERNEL_CS */
-       .quad   0x00cf93000000ffff      /* __KERNEL_DS */
-       .quad   0x00cffb000000ffff      /* __USER32_CS */
-       .quad   0x00cff3000000ffff      /* __USER_DS, __USER32_DS  */
-       .quad   0x00affb000000ffff      /* __USER_CS */
-       .quad   0x0                     /* unused */
-       .quad   0,0                     /* TSS */
-       .quad   0,0                     /* LDT */
-       .quad   0,0,0                   /* three TLS descriptors */ 
-       .quad   0x0000f40000000000      /* node/CPU stored in limit */
-gdt_end:       
-       /* asm/segment.h:GDT_ENTRIES must match this */ 
-       /* This should be a multiple of the cache line size */
-       /* GDTs of other CPUs are now dynamically allocated */
-
-       /* zero the remaining page */
-       .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0
-
-       .section .bss, "aw", @nobits
-       .align L1_CACHE_BYTES
-ENTRY(idt_table)
-       .skip 256 * 16
-
-       .section .bss.page_aligned, "aw", @nobits
-       .align PAGE_SIZE
-ENTRY(empty_zero_page)
-       .skip PAGE_SIZE
diff --git a/arch/x86_64/kernel/hpet_64.c b/arch/x86_64/kernel/hpet_64.c
deleted file mode 100644 (file)
index e2d1b91..0000000
+++ /dev/null
@@ -1,493 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/mc146818rtc.h>
-#include <linux/time.h>
-#include <linux/clocksource.h>
-#include <linux/ioport.h>
-#include <linux/acpi.h>
-#include <linux/hpet.h>
-#include <asm/pgtable.h>
-#include <asm/vsyscall.h>
-#include <asm/timex.h>
-#include <asm/hpet.h>
-
-#define HPET_MASK      0xFFFFFFFF
-#define HPET_SHIFT     22
-
-/* FSEC = 10^-15 NSEC = 10^-9 */
-#define FSEC_PER_NSEC  1000000
-
-int nohpet __initdata;
-
-unsigned long hpet_address;
-unsigned long hpet_period;     /* fsecs / HPET clock */
-unsigned long hpet_tick;       /* HPET clocks / interrupt */
-
-int hpet_use_timer;            /* Use counter of hpet for time keeping,
-                                * otherwise PIT
-                                */
-
-#ifdef CONFIG_HPET
-static __init int late_hpet_init(void)
-{
-       struct hpet_data        hd;
-       unsigned int            ntimer;
-
-       if (!hpet_address)
-               return 0;
-
-       memset(&hd, 0, sizeof(hd));
-
-       ntimer = hpet_readl(HPET_ID);
-       ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
-       ntimer++;
-
-       /*
-        * Register with driver.
-        * Timer0 and Timer1 is used by platform.
-        */
-       hd.hd_phys_address = hpet_address;
-       hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE);
-       hd.hd_nirqs = ntimer;
-       hd.hd_flags = HPET_DATA_PLATFORM;
-       hpet_reserve_timer(&hd, 0);
-#ifdef CONFIG_HPET_EMULATE_RTC
-       hpet_reserve_timer(&hd, 1);
-#endif
-       hd.hd_irq[0] = HPET_LEGACY_8254;
-       hd.hd_irq[1] = HPET_LEGACY_RTC;
-       if (ntimer > 2) {
-               struct hpet             *hpet;
-               struct hpet_timer       *timer;
-               int                     i;
-
-               hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE);
-               timer = &hpet->hpet_timers[2];
-               for (i = 2; i < ntimer; timer++, i++)
-                       hd.hd_irq[i] = (timer->hpet_config &
-                                       Tn_INT_ROUTE_CNF_MASK) >>
-                               Tn_INT_ROUTE_CNF_SHIFT;
-
-       }
-
-       hpet_alloc(&hd);
-       return 0;
-}
-fs_initcall(late_hpet_init);
-#endif
-
-int hpet_timer_stop_set_go(unsigned long tick)
-{
-       unsigned int cfg;
-
-/*
- * Stop the timers and reset the main counter.
- */
-
-       cfg = hpet_readl(HPET_CFG);
-       cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
-       hpet_writel(cfg, HPET_CFG);
-       hpet_writel(0, HPET_COUNTER);
-       hpet_writel(0, HPET_COUNTER + 4);
-
-/*
- * Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
- * and period also hpet_tick.
- */
-       if (hpet_use_timer) {
-               hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
-                   HPET_TN_32BIT, HPET_T0_CFG);
-               hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
-               hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
-               cfg |= HPET_CFG_LEGACY;
-       }
-/*
- * Go!
- */
-
-       cfg |= HPET_CFG_ENABLE;
-       hpet_writel(cfg, HPET_CFG);
-
-       return 0;
-}
-
-static cycle_t read_hpet(void)
-{
-       return (cycle_t)hpet_readl(HPET_COUNTER);
-}
-
-static cycle_t __vsyscall_fn vread_hpet(void)
-{
-       return readl((void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
-}
-
-struct clocksource clocksource_hpet = {
-       .name           = "hpet",
-       .rating         = 250,
-       .read           = read_hpet,
-       .mask           = (cycle_t)HPET_MASK,
-       .mult           = 0, /* set below */
-       .shift          = HPET_SHIFT,
-       .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
-       .vread          = vread_hpet,
-};
-
-int __init hpet_arch_init(void)
-{
-       unsigned int id;
-       u64 tmp;
-
-       if (!hpet_address)
-               return -1;
-       set_fixmap_nocache(FIX_HPET_BASE, hpet_address);
-       __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
-
-/*
- * Read the period, compute tick and quotient.
- */
-
-       id = hpet_readl(HPET_ID);
-
-       if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
-               return -1;
-
-       hpet_period = hpet_readl(HPET_PERIOD);
-       if (hpet_period < 100000 || hpet_period > 100000000)
-               return -1;
-
-       hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
-
-       hpet_use_timer = (id & HPET_ID_LEGSUP);
-
-       /*
-        * hpet period is in femto seconds per cycle
-        * so we need to convert this to ns/cyc units
-        * aproximated by mult/2^shift
-        *
-        *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
-        *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult
-        *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
-        *  (fsec/cyc << shift)/1000000 = mult
-        *  (hpet_period << shift)/FSEC_PER_NSEC = mult
-        */
-       tmp = (u64)hpet_period << HPET_SHIFT;
-       do_div(tmp, FSEC_PER_NSEC);
-       clocksource_hpet.mult = (u32)tmp;
-       clocksource_register(&clocksource_hpet);
-
-       return hpet_timer_stop_set_go(hpet_tick);
-}
-
-int hpet_reenable(void)
-{
-       return hpet_timer_stop_set_go(hpet_tick);
-}
-
-/*
- * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing
- * it to the HPET timer of known frequency.
- */
-
-#define TICK_COUNT 100000000
-#define SMI_THRESHOLD 50000
-#define MAX_TRIES  5
-
-/*
- * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none
- * occurs between the reads of the hpet & TSC.
- */
-static void __init read_hpet_tsc(int *hpet, int *tsc)
-{
-       int tsc1, tsc2, hpet1, i;
-
-       for (i = 0; i < MAX_TRIES; i++) {
-               tsc1 = get_cycles_sync();
-               hpet1 = hpet_readl(HPET_COUNTER);
-               tsc2 = get_cycles_sync();
-               if ((tsc2 - tsc1) < SMI_THRESHOLD)
-                       break;
-       }
-       *hpet = hpet1;
-       *tsc = tsc2;
-}
-
-unsigned int __init hpet_calibrate_tsc(void)
-{
-       int tsc_start, hpet_start;
-       int tsc_now, hpet_now;
-       unsigned long flags;
-
-       local_irq_save(flags);
-
-       read_hpet_tsc(&hpet_start, &tsc_start);
-
-       do {
-               local_irq_disable();
-               read_hpet_tsc(&hpet_now, &tsc_now);
-               local_irq_restore(flags);
-       } while ((tsc_now - tsc_start) < TICK_COUNT &&
-               (hpet_now - hpet_start) < TICK_COUNT);
-
-       return (tsc_now - tsc_start) * 1000000000L
-               / ((hpet_now - hpet_start) * hpet_period / 1000);
-}
-
-#ifdef CONFIG_HPET_EMULATE_RTC
-/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
- * is enabled, we support RTC interrupt functionality in software.
- * RTC has 3 kinds of interrupts:
- * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
- *    is updated
- * 2) Alarm Interrupt - generate an interrupt at a specific time of day
- * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
- *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
- * (1) and (2) above are implemented using polling at a frequency of
- * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
- * overhead. (DEFAULT_RTC_INT_FREQ)
- * For (3), we use interrupts at 64Hz or user specified periodic
- * frequency, whichever is higher.
- */
-#include <linux/rtc.h>
-
-#define DEFAULT_RTC_INT_FREQ   64
-#define RTC_NUM_INTS           1
-
-static unsigned long UIE_on;
-static unsigned long prev_update_sec;
-
-static unsigned long AIE_on;
-static struct rtc_time alarm_time;
-
-static unsigned long PIE_on;
-static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ;
-static unsigned long PIE_count;
-
-static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */
-static unsigned int hpet_t1_cmp; /* cached comparator register */
-
-int is_hpet_enabled(void)
-{
-       return hpet_address != 0;
-}
-
-/*
- * Timer 1 for RTC, we do not use periodic interrupt feature,
- * even if HPET supports periodic interrupts on Timer 1.
- * The reason being, to set up a periodic interrupt in HPET, we need to
- * stop the main counter. And if we do that everytime someone diables/enables
- * RTC, we will have adverse effect on main kernel timer running on Timer 0.
- * So, for the time being, simulate the periodic interrupt in software.
- *
- * hpet_rtc_timer_init() is called for the first time and during subsequent
- * interuppts reinit happens through hpet_rtc_timer_reinit().
- */
-int hpet_rtc_timer_init(void)
-{
-       unsigned int cfg, cnt;
-       unsigned long flags;
-
-       if (!is_hpet_enabled())
-               return 0;
-       /*
-        * Set the counter 1 and enable the interrupts.
-        */
-       if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
-               hpet_rtc_int_freq = PIE_freq;
-       else
-               hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
-
-       local_irq_save(flags);
-
-       cnt = hpet_readl(HPET_COUNTER);
-       cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
-       hpet_writel(cnt, HPET_T1_CMP);
-       hpet_t1_cmp = cnt;
-
-       cfg = hpet_readl(HPET_T1_CFG);
-       cfg &= ~HPET_TN_PERIODIC;
-       cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
-       hpet_writel(cfg, HPET_T1_CFG);
-
-       local_irq_restore(flags);
-
-       return 1;
-}
-
-static void hpet_rtc_timer_reinit(void)
-{
-       unsigned int cfg, cnt, ticks_per_int, lost_ints;
-
-       if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
-               cfg = hpet_readl(HPET_T1_CFG);
-               cfg &= ~HPET_TN_ENABLE;
-               hpet_writel(cfg, HPET_T1_CFG);
-               return;
-       }
-
-       if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ))
-               hpet_rtc_int_freq = PIE_freq;
-       else
-               hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
-
-       /* It is more accurate to use the comparator value than current count.*/
-       ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
-       hpet_t1_cmp += ticks_per_int;
-       hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
-
-       /*
-        * If the interrupt handler was delayed too long, the write above tries
-        * to schedule the next interrupt in the past and the hardware would
-        * not interrupt until the counter had wrapped around.
-        * So we have to check that the comparator wasn't set to a past time.
-        */
-       cnt = hpet_readl(HPET_COUNTER);
-       if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
-               lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
-               /* Make sure that, even with the time needed to execute
-                * this code, the next scheduled interrupt has been moved
-                * back to the future: */
-               lost_ints++;
-
-               hpet_t1_cmp += lost_ints * ticks_per_int;
-               hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
-
-               if (PIE_on)
-                       PIE_count += lost_ints;
-
-               if (printk_ratelimit())
-                       printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
-                              hpet_rtc_int_freq);
-       }
-}
-
-/*
- * The functions below are called from rtc driver.
- * Return 0 if HPET is not being used.
- * Otherwise do the necessary changes and return 1.
- */
-int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
-{
-       if (!is_hpet_enabled())
-               return 0;
-
-       if (bit_mask & RTC_UIE)
-               UIE_on = 0;
-       if (bit_mask & RTC_PIE)
-               PIE_on = 0;
-       if (bit_mask & RTC_AIE)
-               AIE_on = 0;
-
-       return 1;
-}
-
-int hpet_set_rtc_irq_bit(unsigned long bit_mask)
-{
-       int timer_init_reqd = 0;
-
-       if (!is_hpet_enabled())
-               return 0;
-
-       if (!(PIE_on | AIE_on | UIE_on))
-               timer_init_reqd = 1;
-
-       if (bit_mask & RTC_UIE) {
-               UIE_on = 1;
-       }
-       if (bit_mask & RTC_PIE) {
-               PIE_on = 1;
-               PIE_count = 0;
-       }
-       if (bit_mask & RTC_AIE) {
-               AIE_on = 1;
-       }
-
-       if (timer_init_reqd)
-               hpet_rtc_timer_init();
-
-       return 1;
-}
-
-int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
-{
-       if (!is_hpet_enabled())
-               return 0;
-
-       alarm_time.tm_hour = hrs;
-       alarm_time.tm_min = min;
-       alarm_time.tm_sec = sec;
-
-       return 1;
-}
-
-int hpet_set_periodic_freq(unsigned long freq)
-{
-       if (!is_hpet_enabled())
-               return 0;
-
-       PIE_freq = freq;
-       PIE_count = 0;
-
-       return 1;
-}
-
-int hpet_rtc_dropped_irq(void)
-{
-       if (!is_hpet_enabled())
-               return 0;
-
-       return 1;
-}
-
-irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
-{
-       struct rtc_time curr_time;
-       unsigned long rtc_int_flag = 0;
-       int call_rtc_interrupt = 0;
-
-       hpet_rtc_timer_reinit();
-
-       if (UIE_on | AIE_on) {
-               rtc_get_rtc_time(&curr_time);
-       }
-       if (UIE_on) {
-               if (curr_time.tm_sec != prev_update_sec) {
-                       /* Set update int info, call real rtc int routine */
-                       call_rtc_interrupt = 1;
-                       rtc_int_flag = RTC_UF;
-                       prev_update_sec = curr_time.tm_sec;
-               }
-       }
-       if (PIE_on) {
-               PIE_count++;
-               if (PIE_count >= hpet_rtc_int_freq/PIE_freq) {
-                       /* Set periodic int info, call real rtc int routine */
-                       call_rtc_interrupt = 1;
-                       rtc_int_flag |= RTC_PF;
-                       PIE_count = 0;
-               }
-       }
-       if (AIE_on) {
-               if ((curr_time.tm_sec == alarm_time.tm_sec) &&
-                   (curr_time.tm_min == alarm_time.tm_min) &&
-                   (curr_time.tm_hour == alarm_time.tm_hour)) {
-                       /* Set alarm int info, call real rtc int routine */
-                       call_rtc_interrupt = 1;
-                       rtc_int_flag |= RTC_AF;
-               }
-       }
-       if (call_rtc_interrupt) {
-               rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
-               rtc_interrupt(rtc_int_flag, dev_id);
-       }
-       return IRQ_HANDLED;
-}
-#endif
-
-static int __init nohpet_setup(char *s)
-{
-       nohpet = 1;
-       return 1;
-}
-
-__setup("nohpet", nohpet_setup);
diff --git a/arch/x86_64/kernel/i387_64.c b/arch/x86_64/kernel/i387_64.c
deleted file mode 100644 (file)
index 1d58c13..0000000
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- *  linux/arch/x86_64/kernel/i387.c
- *
- *  Copyright (C) 1994 Linus Torvalds
- *  Copyright (C) 2002 Andi Kleen, SuSE Labs
- *
- *  Pentium III FXSR, SSE support
- *  General FPU state handling cleanups
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- * 
- *  x86-64 rework 2002 Andi Kleen. 
- *  Does direct fxsave in and out of user space now for signal handlers.
- *  All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
- *  the 64bit user space sees a FXSAVE frame directly. 
- */
-
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/ptrace.h>
-#include <asm/uaccess.h>
-
-unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
-
-void mxcsr_feature_mask_init(void)
-{
-       unsigned int mask;
-       clts();
-       memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
-       asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
-       mask = current->thread.i387.fxsave.mxcsr_mask;
-       if (mask == 0) mask = 0x0000ffbf;
-       mxcsr_feature_mask &= mask;
-       stts();
-}
-
-/*
- * Called at bootup to set up the initial FPU state that is later cloned
- * into all processes.
- */
-void __cpuinit fpu_init(void)
-{
-       unsigned long oldcr0 = read_cr0();
-       extern void __bad_fxsave_alignment(void);
-               
-       if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
-               __bad_fxsave_alignment();
-       set_in_cr4(X86_CR4_OSFXSR);
-       set_in_cr4(X86_CR4_OSXMMEXCPT);
-
-       write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
-
-       mxcsr_feature_mask_init();
-       /* clean state in init */
-       current_thread_info()->status = 0;
-       clear_used_math();
-}
-
-void init_fpu(struct task_struct *child)
-{
-       if (tsk_used_math(child)) {
-               if (child == current)
-                       unlazy_fpu(child);
-               return;
-       }       
-       memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
-       child->thread.i387.fxsave.cwd = 0x37f;
-       child->thread.i387.fxsave.mxcsr = 0x1f80;
-       /* only the device not available exception or ptrace can call init_fpu */
-       set_stopped_child_used_math(child);
-}
-
-/*
- * Signal frame handlers.
- */
-
-int save_i387(struct _fpstate __user *buf)
-{
-       struct task_struct *tsk = current;
-       int err = 0;
-
-       BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
-                       sizeof(tsk->thread.i387.fxsave));
-
-       if ((unsigned long)buf % 16) 
-               printk("save_i387: bad fpstate %p\n",buf); 
-
-       if (!used_math())
-               return 0;
-       clear_used_math(); /* trigger finit */
-       if (task_thread_info(tsk)->status & TS_USEDFPU) {
-               err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
-               if (err) return err;
-               stts();
-               } else {
-               if (__copy_to_user(buf, &tsk->thread.i387.fxsave, 
-                                  sizeof(struct i387_fxsave_struct)))
-                       return -1;
-       } 
-               return 1;
-}
-
-/*
- * ptrace request handlers.
- */
-
-int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
-{
-       init_fpu(tsk);
-       return __copy_to_user(buf, &tsk->thread.i387.fxsave,
-                              sizeof(struct user_i387_struct)) ? -EFAULT : 0;
-}
-
-int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
-{
-       if (__copy_from_user(&tsk->thread.i387.fxsave, buf, 
-                            sizeof(struct user_i387_struct)))
-               return -EFAULT;
-               return 0;
-}
-
-/*
- * FPU state for core dumps.
- */
-
-int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
-{
-       struct task_struct *tsk = current;
-
-       if (!used_math())
-               return 0;
-
-       unlazy_fpu(tsk);
-       memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); 
-       return 1; 
-}
-
-int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
-{
-       int fpvalid = !!tsk_used_math(tsk);
-
-       if (fpvalid) {
-               if (tsk == current)
-                       unlazy_fpu(tsk);
-               memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));         
-}
-       return fpvalid;
-}
diff --git a/arch/x86_64/kernel/i8259_64.c b/arch/x86_64/kernel/i8259_64.c
deleted file mode 100644 (file)
index 948cae6..0000000
+++ /dev/null
@@ -1,544 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-
-#include <asm/acpi.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-
-/*
- * Common place to define all x86 IRQ vectors
- *
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
- *
- * These macros create the low-level assembly IRQ routines that save
- * register context and call do_IRQ(). do_IRQ() then does all the
- * operations that are needed to keep the AT (or SMP IOAPIC)
- * interrupt-controller happy.
- */
-
-#define BI(x,y) \
-       BUILD_IRQ(x##y)
-
-#define BUILD_16_IRQS(x) \
-       BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
-       BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
-       BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
-       BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these 
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-                                     BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
-
-#undef BUILD_16_IRQS
-#undef BI
-
-
-#define IRQ(x,y) \
-       IRQ##x##y##_interrupt
-
-#define IRQLIST_16(x) \
-       IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
-       IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
-       IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
-       IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-
-/* for the irq vectors */
-static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
-                                         IRQLIST_16(0x2), IRQLIST_16(0x3),
-       IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
-       IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
-       IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-};
-
-#undef IRQ
-#undef IRQLIST_16
-
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- * this file should become arch/i386/kernel/irq.c when the old irq.c
- * moves to arch independent land
- */
-
-static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-static struct irq_chip i8259A_chip = {
-       .name           = "XT-PIC",
-       .mask           = disable_8259A_irq,
-       .disable        = disable_8259A_irq,
-       .unmask         = enable_8259A_irq,
-       .mask_ack       = mask_and_ack_8259A,
-};
-
-/*
- * 8259A PIC functions to handle ISA devices:
- */
-
-/*
- * This contains the irq mask for both 8259A irq controllers,
- */
-static unsigned int cached_irq_mask = 0xffff;
-
-#define __byte(x,y)    (((unsigned char *)&(y))[x])
-#define cached_21      (__byte(0,cached_irq_mask))
-#define cached_A1      (__byte(1,cached_irq_mask))
-
-/*
- * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not really connected to any IO-APIC pin,
- * it's fed to the master 8259A's IR0 line only.
- *
- * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
- * this 'mixed mode' IRQ handling costs nothing because it's only used
- * at IRQ setup time.
- */
-unsigned long io_apic_irqs;
-
-void disable_8259A_irq(unsigned int irq)
-{
-       unsigned int mask = 1 << irq;
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-       cached_irq_mask |= mask;
-       if (irq & 8)
-               outb(cached_A1,0xA1);
-       else
-               outb(cached_21,0x21);
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-void enable_8259A_irq(unsigned int irq)
-{
-       unsigned int mask = ~(1 << irq);
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-       cached_irq_mask &= mask;
-       if (irq & 8)
-               outb(cached_A1,0xA1);
-       else
-               outb(cached_21,0x21);
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-int i8259A_irq_pending(unsigned int irq)
-{
-       unsigned int mask = 1<<irq;
-       unsigned long flags;
-       int ret;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-       if (irq < 8)
-               ret = inb(0x20) & mask;
-       else
-               ret = inb(0xA0) & (mask >> 8);
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-
-       return ret;
-}
-
-void make_8259A_irq(unsigned int irq)
-{
-       disable_irq_nosync(irq);
-       io_apic_irqs &= ~(1<<irq);
-       set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
-                                     "XT");
-       enable_irq(irq);
-}
-
-/*
- * This function assumes to be called rarely. Switching between
- * 8259A registers is slow.
- * This has to be protected by the irq controller spinlock
- * before being called.
- */
-static inline int i8259A_irq_real(unsigned int irq)
-{
-       int value;
-       int irqmask = 1<<irq;
-
-       if (irq < 8) {
-               outb(0x0B,0x20);                /* ISR register */
-               value = inb(0x20) & irqmask;
-               outb(0x0A,0x20);                /* back to the IRR register */
-               return value;
-       }
-       outb(0x0B,0xA0);                /* ISR register */
-       value = inb(0xA0) & (irqmask >> 8);
-       outb(0x0A,0xA0);                /* back to the IRR register */
-       return value;
-}
-
-/*
- * Careful! The 8259A is a fragile beast, it pretty
- * much _has_ to be done exactly like this (mask it
- * first, _then_ send the EOI, and the order of EOI
- * to the two 8259s is important!
- */
-static void mask_and_ack_8259A(unsigned int irq)
-{
-       unsigned int irqmask = 1 << irq;
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-       /*
-        * Lightweight spurious IRQ detection. We do not want
-        * to overdo spurious IRQ handling - it's usually a sign
-        * of hardware problems, so we only do the checks we can
-        * do without slowing down good hardware unnecessarily.
-        *
-        * Note that IRQ7 and IRQ15 (the two spurious IRQs
-        * usually resulting from the 8259A-1|2 PICs) occur
-        * even if the IRQ is masked in the 8259A. Thus we
-        * can check spurious 8259A IRQs without doing the
-        * quite slow i8259A_irq_real() call for every IRQ.
-        * This does not cover 100% of spurious interrupts,
-        * but should be enough to warn the user that there
-        * is something bad going on ...
-        */
-       if (cached_irq_mask & irqmask)
-               goto spurious_8259A_irq;
-       cached_irq_mask |= irqmask;
-
-handle_real_irq:
-       if (irq & 8) {
-               inb(0xA1);              /* DUMMY - (do we need this?) */
-               outb(cached_A1,0xA1);
-               outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
-               outb(0x62,0x20);        /* 'Specific EOI' to master-IRQ2 */
-       } else {
-               inb(0x21);              /* DUMMY - (do we need this?) */
-               outb(cached_21,0x21);
-               outb(0x60+irq,0x20);    /* 'Specific EOI' to master */
-       }
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-       return;
-
-spurious_8259A_irq:
-       /*
-        * this is the slow path - should happen rarely.
-        */
-       if (i8259A_irq_real(irq))
-               /*
-                * oops, the IRQ _is_ in service according to the
-                * 8259A - not spurious, go handle it.
-                */
-               goto handle_real_irq;
-
-       {
-               static int spurious_irq_mask;
-               /*
-                * At this point we can be sure the IRQ is spurious,
-                * lets ACK and report it. [once per IRQ]
-                */
-               if (!(spurious_irq_mask & irqmask)) {
-                       printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
-                       spurious_irq_mask |= irqmask;
-               }
-               atomic_inc(&irq_err_count);
-               /*
-                * Theoretically we do not have to handle this IRQ,
-                * but in Linux this does not cause problems and is
-                * simpler for us.
-                */
-               goto handle_real_irq;
-       }
-}
-
-void init_8259A(int auto_eoi)
-{
-       unsigned long flags;
-
-       i8259A_auto_eoi = auto_eoi;
-
-       spin_lock_irqsave(&i8259A_lock, flags);
-
-       outb(0xff, 0x21);       /* mask all of 8259A-1 */
-       outb(0xff, 0xA1);       /* mask all of 8259A-2 */
-
-       /*
-        * outb_p - this has to work on a wide range of PC hardware.
-        */
-       outb_p(0x11, 0x20);     /* ICW1: select 8259A-1 init */
-       outb_p(IRQ0_VECTOR, 0x21);      /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
-       outb_p(0x04, 0x21);     /* 8259A-1 (the master) has a slave on IR2 */
-       if (auto_eoi)
-               outb_p(0x03, 0x21);     /* master does Auto EOI */
-       else
-               outb_p(0x01, 0x21);     /* master expects normal EOI */
-
-       outb_p(0x11, 0xA0);     /* ICW1: select 8259A-2 init */
-       outb_p(IRQ8_VECTOR, 0xA1);      /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
-       outb_p(0x02, 0xA1);     /* 8259A-2 is a slave on master's IR2 */
-       outb_p(0x01, 0xA1);     /* (slave's support for AEOI in flat mode
-                                   is to be investigated) */
-
-       if (auto_eoi)
-               /*
-                * in AEOI mode we just have to mask the interrupt
-                * when acking.
-                */
-               i8259A_chip.mask_ack = disable_8259A_irq;
-       else
-               i8259A_chip.mask_ack = mask_and_ack_8259A;
-
-       udelay(100);            /* wait for 8259A to initialize */
-
-       outb(cached_21, 0x21);  /* restore master IRQ mask */
-       outb(cached_A1, 0xA1);  /* restore slave IRQ mask */
-
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
-static void restore_ELCR(char *trigger)
-{
-       outb(trigger[0], 0x4d0);
-       outb(trigger[1], 0x4d1);
-}
-
-static void save_ELCR(char *trigger)
-{
-       /* IRQ 0,1,2,8,13 are marked as reserved */
-       trigger[0] = inb(0x4d0) & 0xF8;
-       trigger[1] = inb(0x4d1) & 0xDE;
-}
-
-static int i8259A_resume(struct sys_device *dev)
-{
-       init_8259A(i8259A_auto_eoi);
-       restore_ELCR(irq_trigger);
-       return 0;
-}
-
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
-{
-       save_ELCR(irq_trigger);
-       return 0;
-}
-
-static int i8259A_shutdown(struct sys_device *dev)
-{
-       /* Put the i8259A into a quiescent state that
-        * the kernel initialization code can get it
-        * out of.
-        */
-       outb(0xff, 0x21);       /* mask all of 8259A-1 */
-       outb(0xff, 0xA1);       /* mask all of 8259A-1 */
-       return 0;
-}
-
-static struct sysdev_class i8259_sysdev_class = {
-       set_kset_name("i8259"),
-       .suspend = i8259A_suspend,
-       .resume = i8259A_resume,
-       .shutdown = i8259A_shutdown,
-};
-
-static struct sys_device device_i8259A = {
-       .id     = 0,
-       .cls    = &i8259_sysdev_class,
-};
-
-static int __init i8259A_init_sysfs(void)
-{
-       int error = sysdev_class_register(&i8259_sysdev_class);
-       if (!error)
-               error = sysdev_register(&device_i8259A);
-       return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-       [0 ... IRQ0_VECTOR - 1] = -1,
-       [IRQ0_VECTOR] = 0,
-       [IRQ1_VECTOR] = 1,
-       [IRQ2_VECTOR] = 2,
-       [IRQ3_VECTOR] = 3,
-       [IRQ4_VECTOR] = 4,
-       [IRQ5_VECTOR] = 5,
-       [IRQ6_VECTOR] = 6,
-       [IRQ7_VECTOR] = 7,
-       [IRQ8_VECTOR] = 8,
-       [IRQ9_VECTOR] = 9,
-       [IRQ10_VECTOR] = 10,
-       [IRQ11_VECTOR] = 11,
-       [IRQ12_VECTOR] = 12,
-       [IRQ13_VECTOR] = 13,
-       [IRQ14_VECTOR] = 14,
-       [IRQ15_VECTOR] = 15,
-       [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-void __init init_ISA_irqs (void)
-{
-       int i;
-
-       init_bsp_APIC();
-       init_8259A(0);
-
-       for (i = 0; i < NR_IRQS; i++) {
-               irq_desc[i].status = IRQ_DISABLED;
-               irq_desc[i].action = NULL;
-               irq_desc[i].depth = 1;
-
-               if (i < 16) {
-                       /*
-                        * 16 old-style INTA-cycle interrupts:
-                        */
-                       set_irq_chip_and_handler_name(i, &i8259A_chip,
-                                                     handle_level_irq, "XT");
-               } else {
-                       /*
-                        * 'high' PCI IRQs filled in on demand
-                        */
-                       irq_desc[i].chip = &no_irq_chip;
-               }
-       }
-}
-
-static void setup_timer_hardware(void)
-{
-       outb_p(0x34,0x43);              /* binary, mode 2, LSB/MSB, ch 0 */
-       udelay(10);
-       outb_p(LATCH & 0xff , 0x40);    /* LSB */
-       udelay(10);
-       outb(LATCH >> 8 , 0x40);        /* MSB */
-}
-
-static int timer_resume(struct sys_device *dev)
-{
-       setup_timer_hardware();
-       return 0;
-}
-
-void i8254_timer_resume(void)
-{
-       setup_timer_hardware();
-}
-
-static struct sysdev_class timer_sysclass = {
-       set_kset_name("timer_pit"),
-       .resume         = timer_resume,
-};
-
-static struct sys_device device_timer = {
-       .id             = 0,
-       .cls            = &timer_sysclass,
-};
-
-static int __init init_timer_sysfs(void)
-{
-       int error = sysdev_class_register(&timer_sysclass);
-       if (!error)
-               error = sysdev_register(&device_timer);
-       return error;
-}
-
-device_initcall(init_timer_sysfs);
-
-void __init init_IRQ(void)
-{
-       int i;
-
-       init_ISA_irqs();
-       /*
-        * Cover the whole vector space, no vector can escape
-        * us. (some of these will be overridden and become
-        * 'special' SMP interrupts)
-        */
-       for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-               int vector = FIRST_EXTERNAL_VECTOR + i;
-               if (vector != IA32_SYSCALL_VECTOR)
-                       set_intr_gate(vector, interrupt[i]);
-       }
-
-#ifdef CONFIG_SMP
-       /*
-        * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-        * IPI, driven by wakeup.
-        */
-       set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-       /* IPIs for invalidation */
-       set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-       set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-       set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-       set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-       set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-       set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-       set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-       set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-       /* IPI for generic function call */
-       set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-       /* Low priority IPI to cleanup after moving an irq */
-       set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-#endif
-       set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-       set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
-       /* self generated IPI for local APIC timer */
-       set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-       /* IPI vectors for APIC spurious and error interrupts */
-       set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-       set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-       /*
-        * Set the clock to HZ Hz, we already have a valid
-        * vector now:
-        */
-       setup_timer_hardware();
-
-       if (!acpi_ioapic)
-               setup_irq(2, &irq2);
-}
diff --git a/arch/x86_64/kernel/init_task_64.c b/arch/x86_64/kernel/init_task_64.c
deleted file mode 100644 (file)
index 4ff33d4..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-
-static struct fs_struct init_fs = INIT_FS;
-static struct files_struct init_files = INIT_FILES;
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-struct mm_struct init_mm = INIT_MM(init_mm);
-
-EXPORT_SYMBOL(init_mm);
-
-/*
- * Initial task structure.
- *
- * We need to make sure that this is 8192-byte aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union 
-       __attribute__((__section__(".data.init_task"))) =
-               { INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-
-EXPORT_SYMBOL(init_task);
-/*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
- * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data.cacheline_aligned
- * section. Since TSS's are completely CPU-local, we want them
- * on exact cacheline boundaries, to eliminate cacheline ping-pong.
- */ 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
-/* Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-
-#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/io_apic_64.c b/arch/x86_64/kernel/io_apic_64.c
deleted file mode 100644 (file)
index 966fa10..0000000
+++ /dev/null
@@ -1,2202 +0,0 @@
-/*
- *     Intel IO-APIC support for multi-Pentium hosts.
- *
- *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
- *
- *     Many thanks to Stig Venaas for trying out countless experimental
- *     patches and reporting/debugging problems patiently!
- *
- *     (c) 1999, Multiple IO-APIC support, developed by
- *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
- *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
- *     further tested and cleaned up by Zach Brown <zab@redhat.com>
- *     and Ingo Molnar <mingo@redhat.com>
- *
- *     Fixes
- *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
- *                                     thanks to Eric Gilmore
- *                                     and Rolf G. Tews
- *                                     for testing these extensively
- *     Paul Diefenbaugh        :       Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/pci.h>
-#include <linux/mc146818rtc.h>
-#include <linux/acpi.h>
-#include <linux/sysdev.h>
-#include <linux/msi.h>
-#include <linux/htirq.h>
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-
-#include <asm/idle.h>
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/desc.h>
-#include <asm/proto.h>
-#include <asm/mach_apic.h>
-#include <asm/acpi.h>
-#include <asm/dma.h>
-#include <asm/nmi.h>
-#include <asm/msidef.h>
-#include <asm/hypertransport.h>
-
-struct irq_cfg {
-       cpumask_t domain;
-       cpumask_t old_domain;
-       unsigned move_cleanup_count;
-       u8 vector;
-       u8 move_in_progress : 1;
-};
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
-       [0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-       [1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-       [2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-       [3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-       [4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-       [5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-       [6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-       [7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-       [8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-       [9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-       [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-       [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-       [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-       [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-       [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-       [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
-};
-
-static int assign_irq_vector(int irq, cpumask_t mask);
-
-#define __apicdebuginit  __init
-
-int sis_apic_bug; /* not actually supported, dummy for compile */
-
-static int no_timer_check;
-
-static int disable_timer_pin_1 __initdata;
-
-int timer_over_8254 __initdata = 1;
-
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
-
-static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
-
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
-
-/*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
-
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
-
-static struct irq_pin_list {
-       short apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
-
-struct io_apic {
-       unsigned int index;
-       unsigned int unused[3];
-       unsigned int data;
-};
-
-static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
-{
-       return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-               + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
-}
-
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
-       struct io_apic __iomem *io_apic = io_apic_base(apic);
-       writel(reg, &io_apic->index);
-       return readl(&io_apic->data);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-       struct io_apic __iomem *io_apic = io_apic_base(apic);
-       writel(reg, &io_apic->index);
-       writel(value, &io_apic->data);
-}
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- */
-static inline void io_apic_modify(unsigned int apic, unsigned int value)
-{
-       struct io_apic __iomem *io_apic = io_apic_base(apic);
-       writel(value, &io_apic->data);
-}
-
-static int io_apic_level_ack_pending(unsigned int irq)
-{
-       struct irq_pin_list *entry;
-       unsigned long flags;
-       int pending = 0;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       entry = irq_2_pin + irq;
-       for (;;) {
-               unsigned int reg;
-               int pin;
-
-               pin = entry->pin;
-               if (pin == -1)
-                       break;
-               reg = io_apic_read(entry->apic, 0x10 + pin*2);
-               /* Is the remote IRR bit set? */
-               pending |= (reg >> 14) & 1;
-               if (!entry->next)
-                       break;
-               entry = irq_2_pin + entry->next;
-       }
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-       return pending;
-}
-
-/*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
-static inline void io_apic_sync(unsigned int apic)
-{
-       struct io_apic __iomem *io_apic = io_apic_base(apic);
-       readl(&io_apic->data);
-}
-
-#define __DO_ACTION(R, ACTION, FINAL)                                  \
-                                                                       \
-{                                                                      \
-       int pin;                                                        \
-       struct irq_pin_list *entry = irq_2_pin + irq;                   \
-                                                                       \
-       BUG_ON(irq >= NR_IRQS);                                         \
-       for (;;) {                                                      \
-               unsigned int reg;                                       \
-               pin = entry->pin;                                       \
-               if (pin == -1)                                          \
-                       break;                                          \
-               reg = io_apic_read(entry->apic, 0x10 + R + pin*2);      \
-               reg ACTION;                                             \
-               io_apic_modify(entry->apic, reg);                       \
-               FINAL;                                                  \
-               if (!entry->next)                                       \
-                       break;                                          \
-               entry = irq_2_pin + entry->next;                        \
-       }                                                               \
-}
-
-union entry_union {
-       struct { u32 w1, w2; };
-       struct IO_APIC_route_entry entry;
-};
-
-static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
-{
-       union entry_union eu;
-       unsigned long flags;
-       spin_lock_irqsave(&ioapic_lock, flags);
-       eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
-       eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-       return eu.entry;
-}
-
-/*
- * When we write a new IO APIC routing entry, we need to write the high
- * word first! If the mask bit in the low word is clear, we will enable
- * the interrupt, and we need to make sure the entry is fully populated
- * before that happens.
- */
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
-       union entry_union eu;
-       eu.entry = e;
-       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
-       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
-}
-
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
-       unsigned long flags;
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __ioapic_write_entry(apic, pin, e);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * When we mask an IO APIC routing entry, we need to write the low
- * word first, in order to set the mask bit before we change the
- * high bits!
- */
-static void ioapic_mask_entry(int apic, int pin)
-{
-       unsigned long flags;
-       union entry_union eu = { .entry.mask = 1 };
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
-       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
-{
-       int apic, pin;
-       struct irq_pin_list *entry = irq_2_pin + irq;
-
-       BUG_ON(irq >= NR_IRQS);
-       for (;;) {
-               unsigned int reg;
-               apic = entry->apic;
-               pin = entry->pin;
-               if (pin == -1)
-                       break;
-               io_apic_write(apic, 0x11 + pin*2, dest);
-               reg = io_apic_read(apic, 0x10 + pin*2);
-               reg &= ~0x000000ff;
-               reg |= vector;
-               io_apic_modify(apic, reg);
-               if (!entry->next)
-                       break;
-               entry = irq_2_pin + entry->next;
-       }
-}
-
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
-{
-       struct irq_cfg *cfg = irq_cfg + irq;
-       unsigned long flags;
-       unsigned int dest;
-       cpumask_t tmp;
-
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
-               return;
-
-       if (assign_irq_vector(irq, mask))
-               return;
-
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
-
-       /*
-        * Only the high 8 bits are valid.
-        */
-       dest = SET_APIC_LOGICAL_ID(dest);
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __target_IO_APIC_irq(irq, dest, cfg->vector);
-       irq_desc[irq].affinity = mask;
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-#endif
-
-/*
- * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
- * shared ISA-space IRQs, so we have to support them. We are super
- * fast in the common case, and fast for shared ISA-space IRQs.
- */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
-{
-       static int first_free_entry = NR_IRQS;
-       struct irq_pin_list *entry = irq_2_pin + irq;
-
-       BUG_ON(irq >= NR_IRQS);
-       while (entry->next)
-               entry = irq_2_pin + entry->next;
-
-       if (entry->pin != -1) {
-               entry->next = first_free_entry;
-               entry = irq_2_pin + entry->next;
-               if (++first_free_entry >= PIN_MAP_SIZE)
-                       panic("io_apic.c: ran out of irq_2_pin entries!");
-       }
-       entry->apic = apic;
-       entry->pin = pin;
-}
-
-
-#define DO_ACTION(name,R,ACTION, FINAL)                                        \
-                                                                       \
-       static void name##_IO_APIC_irq (unsigned int irq)               \
-       __DO_ACTION(R, ACTION, FINAL)
-
-DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
-                                               /* mask = 1 */
-DO_ACTION( __unmask,           0, &= 0xfffeffff, )
-                                               /* mask = 0 */
-
-static void mask_IO_APIC_irq (unsigned int irq)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __mask_IO_APIC_irq(irq);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void unmask_IO_APIC_irq (unsigned int irq)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __unmask_IO_APIC_irq(irq);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
-{
-       struct IO_APIC_route_entry entry;
-
-       /* Check delivery_mode to be sure we're not clearing an SMI pin */
-       entry = ioapic_read_entry(apic, pin);
-       if (entry.delivery_mode == dest_SMI)
-               return;
-       /*
-        * Disable it in the IO-APIC irq-routing table:
-        */
-       ioapic_mask_entry(apic, pin);
-}
-
-static void clear_IO_APIC (void)
-{
-       int apic, pin;
-
-       for (apic = 0; apic < nr_ioapics; apic++)
-               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
-                       clear_IO_APIC_pin(apic, pin);
-}
-
-int skip_ioapic_setup;
-int ioapic_force;
-
-static int __init parse_noapic(char *str)
-{
-       disable_ioapic_setup();
-       return 0;
-}
-early_param("noapic", parse_noapic);
-
-/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
-static int __init disable_timer_pin_setup(char *arg)
-{
-       disable_timer_pin_1 = 1;
-       return 1;
-}
-__setup("disable_timer_pin_1", disable_timer_pin_setup);
-
-static int __init setup_disable_8254_timer(char *s)
-{
-       timer_over_8254 = -1;
-       return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
-       timer_over_8254 = 2;
-       return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
-
-/*
- * Find the IRQ entry number of a certain pin.
- */
-static int find_irq_entry(int apic, int pin, int type)
-{
-       int i;
-
-       for (i = 0; i < mp_irq_entries; i++)
-               if (mp_irqs[i].mpc_irqtype == type &&
-                   (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
-                    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-                   mp_irqs[i].mpc_dstirq == pin)
-                       return i;
-
-       return -1;
-}
-
-/*
- * Find the pin to which IRQ[irq] (ISA) is connected
- */
-static int __init find_isa_irq_pin(int irq, int type)
-{
-       int i;
-
-       for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
-
-               if (test_bit(lbus, mp_bus_not_pci) &&
-                   (mp_irqs[i].mpc_irqtype == type) &&
-                   (mp_irqs[i].mpc_srcbusirq == irq))
-
-                       return mp_irqs[i].mpc_dstirq;
-       }
-       return -1;
-}
-
-static int __init find_isa_irq_apic(int irq, int type)
-{
-       int i;
-
-       for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
-
-               if (test_bit(lbus, mp_bus_not_pci) &&
-                   (mp_irqs[i].mpc_irqtype == type) &&
-                   (mp_irqs[i].mpc_srcbusirq == irq))
-                       break;
-       }
-       if (i < mp_irq_entries) {
-               int apic;
-               for(apic = 0; apic < nr_ioapics; apic++) {
-                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
-                               return apic;
-               }
-       }
-
-       return -1;
-}
-
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
-       int apic, i, best_guess = -1;
-
-       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
-               bus, slot, pin);
-       if (mp_bus_id_to_pci_bus[bus] == -1) {
-               apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
-               return -1;
-       }
-       for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].mpc_srcbus;
-
-               for (apic = 0; apic < nr_ioapics; apic++)
-                       if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
-                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
-                               break;
-
-               if (!test_bit(lbus, mp_bus_not_pci) &&
-                   !mp_irqs[i].mpc_irqtype &&
-                   (bus == lbus) &&
-                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
-
-                       if (!(apic || IO_APIC_IRQ(irq)))
-                               continue;
-
-                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
-                               return irq;
-                       /*
-                        * Use the first all-but-pin matching entry as a
-                        * best-guess fuzzy result for broken mptables.
-                        */
-                       if (best_guess < 0)
-                               best_guess = irq;
-               }
-       }
-       BUG_ON(best_guess >= NR_IRQS);
-       return best_guess;
-}
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx)       (0)
-#define default_ISA_polarity(idx)      (0)
-
-/* PCI interrupts are always polarity one level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx)       (1)
-#define default_PCI_polarity(idx)      (1)
-
-static int __init MPBIOS_polarity(int idx)
-{
-       int bus = mp_irqs[idx].mpc_srcbus;
-       int polarity;
-
-       /*
-        * Determine IRQ line polarity (high active or low active):
-        */
-       switch (mp_irqs[idx].mpc_irqflag & 3)
-       {
-               case 0: /* conforms, ie. bus-type dependent polarity */
-                       if (test_bit(bus, mp_bus_not_pci))
-                               polarity = default_ISA_polarity(idx);
-                       else
-                               polarity = default_PCI_polarity(idx);
-                       break;
-               case 1: /* high active */
-               {
-                       polarity = 0;
-                       break;
-               }
-               case 2: /* reserved */
-               {
-                       printk(KERN_WARNING "broken BIOS!!\n");
-                       polarity = 1;
-                       break;
-               }
-               case 3: /* low active */
-               {
-                       polarity = 1;
-                       break;
-               }
-               default: /* invalid */
-               {
-                       printk(KERN_WARNING "broken BIOS!!\n");
-                       polarity = 1;
-                       break;
-               }
-       }
-       return polarity;
-}
-
-static int MPBIOS_trigger(int idx)
-{
-       int bus = mp_irqs[idx].mpc_srcbus;
-       int trigger;
-
-       /*
-        * Determine IRQ trigger mode (edge or level sensitive):
-        */
-       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
-       {
-               case 0: /* conforms, ie. bus-type dependent */
-                       if (test_bit(bus, mp_bus_not_pci))
-                               trigger = default_ISA_trigger(idx);
-                       else
-                               trigger = default_PCI_trigger(idx);
-                       break;
-               case 1: /* edge */
-               {
-                       trigger = 0;
-                       break;
-               }
-               case 2: /* reserved */
-               {
-                       printk(KERN_WARNING "broken BIOS!!\n");
-                       trigger = 1;
-                       break;
-               }
-               case 3: /* level */
-               {
-                       trigger = 1;
-                       break;
-               }
-               default: /* invalid */
-               {
-                       printk(KERN_WARNING "broken BIOS!!\n");
-                       trigger = 0;
-                       break;
-               }
-       }
-       return trigger;
-}
-
-static inline int irq_polarity(int idx)
-{
-       return MPBIOS_polarity(idx);
-}
-
-static inline int irq_trigger(int idx)
-{
-       return MPBIOS_trigger(idx);
-}
-
-static int pin_2_irq(int idx, int apic, int pin)
-{
-       int irq, i;
-       int bus = mp_irqs[idx].mpc_srcbus;
-
-       /*
-        * Debugging check, we are in big trouble if this message pops up!
-        */
-       if (mp_irqs[idx].mpc_dstirq != pin)
-               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
-
-       if (test_bit(bus, mp_bus_not_pci)) {
-               irq = mp_irqs[idx].mpc_srcbusirq;
-       } else {
-               /*
-                * PCI IRQs are mapped in order
-                */
-               i = irq = 0;
-               while (i < apic)
-                       irq += nr_ioapic_registers[i++];
-               irq += pin;
-       }
-       BUG_ON(irq >= NR_IRQS);
-       return irq;
-}
-
-static int __assign_irq_vector(int irq, cpumask_t mask)
-{
-       /*
-        * NOTE! The local APIC isn't very good at handling
-        * multiple interrupts at the same interrupt level.
-        * As the interrupt level is determined by taking the
-        * vector number and shifting that right by 4, we
-        * want to spread these out a bit so that they don't
-        * all fall in the same interrupt level.
-        *
-        * Also, we've got to be careful not to trash gate
-        * 0x80, because int 0x80 is hm, kind of importantish. ;)
-        */
-       static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
-       unsigned int old_vector;
-       int cpu;
-       struct irq_cfg *cfg;
-
-       BUG_ON((unsigned)irq >= NR_IRQS);
-       cfg = &irq_cfg[irq];
-
-       /* Only try and allocate irqs on cpus that are present */
-       cpus_and(mask, mask, cpu_online_map);
-
-       if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-               return -EBUSY;
-
-       old_vector = cfg->vector;
-       if (old_vector) {
-               cpumask_t tmp;
-               cpus_and(tmp, cfg->domain, mask);
-               if (!cpus_empty(tmp))
-                       return 0;
-       }
-
-       for_each_cpu_mask(cpu, mask) {
-               cpumask_t domain, new_mask;
-               int new_cpu;
-               int vector, offset;
-
-               domain = vector_allocation_domain(cpu);
-               cpus_and(new_mask, domain, cpu_online_map);
-
-               vector = current_vector;
-               offset = current_offset;
-next:
-               vector += 8;
-               if (vector >= FIRST_SYSTEM_VECTOR) {
-                       /* If we run out of vectors on large boxen, must share them. */
-                       offset = (offset + 1) % 8;
-                       vector = FIRST_DEVICE_VECTOR + offset;
-               }
-               if (unlikely(current_vector == vector))
-                       continue;
-               if (vector == IA32_SYSCALL_VECTOR)
-                       goto next;
-               for_each_cpu_mask(new_cpu, new_mask)
-                       if (per_cpu(vector_irq, new_cpu)[vector] != -1)
-                               goto next;
-               /* Found one! */
-               current_vector = vector;
-               current_offset = offset;
-               if (old_vector) {
-                       cfg->move_in_progress = 1;
-                       cfg->old_domain = cfg->domain;
-               }
-               for_each_cpu_mask(new_cpu, new_mask)
-                       per_cpu(vector_irq, new_cpu)[vector] = irq;
-               cfg->vector = vector;
-               cfg->domain = domain;
-               return 0;
-       }
-       return -ENOSPC;
-}
-
-static int assign_irq_vector(int irq, cpumask_t mask)
-{
-       int err;
-       unsigned long flags;
-
-       spin_lock_irqsave(&vector_lock, flags);
-       err = __assign_irq_vector(irq, mask);
-       spin_unlock_irqrestore(&vector_lock, flags);
-       return err;
-}
-
-static void __clear_irq_vector(int irq)
-{
-       struct irq_cfg *cfg;
-       cpumask_t mask;
-       int cpu, vector;
-
-       BUG_ON((unsigned)irq >= NR_IRQS);
-       cfg = &irq_cfg[irq];
-       BUG_ON(!cfg->vector);
-
-       vector = cfg->vector;
-       cpus_and(mask, cfg->domain, cpu_online_map);
-       for_each_cpu_mask(cpu, mask)
-               per_cpu(vector_irq, cpu)[vector] = -1;
-
-       cfg->vector = 0;
-       cfg->domain = CPU_MASK_NONE;
-}
-
-void __setup_vector_irq(int cpu)
-{
-       /* Initialize vector_irq on a new cpu */
-       /* This function must be called with vector_lock held */
-       int irq, vector;
-
-       /* Mark the inuse vectors */
-       for (irq = 0; irq < NR_IRQS; ++irq) {
-               if (!cpu_isset(cpu, irq_cfg[irq].domain))
-                       continue;
-               vector = irq_cfg[irq].vector;
-               per_cpu(vector_irq, cpu)[vector] = irq;
-       }
-       /* Mark the free vectors */
-       for (vector = 0; vector < NR_VECTORS; ++vector) {
-               irq = per_cpu(vector_irq, cpu)[vector];
-               if (irq < 0)
-                       continue;
-               if (!cpu_isset(cpu, irq_cfg[irq].domain))
-                       per_cpu(vector_irq, cpu)[vector] = -1;
-       }
-}
-
-
-static struct irq_chip ioapic_chip;
-
-static void ioapic_register_intr(int irq, unsigned long trigger)
-{
-       if (trigger) {
-               irq_desc[irq].status |= IRQ_LEVEL;
-               set_irq_chip_and_handler_name(irq, &ioapic_chip,
-                                             handle_fasteoi_irq, "fasteoi");
-       } else {
-               irq_desc[irq].status &= ~IRQ_LEVEL;
-               set_irq_chip_and_handler_name(irq, &ioapic_chip,
-                                             handle_edge_irq, "edge");
-       }
-}
-
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
-                             int trigger, int polarity)
-{
-       struct irq_cfg *cfg = irq_cfg + irq;
-       struct IO_APIC_route_entry entry;
-       cpumask_t mask;
-
-       if (!IO_APIC_IRQ(irq))
-               return;
-
-       mask = TARGET_CPUS;
-       if (assign_irq_vector(irq, mask))
-               return;
-
-       cpus_and(mask, cfg->domain, mask);
-
-       apic_printk(APIC_VERBOSE,KERN_DEBUG
-                   "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
-                   "IRQ %d Mode:%i Active:%i)\n",
-                   apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
-                   irq, trigger, polarity);
-
-       /*
-        * add it to the IO-APIC irq-routing table:
-        */
-       memset(&entry,0,sizeof(entry));
-
-       entry.delivery_mode = INT_DELIVERY_MODE;
-       entry.dest_mode = INT_DEST_MODE;
-       entry.dest = cpu_mask_to_apicid(mask);
-       entry.mask = 0;                         /* enable IRQ */
-       entry.trigger = trigger;
-       entry.polarity = polarity;
-       entry.vector = cfg->vector;
-
-       /* Mask level triggered irqs.
-        * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-        */
-       if (trigger)
-               entry.mask = 1;
-
-       ioapic_register_intr(irq, trigger);
-       if (irq < 16)
-               disable_8259A_irq(irq);
-
-       ioapic_write_entry(apic, pin, entry);
-}
-
-static void __init setup_IO_APIC_irqs(void)
-{
-       int apic, pin, idx, irq, first_notcon = 1;
-
-       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
-       for (apic = 0; apic < nr_ioapics; apic++) {
-       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-
-               idx = find_irq_entry(apic,pin,mp_INT);
-               if (idx == -1) {
-                       if (first_notcon) {
-                               apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
-                               first_notcon = 0;
-                       } else
-                               apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
-                       continue;
-               }
-
-               irq = pin_2_irq(idx, apic, pin);
-               add_pin_to_irq(irq, apic, pin);
-
-               setup_IO_APIC_irq(apic, pin, irq,
-                                 irq_trigger(idx), irq_polarity(idx));
-       }
-       }
-
-       if (!first_notcon)
-               apic_printk(APIC_VERBOSE," not connected.\n");
-}
-
-/*
- * Set up the 8259A-master output pin as broadcast to all
- * CPUs.
- */
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
-{
-       struct IO_APIC_route_entry entry;
-       unsigned long flags;
-
-       memset(&entry,0,sizeof(entry));
-
-       disable_8259A_irq(0);
-
-       /* mask LVT0 */
-       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-
-       /*
-        * We use logical delivery to get the timer IRQ
-        * to the first CPU.
-        */
-       entry.dest_mode = INT_DEST_MODE;
-       entry.mask = 0;                                 /* unmask IRQ now */
-       entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
-       entry.delivery_mode = INT_DELIVERY_MODE;
-       entry.polarity = 0;
-       entry.trigger = 0;
-       entry.vector = vector;
-
-       /*
-        * The timer IRQ doesn't have to know that behind the
-        * scene we have a 8259A-master in AEOI mode ...
-        */
-       set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
-
-       /*
-        * Add it to the IO-APIC irq-routing table:
-        */
-       spin_lock_irqsave(&ioapic_lock, flags);
-       io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
-       io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-
-       enable_8259A_irq(0);
-}
-
-/*
- * Dump the ID, version and arbitration registers of every IO-APIC, its
- * redirection table and the IRQ-to-pin map to the kernel log.
- * Prints nothing when apic_verbosity is APIC_QUIET.
- */
-void __apicdebuginit print_IO_APIC(void)
-{
-       int apic, i;
-       union IO_APIC_reg_00 reg_00;
-       union IO_APIC_reg_01 reg_01;
-       union IO_APIC_reg_02 reg_02;
-       unsigned long flags;
-
-       if (apic_verbosity == APIC_QUIET)
-               return;
-
-       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
-       for (i = 0; i < nr_ioapics; i++)
-               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
-
-       /*
-        * We are a bit conservative about what we expect.  We have to
-        * know about every hardware change ASAP.
-        */
-       printk(KERN_INFO "testing the IO APIC.......................\n");
-
-       for (apic = 0; apic < nr_ioapics; apic++) {
-               /* Snapshot the registers under the lock, print afterwards. */
-               spin_lock_irqsave(&ioapic_lock, flags);
-               reg_00.raw = io_apic_read(apic, 0);
-               reg_01.raw = io_apic_read(apic, 1);
-               /* Register #02 is only read (and printed) for version >= 0x10. */
-               if (reg_01.bits.version >= 0x10)
-                       reg_02.raw = io_apic_read(apic, 2);
-               spin_unlock_irqrestore(&ioapic_lock, flags);
-
-               printk("\n");
-               printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
-               printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
-               printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
-
-               /* Use the union's raw view, as for registers #00 and #02. */
-               printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
-               printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
-
-               printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
-               printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
-
-               if (reg_01.bits.version >= 0x10) {
-                       printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
-                       printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
-               }
-
-               printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
-               printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
-                                 " Stat Dmod Deli Vect:   \n");
-
-               /* 'entries' holds the highest index, hence the inclusive bound. */
-               for (i = 0; i <= reg_01.bits.entries; i++) {
-                       struct IO_APIC_route_entry entry;
-
-                       entry = ioapic_read_entry(apic, i);
-
-                       printk(KERN_DEBUG " %02x %03X ",
-                               i,
-                               entry.dest
-                       );
-
-                       printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
-                               entry.mask,
-                               entry.trigger,
-                               entry.irr,
-                               entry.polarity,
-                               entry.delivery_status,
-                               entry.dest_mode,
-                               entry.delivery_mode,
-                               entry.vector
-                       );
-               }
-       }
-
-       printk(KERN_DEBUG "IRQ to pin mappings:\n");
-       for (i = 0; i < NR_IRQS; i++) {
-               struct irq_pin_list *entry = irq_2_pin + i;
-
-               /* A negative pin marks an IRQ with no IO-APIC pin attached. */
-               if (entry->pin < 0)
-                       continue;
-               printk(KERN_DEBUG "IRQ%d ", i);
-               /* Follow the chain; 'next' is an index into irq_2_pin, 0 ends it. */
-               for (;;) {
-                       printk("-> %d:%d", entry->apic, entry->pin);
-                       if (!entry->next)
-                               break;
-                       entry = irq_2_pin + entry->next;
-               }
-               printk("\n");
-       }
-
-       printk(KERN_INFO ".................................... done.\n");
-}
-
-#if 0
-
-/*
- * Print eight consecutive 32-bit local APIC registers, starting at 'base'
- * and spaced 0x10 apart, as rows of '0'/'1' characters (bit 0 first).
- * Prints nothing when apic_verbosity is APIC_QUIET.
- */
-static __apicdebuginit void print_APIC_bitfield (int base)
-{
-       unsigned int v;
-       int i, j;
-
-       if (apic_verbosity == APIC_QUIET)
-               return;
-
-       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
-       for (i = 0; i < 8; i++) {
-               v = apic_read(base + i*0x10);
-               for (j = 0; j < 32; j++) {
-                       /* Unsigned constant: 1 << 31 on a signed int is undefined. */
-                       if (v & (1U << j))
-                               printk("1");
-                       else
-                               printk("0");
-               }
-               printk("\n");
-       }
-}
-
-/*
- * Dump the local APIC registers of the CPU this runs on to the kernel log.
- * 'dummy' is unused; the signature matches what on_each_cpu() expects.
- * Prints nothing when apic_verbosity is APIC_QUIET.
- */
-void __apicdebuginit print_local_APIC(void * dummy)
-{
-       unsigned int v, maxlvt;
-
-       if (apic_verbosity == APIC_QUIET)
-               return;
-
-       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
-               smp_processor_id(), hard_smp_processor_id());
-       v = apic_read(APIC_ID);
-       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
-       v = apic_read(APIC_LVR);
-       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
-       maxlvt = get_maxlvt();
-
-       v = apic_read(APIC_TASKPRI);
-       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
-
-       v = apic_read(APIC_ARBPRI);
-       printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
-               v & APIC_ARBPRI_MASK);
-       v = apic_read(APIC_PROCPRI);
-       printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
-
-       v = apic_read(APIC_EOI);
-       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
-       v = apic_read(APIC_RRR);
-       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
-       v = apic_read(APIC_LDR);
-       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
-       v = apic_read(APIC_DFR);
-       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
-       v = apic_read(APIC_SPIV);
-       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
-
-       printk(KERN_DEBUG "... APIC ISR field:\n");
-       print_APIC_bitfield(APIC_ISR);
-       printk(KERN_DEBUG "... APIC TMR field:\n");
-       print_APIC_bitfield(APIC_TMR);
-       printk(KERN_DEBUG "... APIC IRR field:\n");
-       print_APIC_bitfield(APIC_IRR);
-
-       v = apic_read(APIC_ESR);
-       printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
-
-       v = apic_read(APIC_ICR);
-       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
-       v = apic_read(APIC_ICR2);
-       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
-
-       v = apic_read(APIC_LVTT);
-       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
-
-       if (maxlvt > 3) {                       /* PC is LVT#4. */
-               v = apic_read(APIC_LVTPC);
-               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
-       }
-       v = apic_read(APIC_LVT0);
-       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
-       v = apic_read(APIC_LVT1);
-       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
-
-       if (maxlvt > 2) {                       /* ERR is LVT#3. */
-               v = apic_read(APIC_LVTERR);
-               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
-       }
-
-       v = apic_read(APIC_TMICT);
-       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
-       v = apic_read(APIC_TMCCT);
-       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
-       v = apic_read(APIC_TDCR);
-       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
-       printk("\n");
-}
-
-/* Run print_local_APIC() on every CPU via on_each_cpu(). */
-void print_all_local_APICs (void)
-{
-       on_each_cpu(print_local_APIC, NULL, 1, 1);
-}
-
-/*
- * Dump the 8259A PIC state (the values labelled IMR, IRR, ISR and ELCR
- * below) to the kernel log.  Prints nothing when apic_verbosity is
- * APIC_QUIET.
- */
-void __apicdebuginit print_PIC(void)
-{
-       unsigned int v;
-       unsigned long flags;
-
-       if (apic_verbosity == APIC_QUIET)
-               return;
-
-       printk(KERN_DEBUG "\nprinting PIC contents\n");
-
-       /* Port accesses to the two PICs are done under i8259A_lock. */
-       spin_lock_irqsave(&i8259A_lock, flags);
-
-       /* Each 16-bit value is built from ports 0xa0/0xa1 (high byte) and 0x20/0x21 (low byte). */
-       v = inb(0xa1) << 8 | inb(0x21);
-       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
-
-       v = inb(0xa0) << 8 | inb(0x20);
-       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
-
-       /*
-        * Write 0x0b to both command ports, read back what is reported
-        * below as ISR, then write 0x0a to both ports again
-        * (presumably restoring the default IRR read-back - confirm
-        * against the 8259A datasheet).
-        */
-       outb(0x0b,0xa0);
-       outb(0x0b,0x20);
-       v = inb(0xa0) << 8 | inb(0x20);
-       outb(0x0a,0xa0);
-       outb(0x0a,0x20);
-
-       spin_unlock_irqrestore(&i8259A_lock, flags);
-
-       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
-
-       /* Ports 0x4d0/0x4d1 are read outside the lock. */
-       v = inb(0x4d1) << 8 | inb(0x4d0);
-       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
-}
-
-#endif  /*  0  */
-
-/*
- * Boot-time IO-APIC preparation: reset the irq_2_pin map, record how many
- * redirection entries each IO-APIC has, work out which pin (if any) the
- * i8259 is attached to, and finally clear all IO-APIC entries.
- */
-static void __init enable_IO_APIC(void)
-{
-       union IO_APIC_reg_01 version_reg;
-       unsigned long flags;
-       int mp_apic, mp_pin;
-       int idx, ioapic, pin;
-       int found = 0;
-
-       for (idx = 0; idx < PIN_MAP_SIZE; idx++) {
-               irq_2_pin[idx].pin = -1;
-               irq_2_pin[idx].next = 0;
-       }
-
-       /* The number of IO-APIC IRQ registers (== #pins). */
-       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
-               spin_lock_irqsave(&ioapic_lock, flags);
-               version_reg.raw = io_apic_read(ioapic, 1);
-               spin_unlock_irqrestore(&ioapic_lock, flags);
-               nr_ioapic_registers[ioapic] = version_reg.bits.entries + 1;
-       }
-
-       /*
-        * An unmasked pin in ExtINT mode is where the i8259 is connected;
-        * stop scanning at the first one.
-        */
-       for (ioapic = 0; ioapic < nr_ioapics && !found; ioapic++) {
-               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
-                       struct IO_APIC_route_entry rte;
-
-                       rte = ioapic_read_entry(ioapic, pin);
-                       if (rte.mask != 0 || rte.delivery_mode != dest_ExtINT)
-                               continue;
-
-                       ioapic_i8259.apic = ioapic;
-                       ioapic_i8259.pin  = pin;
-                       found = 1;
-                       break;
-               }
-       }
-
-       /* See whether the MP table has reported the ExtINT as well. */
-       mp_pin  = find_isa_irq_pin(0, mp_ExtINT);
-       mp_apic = find_isa_irq_apic(0, mp_ExtINT);
-
-       /* Trust the MP table if nothing is set up in the hardware. */
-       if (ioapic_i8259.pin == -1 && mp_pin >= 0) {
-               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
-               ioapic_i8259.pin  = mp_pin;
-               ioapic_i8259.apic = mp_apic;
-       }
-
-       /* Complain if the MP table and the hardware disagree. */
-       if (mp_pin >= 0 && ioapic_i8259.pin >= 0 &&
-           (ioapic_i8259.apic != mp_apic || ioapic_i8259.pin != mp_pin))
-               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
-
-       /* Do not trust the IO-APIC being empty at bootup. */
-       clear_IO_APIC();
-}
-
-/*
- * Shut IO-APIC routing down again.  Not an __init function because the
- * reboot code needs it.
- */
-void disable_IO_APIC(void)
-{
-       int i8259_routed;
-
-       /* Clear the IO-APIC before rebooting. */
-       clear_IO_APIC();
-
-       /*
-        * If the i8259 is routed through an IOAPIC, put that IOAPIC in
-        * virtual wire mode so legacy interrupts can be delivered.
-        */
-       i8259_routed = (ioapic_i8259.pin != -1);
-       if (i8259_routed) {
-               struct IO_APIC_route_entry ext_int;
-
-               /*
-                * Everything left at zero by the memset is intentional:
-                * unmasked, edge triggered, active high, physical
-                * destination mode, vector 0.
-                */
-               memset(&ext_int, 0, sizeof(ext_int));
-               ext_int.delivery_mode = dest_ExtINT;
-               ext_int.dest = GET_APIC_ID(apic_read(APIC_ID));
-
-               /* Add it to the IO-APIC irq-routing table. */
-               ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, ext_int);
-       }
-
-       disconnect_bsp_APIC(i8259_routed);
-}
-
-/*
- * Some older SMP boards have an mptable that lies about the timer IRQ.
- * The workaround:
- *
- *     - the timer IRQ defaults to the IO-APIC IRQ
- *     - if this function finds that timer IRQs are not arriving, the
- *       caller falls back to ISA timer IRQs
- *
- * Enables local interrupts, waits about ten ticks and returns 1 if
- * jiffies advanced by more than 4 in the meantime, 0 otherwise.
- */
-static int __init timer_irq_works(void)
-{
-       unsigned long start = jiffies;
-
-       local_irq_enable();
-       /* Let ten ticks pass... */
-       mdelay((10 * 1000) / HZ);
-
-       /*
-        * Expect a few ticks at least, to be sure some possible
-        * glue logic does not lock up after one or two first
-        * ticks in a non-ExtINT mode.  Also the local APIC
-        * might have cached one ExtINT interrupt.  Finally, at
-        * least one tick may be lost due to delays.
-        *
-        * The unsigned subtraction stays correct across a jiffies wrap.
-        */
-       return (jiffies - start > 4) ? 1 : 0;
-}
-
-/*
- * In the SMP+IOAPIC case it might happen that there are an unspecified
- * number of pending IRQ events unhandled. These cases are very rare,
- * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
- * better to do it this way as thus we do not have to be aware of
- * 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Starting up an edge-triggered IO-APIC interrupt is nasty: we need to
- * make sure that we get the edge.  If the line is already asserted for
- * some reason, return 1 to indicate that it was pending.
- *
- * This is not complete - we should be able to fake an edge even if it
- * isn't on the 8259A...
- */
-
-static unsigned int startup_ioapic_irq(unsigned int irq)
-{
-       unsigned long flags;
-       int pending = 0;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       if (irq < 16) {
-               /* Legacy range: take the line away from the 8259A first. */
-               disable_8259A_irq(irq);
-               pending = i8259A_irq_pending(irq) ? 1 : 0;
-       }
-       __unmask_IO_APIC_irq(irq);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-
-       return pending;
-}
-
-/*
- * Resend 'irq' by sending its vector as an IPI to the first CPU of the
- * IRQ's domain.  Always returns 1.
- */
-static int ioapic_retrigger_irq(unsigned int irq)
-{
-       struct irq_cfg *cfg = irq_cfg + irq;
-       unsigned long flags;
-       cpumask_t target;
-
-       spin_lock_irqsave(&vector_lock, flags);
-
-       /* Build a mask holding just the first CPU of the domain. */
-       cpus_clear(target);
-       cpu_set(first_cpu(cfg->domain), target);
-       send_IPI_mask(target, cfg->vector);
-
-       spin_unlock_irqrestore(&vector_lock, flags);
-
-       return 1;
-}
-
-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-
-#ifdef CONFIG_SMP
-/*
- * Handler run on a CPU that received the cleanup IPI sent by
- * irq_complete_move(): walk this CPU's vector table and release every
- * vector that belongs to an IRQ with cleanup outstanding, unless that
- * vector is the IRQ's current one and this CPU is in its domain.
- */
-asmlinkage void smp_irq_move_cleanup_interrupt(void)
-{
-       unsigned vec, this_cpu;
-
-       ack_APIC_irq();
-       exit_idle();
-       irq_enter();
-
-       this_cpu = smp_processor_id();
-       for (vec = FIRST_EXTERNAL_VECTOR; vec < NR_VECTORS; vec++) {
-               unsigned int irq = __get_cpu_var(vector_irq)[vec];
-               struct irq_desc *desc;
-               struct irq_cfg *cfg;
-
-               /* Unused slots hold -1, which is >= NR_IRQS as unsigned. */
-               if (irq >= NR_IRQS)
-                       continue;
-
-               desc = &irq_desc[irq];
-               cfg = &irq_cfg[irq];
-
-               spin_lock(&desc->lock);
-               if (cfg->move_cleanup_count &&
-                   !(vec == cfg->vector && cpu_isset(this_cpu, cfg->domain))) {
-                       __get_cpu_var(vector_irq)[vec] = -1;
-                       cfg->move_cleanup_count--;
-               }
-               spin_unlock(&desc->lock);
-       }
-
-       irq_exit();
-}
-
-/*
- * Called from the ack paths.  If a move of 'irq' is in progress and the
- * interrupt arrived on its current vector on a CPU of its current domain,
- * send the cleanup IPI to the online CPUs of the old domain and mark the
- * move as finished.
- */
-static void irq_complete_move(unsigned int irq)
-{
-       struct irq_cfg *cfg = &irq_cfg[irq];
-       cpumask_t stale_cpus;
-       unsigned vec, this_cpu;
-
-       if (likely(!cfg->move_in_progress))
-               return;
-
-       /* The vector is recovered by complementing the saved orig_rax. */
-       vec = ~get_irq_regs()->orig_rax;
-       this_cpu = smp_processor_id();
-       if (vec != cfg->vector || !cpu_isset(this_cpu, cfg->domain))
-               return;
-
-       cpus_and(stale_cpus, cfg->old_domain, cpu_online_map);
-       cfg->move_cleanup_count = cpus_weight(stale_cpus);
-       send_IPI_mask(stale_cpus, IRQ_MOVE_CLEANUP_VECTOR);
-       cfg->move_in_progress = 0;
-}
-#else
-static inline void irq_complete_move(unsigned int irq) {}
-#endif
-
-/*
- * .ack handler of ioapic_chip: finish a pending vector move, perform any
- * pending IRQ migration, then acknowledge the interrupt at the local APIC.
- */
-static void ack_apic_edge(unsigned int irq)
-{
-       irq_complete_move(irq);
-       move_native_irq(irq);
-       ack_APIC_irq();
-}
-
-/*
- * .eoi handler of ioapic_chip.  Acknowledges the interrupt at the local
- * APIC; when an IRQ move is pending, the pin is masked around the ack and
- * the move is only carried out if the IO-APIC no longer reports the
- * level ack as pending.
- */
-static void ack_apic_level(unsigned int irq)
-{
-       int do_unmask_irq = 0;
-
-       irq_complete_move(irq);
-#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
-       /* If we are moving the irq we need to mask it */
-       if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
-               do_unmask_irq = 1;
-               mask_IO_APIC_irq(irq);
-       }
-#endif
-
-       /*
-        * We must acknowledge the irq before we move it or the acknowledge will
-        * not propagate properly.
-        */
-       ack_APIC_irq();
-
-       /* Now we can move and re-enable the irq */
-       if (unlikely(do_unmask_irq)) {
-               /* Only migrate the irq if the ack has been received.
-                *
-                * On rare occasions the broadcast level triggered ack gets
-                * delayed going to ioapics, and if we reprogram the
-                * vector while Remote IRR is still set the irq will never
-                * fire again.
-                *
-                * To prevent this scenario we read the Remote IRR bit
-                * of the ioapic.  This has two effects.
-                * - On any sane system the read of the ioapic will
-                *   flush writes (and acks) going to the ioapic from
-                *   this cpu.
-                * - We get to see if the ACK has actually been delivered.
-                *
-                * Based on failed experiments of reprogramming the
-                * ioapic entry from outside of irq context starting
-                * with masking the ioapic entry and then polling until
-                * Remote IRR was clear before reprogramming the
-                * ioapic I don't trust the Remote IRR bit to be
-                * completely accurate.
-                *
-                * However there appears to be no other way to plug
-                * this race, so if the Remote IRR bit is not
-                * accurate and is causing problems then it is a hardware bug
-                * and you can go talk to the chipset vendor about it.
-                */
-               if (!io_apic_level_ack_pending(irq))
-                       move_masked_irq(irq);
-               unmask_IO_APIC_irq(irq);
-       }
-}
-
-/*
- * irq_chip operations for interrupts routed through the IO-APIC.
- * .ack is the edge-style acknowledge, .eoi the level-style one;
- * affinity changes are only available on SMP builds.
- */
-static struct irq_chip ioapic_chip __read_mostly = {
-       .name           = "IO-APIC",
-       .startup        = startup_ioapic_irq,
-       .mask           = mask_IO_APIC_irq,
-       .unmask         = unmask_IO_APIC_irq,
-       .ack            = ack_apic_edge,
-       .eoi            = ack_apic_level,
-#ifdef CONFIG_SMP
-       .set_affinity   = set_ioapic_affinity_irq,
-#endif
-       .retrigger      = ioapic_retrigger_irq,
-};
-
-/*
- * Give every IO-APIC IRQ that did not get a vector a fallback: the
- * legacy 8259A for IRQs below 16, no_irq_chip for anything else.
- */
-static inline void init_IO_APIC_traps(void)
-{
-       int irq;
-
-       /*
-        * NOTE! The local APIC isn't very good at handling
-        * multiple interrupts at the same interrupt level.
-        * As the interrupt level is determined by taking the
-        * vector number and shifting that right by 4, we
-        * want to spread these out a bit so that they don't
-        * all fall in the same interrupt level.
-        *
-        * Also, we've got to be careful not to trash gate
-        * 0x80, because int 0x80 is hm, kind of importantish. ;)
-        */
-       for (irq = 0; irq < NR_IRQS; irq++) {
-               /* Nothing to do unless this is a vector-less IO-APIC IRQ. */
-               if (!IO_APIC_IRQ(irq) || irq_cfg[irq].vector)
-                       continue;
-
-               /*
-                * Hmm.. We don't have an entry for this, so default to
-                * an old-fashioned 8259 interrupt if we can..
-                */
-               if (irq < 16) {
-                       make_8259A_irq(irq);
-               } else {
-                       /* Strange. Oh, well.. */
-                       irq_desc[irq].chip = &no_irq_chip;
-               }
-       }
-}
-
-/* Unmask LVT0 of the local APIC; 'irq' is not used. */
-static void enable_lapic_irq (unsigned int irq)
-{
-       unsigned long lvt0 = apic_read(APIC_LVT0);
-
-       apic_write(APIC_LVT0, lvt0 & ~APIC_LVT_MASKED);
-}
-
-/* Mask LVT0 of the local APIC; 'irq' is not used. */
-static void disable_lapic_irq (unsigned int irq)
-{
-       unsigned long lvt0 = apic_read(APIC_LVT0);
-
-       apic_write(APIC_LVT0, lvt0 | APIC_LVT_MASKED);
-}
-
-/* .ack handler of lapic_irq_type: acknowledge at the local APIC; 'irq' is not used. */
-static void ack_lapic_irq (unsigned int irq)
-{
-       ack_APIC_irq();
-}
-
-/* .end handler of lapic_irq_type: intentionally empty. */
-static void end_lapic_irq (unsigned int i)
-{
-       /* nothing */
-}
-
-/*
- * Interrupt type for a timer delivered through the local APIC's LVT0;
- * check_timer() installs it for IRQ0 in its "Virtual Wire" fallback.
- */
-static struct hw_interrupt_type lapic_irq_type __read_mostly = {
-       .name = "local-APIC",
-       .typename = "local-APIC-edge",
-       .startup = NULL, /* startup_irq() not used for IRQ0 */
-       .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
-       .enable = enable_lapic_irq,
-       .disable = disable_lapic_irq,
-       .ack = ack_lapic_irq,
-       .end = end_lapic_irq,
-};
-
-/* Enable the NMI watchdog through LVT0 and log the activation. */
-static void setup_nmi (void)
-{
-       /*
-        * Dirty trick to enable the NMI watchdog ...
-        * We put the 8259A master into AEOI mode and
-        * unmask on all local APICs LVT0 as NMI.
-        *
-        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
-        * is from Maciej W. Rozycki - so we do not have to EOI from
-        * the NMI handler or the timer interrupt.
-        */
-       printk(KERN_INFO "activating NMI Watchdog ...");
-
-       enable_NMI_through_LVT0(NULL);
-
-       printk(" done.\n");
-}
-
-/*
- * This looks a bit hackish but it's about the only one way of sending
- * a few INTA cycles to 8259As and any associated glue logic.  ICR does
- * not support the ExtINT mode, unfortunately.  We need to send these
- * cycles as some i82489DX-based boards have glue logic that keeps the
- * 8259A interrupt line asserted until INTA.  --macro
- *
- * The pin that ISA IRQ8 is routed to is temporarily reprogrammed as an
- * unmasked ExtINT entry while the RTC periodic interrupt is enabled;
- * afterwards the RTC registers and the original routing entry are
- * restored.  Does nothing if IRQ8 has no IO-APIC pin.
- */
-static inline void unlock_ExtINT_logic(void)
-{
-       int apic, pin, i;
-       struct IO_APIC_route_entry entry0, entry1;
-       unsigned char save_control, save_freq_select;
-
-       pin  = find_isa_irq_pin(8, mp_INT);
-       apic = find_isa_irq_apic(8, mp_INT);
-       if (pin == -1)
-               return;
-
-       /*
-        * Save the current routing entry.  The entry helpers are used
-        * instead of open-coded register accesses, like everywhere else
-        * in this file; they are called without ioapic_lock held and
-        * presumably take it themselves.
-        */
-       entry0 = ioapic_read_entry(apic, pin);
-       clear_IO_APIC_pin(apic, pin);
-
-       memset(&entry1, 0, sizeof(entry1));
-
-       entry1.dest_mode = 0;                   /* physical delivery */
-       entry1.mask = 0;                        /* unmask IRQ now */
-       entry1.dest = hard_smp_processor_id();
-       entry1.delivery_mode = dest_ExtINT;
-       entry1.polarity = entry0.polarity;
-       entry1.trigger = 0;
-       entry1.vector = 0;
-
-       ioapic_write_entry(apic, pin, entry1);
-
-       /* Turn on the RTC periodic interrupt, remembering the old setup. */
-       save_control = CMOS_READ(RTC_CONTROL);
-       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
-       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
-                  RTC_FREQ_SELECT);
-       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
-
-       /*
-        * Poll every 10ms; each poll that sees the periodic flag set
-        * knocks an extra 10 off the countdown.
-        */
-       i = 100;
-       while (i-- > 0) {
-               mdelay(10);
-               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
-                       i -= 10;
-       }
-
-       CMOS_WRITE(save_control, RTC_CONTROL);
-       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-       clear_IO_APIC_pin(apic, pin);
-
-       /* Put the original routing entry back. */
-       ioapic_write_entry(apic, pin, entry0);
-}
-
-/*
- * This code may look a bit paranoid, but it's supposed to cooperate with
- * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
- * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
- * fanatically on his truly buggy board.
- *
- * Tries, in order, to get the timer (IRQ0) ticking:
- *   1. through the IO-APIC pin the MP table names for ISA IRQ0,
- *   2. through the pin the i8259 is attached to (ExtINT),
- *   3. as a local APIC "Virtual Wire" interrupt,
- *   4. as a plain ExtINT interrupt,
- * and panics if none of them works.
- *
- * FIXME: really need to revamp this for modern platforms only.
- */
-static inline void check_timer(void)
-{
-       struct irq_cfg *cfg = irq_cfg + 0;
-       int apic1, pin1, apic2, pin2;
-
-       /*
-        * get/set the timer IRQ vector:
-        */
-       disable_8259A_irq(0);
-       assign_irq_vector(0, TARGET_CPUS);
-
-       /*
-        * Subtle, code in do_timer_interrupt() expects an AEOI
-        * mode for the 8259A whenever interrupts are routed
-        * through I/O APICs.  Also IRQ0 has to be enabled in
-        * the 8259A which implies the virtual wire has to be
-        * disabled in the local APIC.
-        */
-       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-       init_8259A(1);
-       if (timer_over_8254 > 0)
-               enable_8259A_irq(0);
-
-       pin1  = find_isa_irq_pin(0, mp_INT);
-       apic1 = find_isa_irq_apic(0, mp_INT);
-       pin2  = ioapic_i8259.pin;
-       apic2 = ioapic_i8259.apic;
-
-       apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-               cfg->vector, apic1, pin1, apic2, pin2);
-
-       if (pin1 != -1) {
-               /*
-                * Ok, does IRQ0 through the IOAPIC work?
-                */
-               unmask_IO_APIC_irq(0);
-               if (!no_timer_check && timer_irq_works()) {
-                       nmi_watchdog_default();
-                       if (nmi_watchdog == NMI_IO_APIC) {
-                               disable_8259A_irq(0);
-                               setup_nmi();
-                               enable_8259A_irq(0);
-                       }
-                       /*
-                        * pin1 was located on apic1, so clear it there
-                        * rather than assuming IO-APIC #0.
-                        */
-                       if (disable_timer_pin_1 > 0)
-                               clear_IO_APIC_pin(apic1, pin1);
-                       return;
-               }
-               clear_IO_APIC_pin(apic1, pin1);
-               apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
-                               "connected to IO-APIC\n");
-       }
-
-       apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
-                               "through the 8259A ... ");
-       if (pin2 != -1) {
-               apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
-                       apic2, pin2);
-               /*
-                * legacy devices should be connected to IO APIC #0
-                */
-               setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
-               if (timer_irq_works()) {
-                       apic_printk(APIC_VERBOSE," works.\n");
-                       nmi_watchdog_default();
-                       if (nmi_watchdog == NMI_IO_APIC) {
-                               setup_nmi();
-                       }
-                       return;
-               }
-               /*
-                * Cleanup, just in case ...
-                */
-               clear_IO_APIC_pin(apic2, pin2);
-       }
-       apic_printk(APIC_VERBOSE," failed.\n");
-
-       if (nmi_watchdog == NMI_IO_APIC) {
-               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-               nmi_watchdog = 0;
-       }
-
-       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
-
-       disable_8259A_irq(0);
-       irq_desc[0].chip = &lapic_irq_type;
-       apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
-       enable_8259A_irq(0);
-
-       if (timer_irq_works()) {
-               apic_printk(APIC_VERBOSE," works.\n");
-               return;
-       }
-       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
-       apic_printk(APIC_VERBOSE," failed.\n");
-
-       apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
-
-       init_8259A(0);
-       make_8259A_irq(0);
-       apic_write(APIC_LVT0, APIC_DM_EXTINT);
-
-       unlock_ExtINT_logic();
-
-       if (timer_irq_works()) {
-               apic_printk(APIC_VERBOSE," works.\n");
-               return;
-       }
-       apic_printk(APIC_VERBOSE," failed :(.\n");
-       panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
-}
-
-/*
- * Handler for the "no_timer_check" boot parameter: sets no_timer_check,
- * which makes check_timer() skip its first attempt (IRQ0 through the
- * MP-table IO-APIC pin).  's' is ignored.
- */
-static int __init notimercheck(char *s)
-{
-       no_timer_check = 1;
-       return 1;
-}
-__setup("no_timer_check", notimercheck);
-
-/*
- *
- * IRQs that are handled by the PIC in the MPS IOAPIC case.
- * - IRQ2 is the cascade IRQ, and cannot be an IO-APIC IRQ.
- *   Linux doesn't really care, as it's not actually used
- *   for any interrupt handling anyway.
- */
-#define PIC_IRQS       (1<<2)
-
-/*
- * Boot-time entry point: enable the IO-APICs, decide which IRQs are
- * routed through them, program the routing entries, install fallbacks
- * for vector-less IRQs and verify that the timer interrupt works.
- */
-void __init setup_IO_APIC(void)
-{
-       enable_IO_APIC();
-
-       /*
-        * With ACPI all IRQs go through the IOAPIC; otherwise the
-        * PIC_IRQS (the cascade IRQ2) stay with the PIC.
-        */
-       io_apic_irqs = acpi_ioapic ? ~0 : ~PIC_IRQS;
-
-       apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
-
-       sync_Arb_IDs();
-       setup_IO_APIC_irqs();
-       init_IO_APIC_traps();
-       check_timer();
-
-       /* The register dump is only produced in the non-ACPI case. */
-       if (!acpi_ioapic)
-               print_IO_APIC();
-}
-
-/*
- * Per-IO-APIC suspend/resume state: the sysdev plus a trailing array
- * holding one saved routing entry per pin.  Instances are allocated
- * with room for nr_ioapic_registers[] entries in ioapic_init_sysfs().
- */
-struct sysfs_ioapic_data {
-       struct sys_device dev;
-       struct IO_APIC_route_entry entry[];     /* C99 flexible array member */
-};
-/* Indexed by IO-APIC number; NULL if allocation or registration failed. */
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
-
-/*
- * sysdev suspend hook: save every routing entry of IO-APIC dev->id into
- * the trailing array of its sysfs_ioapic_data.  'state' is not used.
- * Always returns 0.
- */
-static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
-{
-       struct sysfs_ioapic_data *saved;
-       int pin;
-
-       saved = container_of(dev, struct sysfs_ioapic_data, dev);
-       for (pin = 0; pin < nr_ioapic_registers[dev->id]; pin++)
-               saved->entry[pin] = ioapic_read_entry(dev->id, pin);
-
-       return 0;
-}
-
-/*
- * sysdev resume hook: restore the IO-APIC's ID if it no longer matches
- * the MP table value, then write back all routing entries saved by
- * ioapic_suspend().  Always returns 0.
- */
-static int ioapic_resume(struct sys_device *dev)
-{
-       struct sysfs_ioapic_data *saved;
-       struct IO_APIC_route_entry *rte;
-       union IO_APIC_reg_00 id_reg;
-       unsigned long flags;
-       int pin;
-
-       saved = container_of(dev, struct sysfs_ioapic_data, dev);
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       id_reg.raw = io_apic_read(dev->id, 0);
-       if (id_reg.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-               id_reg.bits.ID = mp_ioapics[dev->id].mpc_apicid;
-               io_apic_write(dev->id, 0, id_reg.raw);
-       }
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-
-       rte = saved->entry;
-       for (pin = 0; pin < nr_ioapic_registers[dev->id]; pin++, rte++)
-               ioapic_write_entry(dev->id, pin, *rte);
-
-       return 0;
-}
-
-/* sysdev class "ioapic": hooks IO-APIC state save/restore into suspend/resume. */
-static struct sysdev_class ioapic_sysdev_class = {
-       set_kset_name("ioapic"),
-       .suspend = ioapic_suspend,
-       .resume = ioapic_resume,
-};
-
-/*
- * Register the "ioapic" sysdev class and one sysdev per IO-APIC, each
- * with zeroed storage for its routing entries.
- *
- * Returns the error from sysdev_class_register() if that fails, 0
- * otherwise.  A failure for an individual IO-APIC is only logged; that
- * IO-APIC's mp_ioapic_data[] slot is left NULL and the others are still
- * set up.
- */
-static int __init ioapic_init_sysfs(void)
-{
-       struct sys_device * dev;
-       int i, size, error;
-
-       error = sysdev_class_register(&ioapic_sysdev_class);
-       if (error)
-               return error;
-
-       for (i = 0; i < nr_ioapics; i++ ) {
-               size = sizeof(struct sys_device) + nr_ioapic_registers[i]
-                       * sizeof(struct IO_APIC_route_entry);
-               /* kzalloc() replaces the former kmalloc() + memset() pair. */
-               mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
-               if (!mp_ioapic_data[i]) {
-                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
-                       continue;
-               }
-               dev = &mp_ioapic_data[i]->dev;
-               dev->id = i;
-               dev->cls = &ioapic_sysdev_class;
-               error = sysdev_register(dev);
-               if (error) {
-                       kfree(mp_ioapic_data[i]);
-                       mp_ioapic_data[i] = NULL;
-                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
-               }
-       }
-
-       return 0;
-}
-
-device_initcall(ioapic_init_sysfs);
-
-/*
- * Dynamic irq allocation and deallocation
- */
-
-/*
- * Allocate an unused irq.  Searches downwards from NR_IRQS - 1 for the
- * first non-legacy IRQ without a vector and tries to assign one to it.
- * Returns the irq number on success, -ENOSPC if there is no candidate or
- * the vector assignment for the first candidate fails.
- */
-int create_irq(void)
-{
-       unsigned long flags;
-       int candidate;
-       int irq = -ENOSPC;
-
-       spin_lock_irqsave(&vector_lock, flags);
-       for (candidate = NR_IRQS - 1; candidate >= 0; candidate--) {
-               if (platform_legacy_irq(candidate) ||
-                   irq_cfg[candidate].vector != 0)
-                       continue;
-
-               /* Only the first free candidate is tried. */
-               if (__assign_irq_vector(candidate, TARGET_CPUS) == 0)
-                       irq = candidate;
-               break;
-       }
-       spin_unlock_irqrestore(&vector_lock, flags);
-
-       if (irq >= 0)
-               dynamic_irq_init(irq);
-
-       return irq;
-}
-
-/*
- * Release an irq obtained from create_irq(): run the generic dynamic-irq
- * cleanup, then clear its vector assignment under vector_lock.
- */
-void destroy_irq(unsigned int irq)
-{
-       unsigned long flags;
-
-       dynamic_irq_cleanup(irq);
-
-       spin_lock_irqsave(&vector_lock, flags);
-       __clear_irq_vector(irq);
-       spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-/*
- * MSI message composition
- */
-#ifdef CONFIG_PCI_MSI
-/*
- * Assign a vector to 'irq' and fill in 'msg' with the MSI address/data
- * pair that delivers it.  Returns 0 on success; on failure returns the
- * error from assign_irq_vector() and leaves 'msg' untouched.
- * 'pdev' is not used.
- */
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
-{
-       struct irq_cfg *cfg = &irq_cfg[irq];
-       cpumask_t cpus;
-       unsigned dest;
-       int err;
-
-       cpus = TARGET_CPUS;
-       err = assign_irq_vector(irq, cpus);
-       if (err)
-               return err;
-
-       /* Destination: the target CPUs that are part of the IRQ's domain. */
-       cpus_and(cpus, cfg->domain, cpus);
-       dest = cpu_mask_to_apicid(cpus);
-
-       msg->address_hi = MSI_ADDR_BASE_HI;
-       msg->address_lo =
-               MSI_ADDR_BASE_LO |
-               ((INT_DEST_MODE == 0) ?
-                       MSI_ADDR_DEST_MODE_PHYSICAL:
-                       MSI_ADDR_DEST_MODE_LOGICAL) |
-               ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                       MSI_ADDR_REDIRECTION_CPU:
-                       MSI_ADDR_REDIRECTION_LOWPRI) |
-               MSI_ADDR_DEST_ID(dest);
-
-       msg->data =
-               MSI_DATA_TRIGGER_EDGE |
-               MSI_DATA_LEVEL_ASSERT |
-               ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                       MSI_DATA_DELIVERY_FIXED:
-                       MSI_DATA_DELIVERY_LOWPRI) |
-               MSI_DATA_VECTOR(cfg->vector);
-
-       return 0;
-}
-
-#ifdef CONFIG_SMP
-/*
- * Retarget an MSI irq to @mask: pick a vector valid for the new cpus, then
- * rewrite only the vector and destination-id fields of the existing message.
- */
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-       struct irq_cfg *cfg = &irq_cfg[irq];
-       struct msi_msg msg;
-       cpumask_t cpus;
-       unsigned int apicid;
-
-       /* Silently ignore a mask without any online cpu, or a vector failure */
-       cpus_and(cpus, mask, cpu_online_map);
-       if (cpus_empty(cpus) || assign_irq_vector(irq, mask))
-               return;
-
-       cpus_and(cpus, cfg->domain, mask);
-       apicid = cpu_mask_to_apicid(cpus);
-
-       read_msi_msg(irq, &msg);
-
-       msg.data = (msg.data & ~MSI_DATA_VECTOR_MASK) |
-                  MSI_DATA_VECTOR(cfg->vector);
-       msg.address_lo = (msg.address_lo & ~MSI_ADDR_DEST_ID_MASK) |
-                        MSI_ADDR_DEST_ID(apicid);
-
-       write_msi_msg(irq, &msg);
-       irq_desc[irq].affinity = mask;
-}
-#endif /* CONFIG_SMP */
-
-/*
- * irq_chip used for PCI/PCI-X/PCI-Express devices that implement the
- * MSI or MSI-X Capability Structure.  Affinity changes are only wired
- * up on SMP builds.
- */
-static struct irq_chip msi_chip = {
-       .name           = "PCI-MSI",
-       .ack            = ack_apic_edge,
-       .mask           = mask_msi_irq,
-       .unmask         = unmask_msi_irq,
-       .retrigger      = ioapic_retrigger_irq,
-#ifdef CONFIG_SMP
-       .set_affinity   = set_msi_irq_affinity,
-#endif
-};
-
-/*
- * Allocate a fresh irq for @desc, program the device with the composed
- * MSI message and install msi_chip with the edge handler.
- * Returns 0 on success or a negative error; on a compose failure the
- * irq is destroyed again.
- */
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-{
-       struct msi_msg msg;
-       int err;
-       int irq = create_irq();
-
-       if (irq < 0)
-               return irq;
-
-       err = msi_compose_msg(dev, irq, &msg);
-       if (err < 0) {
-               destroy_irq(irq);
-               return err;
-       }
-
-       set_irq_msi(irq, desc);
-       write_msi_msg(irq, &msg);
-       set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
-
-       return 0;
-}
-
-/* Release an irq obtained through arch_setup_msi_irq(); just destroy_irq(). */
-void arch_teardown_msi_irq(unsigned int irq)
-{
-       destroy_irq(irq);
-}
-
-#endif /* CONFIG_PCI_MSI */
-
-/*
- * Hypertransport interrupt support
- */
-#ifdef CONFIG_HT_IRQ
-
-#ifdef CONFIG_SMP
-
-/*
- * Read the current HT irq message for @irq, replace its vector and
- * destination-id fields with @vector / @dest, and write it back.
- */
-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
-{
-       struct ht_irq_msg msg;
-
-       fetch_ht_irq_msg(irq, &msg);
-
-       /* Low word: vector plus the low destination bits */
-       msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
-       msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
-
-       /* High word: the high destination bits */
-       msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-       msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
-       write_ht_irq_msg(irq, &msg);
-}
-
-/*
- * Retarget a HyperTransport irq to @mask.  Does nothing when @mask holds
- * no online cpu or when no vector can be assigned.
- */
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-       struct irq_cfg *cfg = &irq_cfg[irq];
-       cpumask_t cpus;
-       unsigned int apicid;
-
-       cpus_and(cpus, mask, cpu_online_map);
-       if (cpus_empty(cpus) || assign_irq_vector(irq, mask))
-               return;
-
-       /* Only cpus inside the vector's domain may be targeted */
-       cpus_and(cpus, cfg->domain, mask);
-       apicid = cpu_mask_to_apicid(cpus);
-
-       target_ht_irq(irq, apicid, cfg->vector);
-       irq_desc[irq].affinity = mask;
-}
-#endif
-
-/* irq_chip for HyperTransport irqs; affinity changes only on SMP builds. */
-static struct irq_chip ht_irq_chip = {
-       .name           = "PCI-HT",
-       .ack            = ack_apic_edge,
-       .mask           = mask_ht_irq,
-       .unmask         = unmask_ht_irq,
-       .retrigger      = ioapic_retrigger_irq,
-#ifdef CONFIG_SMP
-       .set_affinity   = set_ht_irq_affinity,
-#endif
-};
-
-/*
- * Assign a vector to the HyperTransport irq @irq, write its initial
- * message (with HT_IRQ_LOW_IRQ_MASKED set) and install ht_irq_chip with
- * the edge handler.  Returns 0 or the error from assign_irq_vector().
- * @dev is not used here.
- */
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
-{
-       struct irq_cfg *cfg = &irq_cfg[irq];
-       struct ht_irq_msg msg;
-       cpumask_t cpus = TARGET_CPUS;
-       unsigned int apicid;
-       int rc;
-
-       rc = assign_irq_vector(irq, cpus);
-       if (rc)
-               return rc;
-
-       cpus_and(cpus, cfg->domain, cpus);
-       apicid = cpu_mask_to_apicid(cpus);
-
-       msg.address_hi = HT_IRQ_HIGH_DEST_ID(apicid);
-
-       msg.address_lo =
-               HT_IRQ_LOW_BASE |
-               HT_IRQ_LOW_DEST_ID(apicid) |
-               HT_IRQ_LOW_VECTOR(cfg->vector) |
-               ((INT_DEST_MODE != 0) ?
-                       HT_IRQ_LOW_DM_LOGICAL :
-                       HT_IRQ_LOW_DM_PHYSICAL) |
-               HT_IRQ_LOW_RQEOI_EDGE |
-               ((INT_DELIVERY_MODE == dest_LowestPrio) ?
-                       HT_IRQ_LOW_MT_ARBITRATED :
-                       HT_IRQ_LOW_MT_FIXED) |
-               HT_IRQ_LOW_IRQ_MASKED;
-
-       write_ht_irq_msg(irq, &msg);
-
-       set_irq_chip_and_handler_name(irq, &ht_irq_chip,
-                                     handle_edge_irq, "edge");
-       return 0;
-}
-#endif /* CONFIG_HT_IRQ */
-
-/* --------------------------------------------------------------------------
-                          ACPI-based IOAPIC Configuration
-   -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-#define IO_APIC_MAX_ID         0xFE
-
-/*
- * Read register 1 of the given IO-APIC under ioapic_lock and return its
- * "entries" bit-field.
- */
-int __init io_apic_get_redir_entries (int ioapic)
-{
-       union IO_APIC_reg_01 reg;
-       unsigned long irqflags;
-
-       spin_lock_irqsave(&ioapic_lock, irqflags);
-       reg.raw = io_apic_read(ioapic, 1);
-       spin_unlock_irqrestore(&ioapic_lock, irqflags);
-
-       return reg.bits.entries;
-}
-
-
-/*
- * Route @irq to @pin of IO-APIC @ioapic with the given trigger mode and
- * polarity.  Returns -EINVAL when IO_APIC_IRQ() rejects @irq, 0 otherwise.
- */
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
-{
-       if (!IO_APIC_IRQ(irq)) {
-               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-                       ioapic);
-               return -EINVAL;
-       }
-
-       /* The first 16 irqs already have their irq_2_pin[] entries */
-       if (irq > 15)
-               add_pin_to_irq(irq, ioapic, pin);
-
-       setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
-       return 0;
-}
-
-#endif /* CONFIG_ACPI */
-
-
-/*
- * This function currently is only a helper for the i386 smp boot process where
- * we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
- */
-#ifdef CONFIG_SMP
-void __init setup_ioapic_dest(void)
-{
-       int ioapic, pin;
-
-       if (skip_ioapic_setup == 1)
-               return;
-
-       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
-               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
-                       int irq;
-                       int irq_entry = find_irq_entry(ioapic, pin, mp_INT);
-
-                       if (irq_entry == -1)
-                               continue;
-                       irq = pin_2_irq(irq_entry, ioapic, pin);
-
-                       /* Already has a vector: only the destination needs updating */
-                       if (irq_cfg[irq].vector) {
-                               set_ioapic_affinity_irq(irq, TARGET_CPUS);
-                               continue;
-                       }
-
-                       /*
-                        * setup_IO_APIC_irqs could fail to get vector for some
-                        * device when you have too many devices, because at that
-                        * time only boot cpu is online.  Retry the setup now.
-                        */
-                       setup_IO_APIC_irq(ioapic, pin, irq,
-                                         irq_trigger(irq_entry),
-                                         irq_polarity(irq_entry));
-               }
-       }
-}
-#endif
diff --git a/arch/x86_64/kernel/ioport_64.c b/arch/x86_64/kernel/ioport_64.c
deleted file mode 100644 (file)
index 653efa3..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- *     linux/arch/x86_64/kernel/ioport.c
- *
- * This contains the io-permission bitmap code - written by obz, with changes
- * by Linus.
- */
-
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/ioport.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/thread_info.h>
-#include <linux/syscalls.h>
-
-/*
- * Set (new_value != 0) or clear (new_value == 0) EXTENT bits in BITMAP,
- * starting at bit BASE.
- *
- * Bits are set with the non-atomic __set_bit() and cleared with
- * clear_bit(), exactly as before.
- */
-static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
-{
-       /* unsigned, to match the type of the base + extent bound */
-       unsigned int i;
-
-       if (new_value) {
-               for (i = base; i < base + extent; i++)
-                       __set_bit(i, bitmap);
-       } else {
-               for (i = base; i < base + extent; i++)
-                       clear_bit(i, bitmap);
-       }
-}
-
-/*
- * this changes the io permissions bitmap in the current task.
- *
- * @from:    first port of the range
- * @num:     number of ports in the range
- * @turn_on: non-zero clears the bits for the range, zero sets them
- *           (set_bitmap() is called with !turn_on)
- *
- * Returns 0 on success, -EINVAL for an empty, wrapping or out-of-range
- * request, -EPERM when turning ports on without CAP_SYS_RAWIO, and
- * -ENOMEM when the per-thread bitmap cannot be allocated.
- */
-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
-{
-       unsigned int i, max_long, bytes, bytes_updated;
-       struct thread_struct * t = &current->thread;
-       struct tss_struct * tss;
-       unsigned long *bitmap;
-
-       /* "from + num <= from" catches both num == 0 and arithmetic wrap */
-       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
-               return -EINVAL;
-       if (turn_on && !capable(CAP_SYS_RAWIO))
-               return -EPERM;
-
-       /*
-        * If it's the first ioperm() call in this thread's lifetime, set the
-        * IO bitmap up. ioperm() is much less timing critical than clone(),
-        * this is why we delay this operation until now:
-        */
-       if (!t->io_bitmap_ptr) {
-               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-               if (!bitmap)
-                       return -ENOMEM;
-
-               /* A fresh bitmap starts with every bit set */
-               memset(bitmap, 0xff, IO_BITMAP_BYTES);
-               t->io_bitmap_ptr = bitmap;
-               set_thread_flag(TIF_IO_BITMAP);
-       }
-
-       /*
-        * do it in the per-thread copy and in the TSS ...
-        *
-        * Disable preemption via get_cpu() - we must not switch away
-        * because the ->io_bitmap_max value must match the bitmap
-        * contents:
-        */
-       tss = &per_cpu(init_tss, get_cpu());
-
-       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
-
-       /*
-        * Search for a (possibly new) maximum. This is simple and stupid,
-        * to keep it obviously correct:
-        */
-       max_long = 0;
-       for (i = 0; i < IO_BITMAP_LONGS; i++)
-               if (t->io_bitmap_ptr[i] != ~0UL)
-                       max_long = i;
-
-       bytes = (max_long + 1) * sizeof(long);
-       /* Copy the larger of the old and new extents into the TSS */
-       bytes_updated = max(bytes, t->io_bitmap_max);
-
-       t->io_bitmap_max = bytes;
-
-       /* Update the TSS: */
-       memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
-
-       put_cpu();
-
-       return 0;
-}
-
-/*
- * sys_iopl has to be used when you want to access the IO ports
- * beyond the 0x3ff range: to get the full 65536 ports bitmapped
- * you'd need 8kB of bitmaps/process, which is a bit excessive.
- *
- * Here we just change the eflags value on the stack: we allow
- * only the super-user to do it. This depends on the stack-layout
- * on system-call entry - see also fork() and the signal handling
- * code.
- */
-
-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
-{
-       /* The current level lives in bits 12-13 of the saved eflags */
-       unsigned int cur_level = (regs->eflags >> 12) & 3;
-
-       if (level > 3)
-               return -EINVAL;
-
-       /* Only raising the level requires CAP_SYS_RAWIO */
-       if (level > cur_level && !capable(CAP_SYS_RAWIO))
-               return -EPERM;
-
-       regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | (level << 12);
-       return 0;
-}
diff --git a/arch/x86_64/kernel/irq_64.c b/arch/x86_64/kernel/irq_64.c
deleted file mode 100644 (file)
index 39cb3fa..0000000
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- *     linux/arch/x86_64/kernel/irq.c
- *
- *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
- *
- * This file contains the lowest level x86_64-specific interrupt
- * entry and irq statistics code. All the remaining irq logic is
- * done by the generic kernel/irq/ code and in the
- * x86_64-specific irq controller code. (e.g. i8259.c and
- * io_apic.c.)
- */
-
-#include <linux/kernel_stat.h>
-#include <linux/interrupt.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/delay.h>
-#include <asm/uaccess.h>
-#include <asm/io_apic.h>
-#include <asm/idle.h>
-#include <asm/smp.h>
-
-atomic_t irq_err_count;
-
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
-/*
- * Probabilistic stack overflow check:
- *
- * Only check the stack in process context, because everything else
- * runs on the big interrupt stacks. Checking reliably is too expensive,
- * so we just check from interrupts.
- */
-static inline void stack_overflow_check(struct pt_regs *regs)
-{
-       static unsigned long warned = -60*HZ;
-       u64 curbase = (u64)task_stack_page(current);
-
-       /* Not on the current task's stack at all: nothing to check */
-       if (regs->rsp < curbase || regs->rsp > curbase + THREAD_SIZE)
-               return;
-
-       /* Still more than 128 bytes above the thread_info: fine */
-       if (regs->rsp >= curbase + sizeof(struct thread_info) + 128)
-               return;
-
-       /* Warn at most once every 60 seconds */
-       if (!time_after(jiffies, warned + 60*HZ))
-               return;
-
-       printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
-              current->comm, curbase, regs->rsp);
-       show_stack(NULL,NULL);
-       warned = jiffies;
-}
-#endif
-
-/*
- * Generic, controller-independent functions:
- */
-
-/*
- * seq_file show callback: emits one line per call, selected by the
- * position *v.
- *   position 0        - additionally prints the CPU column header first
- *   position < NR_IRQS - the counters, chip/name and action names of that
- *                        irq (nothing if the irq has no action)
- *   position NR_IRQS  - the NMI, LOC and ERR summary lines
- * Always returns 0.
- */
-int show_interrupts(struct seq_file *p, void *v)
-{
-       int i = *(loff_t *) v, j;
-       struct irqaction * action;
-       unsigned long flags;
-
-       if (i == 0) {
-               seq_printf(p, "           ");
-               for_each_online_cpu(j)
-                       seq_printf(p, "CPU%-8d",j);
-               seq_putc(p, '\n');
-       }
-
-       if (i < NR_IRQS) {
-               /* The descriptor lock is held while walking the action list */
-               spin_lock_irqsave(&irq_desc[i].lock, flags);
-               action = irq_desc[i].action;
-               if (!action) 
-                       goto skip;
-               seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
-               seq_printf(p, "%10u ", kstat_irqs(i));
-#else
-               for_each_online_cpu(j)
-                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-#endif
-               seq_printf(p, " %8s", irq_desc[i].chip->name);
-               seq_printf(p, "-%-8s", irq_desc[i].name);
-
-               /* First action name, then any further ones comma-separated */
-               seq_printf(p, "  %s", action->name);
-               for (action=action->next; action; action = action->next)
-                       seq_printf(p, ", %s", action->name);
-               seq_putc(p, '\n');
-skip:
-               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
-       } else if (i == NR_IRQS) {
-               seq_printf(p, "NMI: ");
-               for_each_online_cpu(j)
-                       seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
-               seq_putc(p, '\n');
-               seq_printf(p, "LOC: ");
-               for_each_online_cpu(j)
-                       seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
-               seq_putc(p, '\n');
-               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-       }
-       return 0;
-}
-
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- *
- * The vector is recovered from regs->orig_rax, mapped to an irq through
- * this cpu's vector_irq[] table and dispatched via generic_handle_irq().
- * A vector that maps to no valid irq is acked (unless the APIC is
- * disabled) and reported, rate-limited.  Always returns 1.
- */
-asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
-{
-       struct pt_regs *old_regs = set_irq_regs(regs);
-
-       /* high bit used in ret_from_ code  */
-       /* NOTE(review): orig_rax presumably holds the complemented vector - confirm in entry code */
-       unsigned vector = ~regs->orig_rax;
-       unsigned irq;
-
-       exit_idle();
-       irq_enter();
-       irq = __get_cpu_var(vector_irq)[vector];
-
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
-       stack_overflow_check(regs);
-#endif
-
-       if (likely(irq < NR_IRQS))
-               generic_handle_irq(irq);
-       else {
-               /* No handler will ack this one for us */
-               if (!disable_apic)
-                       ack_APIC_irq();
-
-               if (printk_ratelimit())
-                       printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n",
-                               __func__, smp_processor_id(), vector);
-       }
-
-       irq_exit();
-
-       /* Restore whatever regs pointer was active before this interrupt */
-       set_irq_regs(old_regs);
-       return 1;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Re-target every irq that has an action so that its affinity lies within
- * @map.  An irq whose affinity does not intersect @map at all gets @map
- * itself ("broken" affinity).  Each irq is masked around the affinity
- * change when its chip provides mask/unmask.  Afterwards interrupts are
- * enabled for 1ms.
- */
-void fixup_irqs(cpumask_t map)
-{
-       unsigned int irq;
-       static int warned;
-
-       for (irq = 0; irq < NR_IRQS; irq++) {
-               cpumask_t mask;
-               int break_affinity = 0;
-               int set_affinity = 1;
-
-               /* NOTE(review): irq 2 is skipped unconditionally - presumably the cascade; confirm */
-               if (irq == 2)
-                       continue;
-
-               /* interrupts are disabled at this point */
-               spin_lock(&irq_desc[irq].lock);
-
-               /* Nothing to do for unused irqs or ones already bound to exactly @map */
-               if (!irq_has_action(irq) ||
-                   cpus_equal(irq_desc[irq].affinity, map)) {
-                       spin_unlock(&irq_desc[irq].lock);
-                       continue;
-               }
-
-               cpus_and(mask, irq_desc[irq].affinity, map);
-               if (cpus_empty(mask)) {
-                       break_affinity = 1;
-                       mask = map;
-               }
-
-               if (irq_desc[irq].chip->mask)
-                       irq_desc[irq].chip->mask(irq);
-
-               /*
-                * Without a set_affinity method the irq cannot be moved; the
-                * static "warned" counter makes only the first such irq
-                * clear set_affinity and thus print the message below.
-                */
-               if (irq_desc[irq].chip->set_affinity)
-                       irq_desc[irq].chip->set_affinity(irq, mask);
-               else if (!(warned++))
-                       set_affinity = 0;
-
-               if (irq_desc[irq].chip->unmask)
-                       irq_desc[irq].chip->unmask(irq);
-
-               spin_unlock(&irq_desc[irq].lock);
-
-               if (break_affinity && set_affinity)
-                       printk("Broke affinity for irq %i\n", irq);
-               else if (!set_affinity)
-                       printk("Cannot set affinity for irq %i\n", irq);
-       }
-
-       /* That doesn't seem sufficient.  Give it 1ms. */
-       local_irq_enable();
-       mdelay(1);
-       local_irq_disable();
-}
-#endif
-
-extern void call_softirq(void);
-
-/*
- * Run pending softirqs via call_softirq() unless we are already in
- * interrupt context.  Local interrupts are disabled around the check
- * and the call.
- */
-asmlinkage void do_softirq(void)
-{
-       unsigned long irqflags;
-       __u32 softirqs;
-
-       if (in_interrupt())
-               return;
-
-       local_irq_save(irqflags);
-       softirqs = local_softirq_pending();
-       if (softirqs) {
-               /* Switch to interrupt stack */
-               call_softirq();
-               WARN_ON_ONCE(softirq_count());
-       }
-       local_irq_restore(irqflags);
-}
-EXPORT_SYMBOL(do_softirq);
diff --git a/arch/x86_64/kernel/k8.c b/arch/x86_64/kernel/k8.c
deleted file mode 100644 (file)
index 7377ccb..0000000
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Shared support code for AMD K8 northbridges and derivatives.
- * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
- */
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/k8.h>
-
-int num_k8_northbridges;
-EXPORT_SYMBOL(num_k8_northbridges);
-
-static u32 *flush_words;
-
-struct pci_device_id k8_nb_ids[] = {
-       { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
-       { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
-       {}
-};
-EXPORT_SYMBOL(k8_nb_ids);
-
-struct pci_dev **k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
-
-/*
- * Continue the PCI device walk after @dev (NULL starts from the
- * beginning) and return the next device matching k8_nb_ids, or NULL
- * when the walk is exhausted.
- */
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
-{
-       for (;;) {
-               dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
-               if (!dev || pci_match_id(&k8_nb_ids[0], dev))
-                       return dev;
-       }
-}
-
-/*
- * Enumerate the K8 northbridge PCI devices once and cache them in the
- * NULL-terminated k8_northbridges[] array, together with the dword read
- * from config offset 0x9c of each one (flush_words[]).
- *
- * Returns 0 on success, including when the cache was already built, and
- * -ENOMEM when an allocation fails.
- *
- * The globals (flush_words, k8_northbridges, num_k8_northbridges) are
- * only published once everything has succeeded.  A failed call therefore
- * leaves no stale count or freed pointer behind and may simply be
- * retried.
- */
-int cache_k8_northbridges(void)
-{
-       int i, count;
-       struct pci_dev *dev;
-       struct pci_dev **nbs;
-       u32 *words;
-
-       /*
-        * k8_northbridges is non-NULL after a successful scan even when no
-        * device was found, so test it as well to avoid re-allocating.
-        */
-       if (num_k8_northbridges || k8_northbridges)
-               return 0;
-
-       count = 0;
-       dev = NULL;
-       while ((dev = next_k8_northbridge(dev)) != NULL)
-               count++;
-
-       /* One extra slot for the NULL terminator */
-       nbs = kmalloc((count + 1) * sizeof(*nbs), GFP_KERNEL);
-       if (!nbs)
-               return -ENOMEM;
-
-       if (!count) {
-               nbs[0] = NULL;
-               k8_northbridges = nbs;
-               return 0;
-       }
-
-       words = kmalloc(count * sizeof(*words), GFP_KERNEL);
-       if (!words) {
-               kfree(nbs);
-               return -ENOMEM;
-       }
-
-       dev = NULL;
-       i = 0;
-       while ((dev = next_k8_northbridge(dev)) != NULL) {
-               nbs[i] = dev;
-               pci_read_config_dword(dev, 0x9c, &words[i++]);
-       }
-       nbs[i] = NULL;
-
-       /* Publish the arrays before the count that makes readers use them */
-       flush_words = words;
-       k8_northbridges = nbs;
-       num_k8_northbridges = count;
-       return 0;
-}
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
-
-/*
- * Check a raw vendor/device dword (vendor id in the low 16 bits, device
- * id in the high 16 bits) against k8_nb_ids.  Returns 1 on a match, 0
- * otherwise.  Ignores subdevice/subvendor but as far as I can figure out
- * they're useless anyways.
- */
-int __init early_is_k8_nb(u32 device)
-{
-       u32 vendor_id = device & 0xffff;
-       u32 device_id = device >> 16;
-       struct pci_device_id *id = k8_nb_ids;
-
-       /* The table is terminated by an entry with vendor == 0 */
-       while (id->vendor) {
-               if (id->vendor == vendor_id && id->device == device_id)
-                       return 1;
-               id++;
-       }
-       return 0;
-}
-
-/*
- * Trigger a flush on every cached northbridge by writing its saved flush
- * word with bit 0 set to config offset 0x9c, then poll each one until
- * bit 0 reads back as clear.
- */
-void k8_flush_garts(void)
-{
-       static DEFINE_SPINLOCK(gart_lock);
-       unsigned long irqflags;
-       int nb;
-       int flushed = 0;
-
-       /* Avoid races between AGP and IOMMU. In theory it's not needed
-          but I'm not sure if the hardware won't lose flush requests
-          when another is pending. This whole thing is so expensive anyways
-          that it doesn't matter to serialize more. -AK */
-       spin_lock_irqsave(&gart_lock, irqflags);
-
-       for (nb = 0; nb < num_k8_northbridges; nb++) {
-               pci_write_config_dword(k8_northbridges[nb], 0x9c,
-                                      flush_words[nb] | 1);
-               flushed++;
-       }
-
-       /* Make sure the hardware actually executed the flush */
-       for (nb = 0; nb < num_k8_northbridges; nb++) {
-               u32 w;
-
-               pci_read_config_dword(k8_northbridges[nb], 0x9c, &w);
-               while (w & 1) {
-                       cpu_relax();
-                       pci_read_config_dword(k8_northbridges[nb], 0x9c, &w);
-               }
-       }
-
-       spin_unlock_irqrestore(&gart_lock, irqflags);
-
-       if (!flushed)
-               printk("nothing to flush?\n");
-}
-EXPORT_SYMBOL_GPL(k8_flush_garts);
-
diff --git a/arch/x86_64/kernel/kprobes_64.c b/arch/x86_64/kernel/kprobes_64.c
deleted file mode 100644 (file)
index a30e004..0000000
+++ /dev/null
@@ -1,749 +0,0 @@
-/*
- *  Kernel Probes (KProbes)
- *  arch/x86_64/kernel/kprobes.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- *
- * 2002-Oct    Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
- *             Probes initial implementation ( includes contributions from
- *             Rusty Russell).
- * 2004-July   Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
- *             interface to access function arguments.
- * 2004-Oct    Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
- *             <prasanna@in.ibm.com> adapted for x86_64
- * 2005-Mar    Roland McGrath <roland@redhat.com>
- *             Fixed to handle %rip-relative addressing mode correctly.
- * 2005-May     Rusty Lynch <rusty.lynch@intel.com>
- *              Added function return probes functionality
- */
-
-#include <linux/kprobes.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/preempt.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-#include <asm/alternative.h>
-
-void jprobe_return_end(void);
-static void __kprobes arch_copy_kprobe(struct kprobe *p);
-
-DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
-DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-
-/*
- * Returns 1 if the opcode at @insn modifies the interrupt flag, else 0.
- * Besides the bare opcodes, a single 0x40-0x4f prefix byte followed by
- * iret is recognised as well.
- */
-static __always_inline int is_IF_modifier(kprobe_opcode_t *insn)
-{
-       kprobe_opcode_t first = *insn;
-
-       if (first == 0xfa ||            /* cli */
-           first == 0xfb ||            /* sti */
-           first == 0xcf ||            /* iret/iretd */
-           first == 0x9d)              /* popf/popfd */
-               return 1;
-
-       /* The second byte is only looked at behind a 0x40-0x4f prefix */
-       if (first >= 0x40 && first <= 0x4f && insn[1] == 0xcf)
-               return 1;
-
-       return 0;
-}
-
-/*
- * Obtain an instruction slot for @p and copy the probed instruction into
- * it.  Returns 0 on success, -ENOMEM when no slot is available.
- */
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
-       /* insn: must be on special executable page on x86_64. */
-       p->ainsn.insn = get_insn_slot();
-       if (p->ainsn.insn == NULL)
-               return -ENOMEM;
-
-       arch_copy_kprobe(p);
-       return 0;
-}
-
-/*
- * Determine if the instruction uses the %rip-relative addressing mode.
- * If it does, return the address of the 32-bit displacement word.
- * If not, return null.
- *
- * @insn points at the first byte of the instruction.  Legacy prefixes and
- * one REX prefix are skipped, the opcode is looked up in a has-ModRM bit
- * table, and the ModRM byte (if any) is tested for the disp32 form.
- */
-static s32 __kprobes *is_riprel(u8 *insn)
-{
-/*
- * W() packs the sixteen 0/1 flags of one opcode row into bits 0-15 and
- * shifts them to bit position (row % 64).  Four rows OR-ed together form
- * one u64 table element; a comma ends the element.
- */
-#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf)               \
-       (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
-         (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
-         (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
-         (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
-        << (row % 64))
-       /* Bit N is set when one-byte opcode N is followed by a ModRM byte */
-       static const u64 onebyte_has_modrm[256 / 64] = {
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-               /*      -------------------------------         */
-               W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
-               W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
-               W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
-               W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
-               W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
-               W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
-               W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
-               W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
-               W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
-               W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
-               W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
-               W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
-               W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
-               W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
-               W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
-               W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1)  /* f0 */
-               /*      -------------------------------         */
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-       };
-       /* Same layout, indexed by the second byte of a 0x0f-escaped opcode */
-       static const u64 twobyte_has_modrm[256 / 64] = {
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-               /*      -------------------------------         */
-               W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
-               W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
-               W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
-               W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
-               W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
-               W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
-               W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
-               W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
-               W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
-               W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
-               W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
-               W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
-               W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
-               W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
-               W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
-               W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0)  /* ff */
-               /*      -------------------------------         */
-               /*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
-       };
-#undef W
-       int need_modrm;
-
-       /* Skip legacy instruction prefixes.  */
-       while (1) {
-               switch (*insn) {
-               case 0x66:
-               case 0x67:
-               case 0x2e:
-               case 0x3e:
-               case 0x26:
-               case 0x64:
-               case 0x65:
-               case 0x36:
-               case 0xf0:
-               case 0xf3:
-               case 0xf2:
-                       ++insn;
-                       continue;
-               }
-               /* Reached only when the byte is not one of the prefixes above */
-               break;
-       }
-
-       /* Skip REX instruction prefix.  */
-       if ((*insn & 0xf0) == 0x40)
-               ++insn;
-
-       if (*insn == 0x0f) {    /* Two-byte opcode.  */
-               ++insn;
-               need_modrm = test_bit(*insn, twobyte_has_modrm);
-       } else {                /* One-byte opcode.  */
-               need_modrm = test_bit(*insn, onebyte_has_modrm);
-       }
-
-       if (need_modrm) {
-               u8 modrm = *++insn;
-               /* 0xc7 keeps bits 7-6 and 2-0 of ModRM; 0x05 means 00 and 101 */
-               if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
-                       /* Displacement follows ModRM byte.  */
-                       return (s32 *) ++insn;
-               }
-       }
-
-       /* No %rip-relative addressing mode here.  */
-       return NULL;
-}
-
-/*
- * Copy MAX_INSN_SIZE bytes of the probed instruction into the slot at
- * p->ainsn.insn, fix up a %rip-relative displacement for the new
- * location, and remember the original first opcode byte in p->opcode.
- */
-static void __kprobes arch_copy_kprobe(struct kprobe *p)
-{
-       s32 *disp32;
-
-       memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
-
-       disp32 = is_riprel(p->ainsn.insn);
-       if (disp32 != NULL) {
-               /*
-                * The copy runs from a different address than the original,
-                * so a %rip-relative displacement must be adjusted by the
-                * distance between the two.  The adjusted value has to fit
-                * a signed 32-bit field, because the CPU sign-extends it to
-                * 64 bits before adding it to %rip; the BUG_ON verifies that
-                * nothing is lost by the truncation.
-                */
-               s64 adjusted = (u8 *) p->addr + *disp32 - (u8 *) p->ainsn.insn;
-
-               BUG_ON((s64) (s32) adjusted != adjusted); /* Sanity check.  */
-               *disp32 = adjusted;
-       }
-
-       p->opcode = *p->addr;
-}
-
-/* Arm the probe: overwrite the first byte at p->addr with BREAKPOINT_INSTRUCTION. */
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
-       text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
-}
-
-/* Disarm the probe: write the saved opcode byte (p->opcode) back to p->addr. */
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
-{
-       text_poke(p->addr, &p->opcode, 1);
-}
-
-/* Return the probe's instruction slot; free_insn_slot() is called under kprobe_mutex. */
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
-       mutex_lock(&kprobe_mutex);
-       free_insn_slot(p->ainsn.insn, 0);
-       mutex_unlock(&kprobe_mutex);
-}
-
-/*
- * Stash the currently running kprobe together with its status and the
- * two rflags fields in kcb->prev_kprobe.  restore_previous_kprobe()
- * is the inverse.
- */
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-       kcb->prev_kprobe.kp = kprobe_running();
-       kcb->prev_kprobe.status = kcb->kprobe_status;
-       kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
-       kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
-}
-
-/*
- * Inverse of save_previous_kprobe(): reinstate the probe, status and
- * flag words that were stashed in kcb->prev_kprobe.
- */
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-       kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
-       kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
-       kcb->kprobe_status = kcb->prev_kprobe.status;
-       __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
-}
-
-/*
- * Make p the probe being handled on this CPU and record the TF/IF bits
- * of the interrupted context.  kprobe_old_rflags keeps them as found;
- * kprobe_saved_rflags additionally drops IF when the probed instruction
- * is itself an IF modifier.
- */
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
-                               struct kprobe_ctlblk *kcb)
-{
-       unsigned long tf_if = regs->eflags & (TF_MASK | IF_MASK);
-
-       __get_cpu_var(current_kprobe) = p;
-       kcb->kprobe_old_rflags = tf_if;
-
-       if (is_IF_modifier(p->ainsn.insn))
-               tf_if &= ~IF_MASK;
-       kcb->kprobe_saved_rflags = tf_if;
-}
-
-/*
- * Arrange for exactly one instruction to be single-stepped on return
- * from the trap: TF is set, IF is cleared, and rip is pointed at the
- * instruction to step.
- */
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
-{
-       kprobe_opcode_t *step_at;
-
-       /* a probed int3 is stepped in place, anything else from the copy */
-       if (p->opcode != BREAKPOINT_INSTRUCTION)
-               step_at = p->ainsn.insn;
-       else
-               step_at = p->addr;
-
-       regs->eflags = (regs->eflags | TF_MASK) & ~IF_MASK;
-       regs->rip = (unsigned long)step_at;
-}
-
-/*
- * Divert the return of a return-probed function: the word at regs->rsp
- * is saved in ri->ret_addr and replaced with the trampoline address.
- * Called with kretprobe_lock held.
- */
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-                                     struct pt_regs *regs)
-{
-       unsigned long *ret_slot = (unsigned long *)regs->rsp;
-       unsigned long real_ret = *ret_slot;
-
-       /* remember where the function was really going to return ... */
-       ri->ret_addr = (kprobe_opcode_t *)real_ret;
-       /* ... and make it return into the trampoline instead */
-       *ret_slot = (unsigned long)&kretprobe_trampoline;
-}
-
-/*
- * Breakpoint (DIE_INT3) handler for kprobes.
- *
- * Returns nonzero when the trap has been dealt with here and 0 when it
- * is left to the rest of the kernel.
- *
- * Preemption is disabled on entry.  Only the no_kprobe exit re-enables
- * it; the paths that return 1 directly leave it disabled (after a
- * single-step it is re-enabled in post_kprobe_handler()).
- */
-int __kprobes kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *p;
-       int ret = 0;
-       /* rip is already past the int3: back up one opcode to the trap address */
-       kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
-       struct kprobe_ctlblk *kcb;
-
-       /*
-        * We don't want to be preempted for the entire
-        * duration of kprobe processing
-        */
-       preempt_disable();
-       kcb = get_kprobe_ctlblk();
-
-       /* Check we're not actually recursing */
-       if (kprobe_running()) {
-               p = get_kprobe(addr);
-               if (p) {
-                       if (kcb->kprobe_status == KPROBE_HIT_SS &&
-                               *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
-                               /*
-                                * The instruction being single-stepped is
-                                * itself an int3: stop stepping, restore the
-                                * saved flags and do not claim the trap.
-                                */
-                               regs->eflags &= ~TF_MASK;
-                               regs->eflags |= kcb->kprobe_saved_rflags;
-                               goto no_kprobe;
-                       } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
-                               /* TODO: Provide re-entrancy from
-                                * post_kprobes_handler() and avoid exception
-                                * stack corruption while single-stepping on
-                                * the instruction of the new probe.
-                                */
-                               arch_disarm_kprobe(p);
-                               regs->rip = (unsigned long)p->addr;
-                               reset_current_kprobe();
-                               ret = 1;
-                       } else {
-                               /* We have reentered the kprobe_handler(), since
-                                * another probe was hit while within the
-                                * handler. We here save the original kprobe
-                                * variables and just single step on instruction
-                                * of the new probe without calling any user
-                                * handlers.
-                                */
-                               save_previous_kprobe(kcb);
-                               set_current_kprobe(p, regs, kcb);
-                               kprobes_inc_nmissed_count(p);
-                               prepare_singlestep(p, regs);
-                               kcb->kprobe_status = KPROBE_REENTER;
-                               return 1;
-                       }
-               } else {
-                       if (*addr != BREAKPOINT_INSTRUCTION) {
-                       /* The breakpoint instruction was removed by
-                        * another cpu right after we hit, no further
-                        * handling of this interrupt is appropriate
-                        */
-                               regs->rip = (unsigned long)addr;
-                               ret = 1;
-                               goto no_kprobe;
-                       }
-                       /*
-                        * An int3 with no probe registered at its address,
-                        * hit while a probe is running: offer it to the
-                        * running probe's break_handler.
-                        */
-                       p = __get_cpu_var(current_kprobe);
-                       if (p->break_handler && p->break_handler(p, regs)) {
-                               goto ss_probe;
-                       }
-               }
-               goto no_kprobe;
-       }
-
-       p = get_kprobe(addr);
-       if (!p) {
-               if (*addr != BREAKPOINT_INSTRUCTION) {
-                       /*
-                        * The breakpoint instruction was removed right
-                        * after we hit it.  Another cpu has removed
-                        * either a probepoint or a debugger breakpoint
-                        * at this address.  In either case, no further
-                        * handling of this interrupt is appropriate.
-                        * Back up over the (now missing) int3 and run
-                        * the original instruction.
-                        */
-                       regs->rip = (unsigned long)addr;
-                       ret = 1;
-               }
-               /* Not one of ours: let kernel handle it */
-               goto no_kprobe;
-       }
-
-       /* First-level hit on a registered probe. */
-       set_current_kprobe(p, regs, kcb);
-       kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-
-       if (p->pre_handler && p->pre_handler(p, regs))
-               /* handler has already set things up, so skip ss setup */
-               return 1;
-
-ss_probe:
-       prepare_singlestep(p, regs);
-       kcb->kprobe_status = KPROBE_HIT_SS;
-       return 1;
-
-no_kprobe:
-       /* The only exit that undoes the preempt_disable() above. */
-       preempt_enable_no_resched();
-       return ret;
-}
-
-/*
- * Home of the kretprobe_trampoline label.  arch_prepare_kretprobe()
- * redirects the return address of a return-probed function to this
- * label, and arch_init_kprobes() registers a kprobe (trampoline_p) on
- * it, so returning from such a function ends up in
- * trampoline_probe_handler(), which calls the kretprobe's handler.
- */
-void kretprobe_trampoline_holder(void)
-{
-       asm volatile (".global kretprobe_trampoline\n"
-                     "kretprobe_trampoline: \n"
-                     "nop\n");
-}
-
-/*
- * Called when we hit the probe point at kretprobe_trampoline
- *
- * Walks the current task's kretprobe instances under kretprobe_lock,
- * runs their handlers, and points regs->rip at the real return address.
- * The instances collected on empty_rp are freed only after the lock has
- * been dropped.  Always returns 1.
- */
-int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct kretprobe_instance *ri = NULL;
-       struct hlist_head *head, empty_rp;
-       struct hlist_node *node, *tmp;
-       unsigned long flags, orig_ret_address = 0;
-       unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
-
-       INIT_HLIST_HEAD(&empty_rp);
-       spin_lock_irqsave(&kretprobe_lock, flags);
-       head = kretprobe_inst_table_head(current);
-
-       /*
-        * It is possible to have multiple instances associated with a given
-        * task either because multiple functions in the call path
-        * have a return probe installed on them, and/or more than one
-        * return probe was registered for a target function.
-        *
-        * We can handle this because:
-        *     - instances are always inserted at the head of the list
-        *     - when multiple return probes are registered for the same
-        *       function, the first instance's ret_addr will point to the
-        *       real return address, and all the rest will point to
-        *       kretprobe_trampoline
-        */
-       hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
-               if (ri->task != current)
-                       /* another task is sharing our hash bucket */
-                       continue;
-
-               if (ri->rp && ri->rp->handler)
-                       ri->rp->handler(ri, regs);
-
-               /* Read ret_addr before the instance is handed to recycle_rp_inst(). */
-               orig_ret_address = (unsigned long)ri->ret_addr;
-               recycle_rp_inst(ri, &empty_rp);
-
-               if (orig_ret_address != trampoline_address)
-                       /*
-                        * This is the real return address. Any other
-                        * instances associated with this task are for
-                        * other calls deeper on the call stack
-                        */
-                       break;
-       }
-
-       kretprobe_assert(ri, orig_ret_address, trampoline_address);
-       /* Resume at the address the probed function was really returning to. */
-       regs->rip = orig_ret_address;
-
-       reset_current_kprobe();
-       spin_unlock_irqrestore(&kretprobe_lock, flags);
-       preempt_enable_no_resched();
-
-       /* Free the instances gathered on empty_rp, now that the lock is released. */
-       hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
-               hlist_del(&ri->hlist);
-               kfree(ri);
-       }
-       /*
-        * By returning a non-zero value, we are telling
-        * kprobe_handler() that we don't want the post_handler
-        * to run (and have re-enabled preemption)
-        */
-       return 1;
-}
-
-/*
- * Called after single-stepping.  p->addr is the address of the
- * instruction whose first byte has been replaced by the "int 3"
- * instruction.  To avoid the SMP problems that can occur when we
- * temporarily put back the original opcode to single-step, we
- * single-stepped a copy of the instruction.  The address of this
- * copy is p->ainsn.insn.
- *
- * This function prepares to return from the post-single-step
- * interrupt.  We have to fix up the stack as follows:
- *
- * 0) Except in the case of absolute or indirect jump or call instructions,
- * the new rip is relative to the copied instruction.  We need to make
- * it relative to the original instruction.
- *
- * 1) If the single-stepped instruction was pushfl, then the TF and IF
- * flags are set in the just-pushed eflags, and may need to be cleared.
- *
- * 2) If the single-stepped instruction was a call, the return address
- * that is atop the stack is the address following the copied instruction.
- * We need to make it the address following the original instruction.
- */
-static void __kprobes resume_execution(struct kprobe *p,
-               struct pt_regs *regs, struct kprobe_ctlblk *kcb)
-{
-       unsigned long *tos = (unsigned long *)regs->rsp;
-       /* Stays 0 unless the stepped instruction left an absolute rip behind. */
-       unsigned long next_rip = 0;
-       unsigned long copy_rip = (unsigned long)p->ainsn.insn;
-       unsigned long orig_rip = (unsigned long)p->addr;
-       kprobe_opcode_t *insn = p->ainsn.insn;
-
-       /*skip the REX prefix*/
-       /* Only one leading byte in 0x40..0x4f is skipped; no other prefixes. */
-       if (*insn >= 0x40 && *insn <= 0x4f)
-               insn++;
-
-       switch (*insn) {
-       case 0x9c:              /* pushfl */
-               /* Replace TF/IF in the pushed flags image with the pre-probe values. */
-               *tos &= ~(TF_MASK | IF_MASK);
-               *tos |= kcb->kprobe_old_rflags;
-               break;
-       case 0xc3:              /* ret/lret */
-       case 0xcb:
-       case 0xc2:
-       case 0xca:
-               regs->eflags &= ~TF_MASK;
-               /* rip is already adjusted, no more changes required*/
-               return;
-       case 0xe8:              /* call relative - Fix return addr */
-               /* Rebase the pushed return address from the copy to the original. */
-               *tos = orig_rip + (*tos - copy_rip);
-               break;
-       case 0xff:
-               /* insn[1] is the byte after the 0xff opcode; its bits select the operation. */
-               if ((insn[1] & 0x30) == 0x10) {
-                       /* call absolute, indirect */
-                       /* Fix return addr; rip is correct. */
-                       next_rip = regs->rip;
-                       *tos = orig_rip + (*tos - copy_rip);
-               } else if (((insn[1] & 0x31) == 0x20) ||        /* jmp near, absolute indirect */
-                          ((insn[1] & 0x31) == 0x21)) {        /* jmp far, absolute indirect */
-                       /* rip is correct. */
-                       next_rip = regs->rip;
-               }
-               break;
-       case 0xea:              /* jmp absolute -- rip is correct */
-               next_rip = regs->rip;
-               break;
-       default:
-               break;
-       }
-
-       regs->eflags &= ~TF_MASK;
-       if (next_rip) {
-               regs->rip = next_rip;
-       } else {
-               /* rip is still relative to the copy: rebase it onto the original. */
-               regs->rip = orig_rip + (regs->rip - copy_rip);
-       }
-}
-
-/*
- * Debug-trap (DIE_DEBUG) handler, run after the probed instruction has
- * been single-stepped.
- *
- * Returns 0 when no kprobe is running on this CPU, or when TF is still
- * set in regs->eflags after the fix-up; returns 1 otherwise.
- */
-int __kprobes post_kprobe_handler(struct pt_regs *regs)
-{
-       struct kprobe *cur = kprobe_running();
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       if (!cur)
-               return 0;
-
-       /* The user post_handler is not run for a re-entered probe. */
-       if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
-               kcb->kprobe_status = KPROBE_HIT_SSDONE;
-               cur->post_handler(cur, regs, 0);
-       }
-
-       /* Fix up rip/stack, then put back the flag bits saved at probe entry. */
-       resume_execution(cur, regs, kcb);
-       regs->eflags |= kcb->kprobe_saved_rflags;
-
-       /* Restore the original saved kprobes variables and continue. */
-       if (kcb->kprobe_status == KPROBE_REENTER) {
-               restore_previous_kprobe(kcb);
-               goto out;
-       }
-       reset_current_kprobe();
-out:
-       /* Pairs with the preempt_disable() in kprobe_handler(). */
-       preempt_enable_no_resched();
-
-       /*
-        * if somebody else is singlestepping across a probe point, eflags
-        * will have TF set, in which case, continue the remaining processing
-        * of do_debug, as if this is not a probe hit.
-        */
-       if (regs->eflags & TF_MASK)
-               return 0;
-
-       return 1;
-}
-
-/*
- * Handle a fault (trap number trapnr) taken while a kprobe is running
- * on this CPU.
- *
- * Returns 1 when the fault has been dealt with here (the probe's
- * fault_handler returned nonzero, or an exception-table fixup was
- * applied) and 0 when the regular fault handling should continue.
- */
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-{
-       struct kprobe *cur = kprobe_running();
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-       const struct exception_table_entry *fixup;
-
-       switch(kcb->kprobe_status) {
-       case KPROBE_HIT_SS:
-       case KPROBE_REENTER:
-               /*
-                * We are here because the instruction being single
-                * stepped caused a page fault. We reset the current
-                * kprobe and the rip points back to the probe address
-                * and allow the page fault handler to continue as a
-                * normal page fault.
-                */
-               regs->rip = (unsigned long)cur->addr;
-               regs->eflags |= kcb->kprobe_old_rflags;
-               if (kcb->kprobe_status == KPROBE_REENTER)
-                       restore_previous_kprobe(kcb);
-               else
-                       reset_current_kprobe();
-               preempt_enable_no_resched();
-               break;
-       case KPROBE_HIT_ACTIVE:
-       case KPROBE_HIT_SSDONE:
-               /*
-                * We increment the nmissed count for accounting,
-                * we can also use npre/npostfault count for accounting
-                * these specific fault cases.
-                */
-               kprobes_inc_nmissed_count(cur);
-
-               /*
-                * We come here because instructions in the pre/post
-                * handler caused the page_fault, this could happen
-                * if handler tries to access user space by
-                * copy_from_user(), get_user() etc. Let the
-                * user-specified handler try to fix it first.
-                */
-               if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-                       return 1;
-
-               /*
-                * In case the user-specified fault handler returned
-                * zero, try to fix up.
-                */
-               fixup = search_exception_tables(regs->rip);
-               if (fixup) {
-                       regs->rip = fixup->fixup;
-                       return 1;
-               }
-
-               /*
-                * fixup() could not handle it,
-                * Let do_page_fault() fix it.
-                */
-               break;
-       default:
-               break;
-       }
-       return 0;
-}
-
-/*
- * Die-notifier entry point for kprobes.  Dispatches int3, debug and
- * fault events to the matching kprobe handler and reports NOTIFY_STOP
- * when that handler claimed the event, NOTIFY_DONE otherwise.  Events
- * raised from user mode are never claimed.
- */
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-                                      unsigned long val, void *data)
-{
-       struct die_args *args = data;
-       int claimed = 0;
-
-       if (args->regs && user_mode(args->regs))
-               return NOTIFY_DONE;
-
-       switch (val) {
-       case DIE_INT3:
-               claimed = kprobe_handler(args->regs);
-               break;
-       case DIE_DEBUG:
-               claimed = post_kprobe_handler(args->regs);
-               break;
-       case DIE_GPF:
-       case DIE_PAGE_FAULT:
-               /* kprobe_running() needs smp_processor_id() */
-               preempt_disable();
-               claimed = kprobe_running() &&
-                         kprobe_fault_handler(args->regs, args->trapnr);
-               preempt_enable();
-               break;
-       default:
-               break;
-       }
-
-       return claimed ? NOTIFY_STOP : NOTIFY_DONE;
-}
-
-/*
- * pre_handler used for jprobes.  Saves the register set, the stack
- * pointer and a slice of the stack in the per-CPU control block, then
- * redirects execution to the jprobe's entry function with interrupts
- * masked.  Returns 1, which makes kprobe_handler() skip its single-step
- * setup.
- */
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct jprobe *jp = container_of(p, struct jprobe, kp);
-       unsigned long addr;
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       kcb->jprobe_saved_regs = *regs;
-       kcb->jprobe_saved_rsp = (long *) regs->rsp;
-       addr = (unsigned long)(kcb->jprobe_saved_rsp);
-       /*
-        * As Linus pointed out, gcc assumes that the callee
-        * owns the argument space and could overwrite it, e.g.
-        * tailcall optimization. So, to be absolutely safe
-        * we also save and restore enough stack bytes to cover
-        * the argument area.
-        */
-       memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
-                       MIN_STACK_SIZE(addr));
-       regs->eflags &= ~IF_MASK;
-       /* Resume in the jprobe's entry function instead of the probed one. */
-       regs->rip = (unsigned long)(jp->entry);
-       return 1;
-}
-
-/*
- * Swaps the stack pointer with the value saved by setjmp_pre_handler()
- * (passed in %rbx) and then executes an int3.  The jprobe_return_end
- * label marks the end of the address range that longjmp_break_handler()
- * checks the trapping int3 against.
- */
-void __kprobes jprobe_return(void)
-{
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-       /* "b" constraint: kcb->jprobe_saved_rsp is in %rbx on entry to the asm. */
-       asm volatile ("       xchg   %%rbx,%%rsp     \n"
-                     "       int3                      \n"
-                     "       .globl jprobe_return_end  \n"
-                     "       jprobe_return_end:        \n"
-                     "       nop                       \n"::"b"
-                     (kcb->jprobe_saved_rsp):"memory");
-}
-
-/*
- * break_handler used for jprobes.  If the trapping int3 lies between
- * jprobe_return and jprobe_return_end, the registers and stack bytes
- * saved by setjmp_pre_handler() are restored and 1 is returned;
- * otherwise nothing is touched and 0 is returned.
- *
- * A stack pointer that does not match the saved one is treated as
- * fatal: both register sets are dumped and BUG() is raised.
- */
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
-{
-       struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-       /* Address of the int3 that trapped (rip is one byte past it). */
-       u8 *addr = (u8 *) (regs->rip - 1);
-       unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
-       struct jprobe *jp = container_of(p, struct jprobe, kp);
-
-       if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
-               if ((long *)regs->rsp != kcb->jprobe_saved_rsp) {
-                       struct pt_regs *saved_regs =
-                           container_of(kcb->jprobe_saved_rsp,
-                                           struct pt_regs, rsp);
-                       printk("current rsp %p does not match saved rsp %p\n",
-                              (long *)regs->rsp, kcb->jprobe_saved_rsp);
-                       printk("Saved registers for jprobe %p\n", jp);
-                       show_registers(saved_regs);
-                       printk("Current registers\n");
-                       show_registers(regs);
-                       BUG();
-               }
-               /* Put back the register set and the saved stack bytes. */
-               *regs = kcb->jprobe_saved_regs;
-               memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
-                      MIN_STACK_SIZE(stack_addr));
-               preempt_enable_no_resched();
-               return 1;
-       }
-       return 0;
-}
-
-/*
- * The kprobe placed on kretprobe_trampoline; hitting it runs
- * trampoline_probe_handler().  Registered by arch_init_kprobes().
- */
-static struct kprobe trampoline_p = {
-       .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
-       .pre_handler = trampoline_probe_handler
-};
-
-/*
- * Arch-specific kprobes initialisation: registers the probe on
- * kretprobe_trampoline and returns register_kprobe()'s result.
- */
-int __init arch_init_kprobes(void)
-{
-       return register_kprobe(&trampoline_p);
-}
-
-/*
- * Tell whether a probe sits on kretprobe_trampoline: returns 1 if
- * p->addr is the trampoline address, 0 otherwise.
- */
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
-{
-       return p->addr == (kprobe_opcode_t *)&kretprobe_trampoline;
-}
diff --git a/arch/x86_64/kernel/ldt_64.c b/arch/x86_64/kernel/ldt_64.c
deleted file mode 100644 (file)
index bc9ffd5..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * linux/arch/x86_64/kernel/ldt.c
- *
- * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2002 Andi Kleen
- * 
- * This handles calls from both 32bit and 64bit mode.
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/ldt.h>
-#include <asm/desc.h>
-#include <asm/proto.h>
-
-#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
-/*
- * Cross-CPU callback (see the smp_call_function() call in alloc_ldt()):
- * reload the LDT from the context of the mm that is active on the CPU
- * running it.  The argument is unused.
- */
-static void flush_ldt(void *null)
-{
-       if (current->active_mm)
-               load_LDT(&current->active_mm->context);
-}
-#endif
-
-/*
- * Grow the LDT of context pc so that it holds at least mincount entries.
- *
- * Nothing is done when the table is already large enough.  Otherwise a
- * new table is allocated (size rounded up to a multiple of 512 entries),
- * the old entries are copied over, the remainder is zeroed, and the new
- * table is published in pc.  With reload set, the LDT is reloaded on
- * this CPU and, on SMP, flush_ldt() is run on the other CPUs if the
- * current mm's cpu_vm_mask names any CPU besides this one.  The old
- * table, if any, is freed last.
- *
- * Returns 0 on success or -ENOMEM if the allocation fails.
- */
-static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
-{
-       void *oldldt;
-       void *newldt;
-       unsigned oldsize;
-
-       if (mincount <= (unsigned)pc->size)
-               return 0;
-       oldsize = pc->size;
-       /* Round up to the next multiple of 512 entries. */
-       mincount = (mincount+511)&(~511);
-       /* Tables larger than one page come from vmalloc, smaller ones from kmalloc. */
-       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
-               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
-       else
-               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
-
-       if (!newldt)
-               return -ENOMEM;
-
-       if (oldsize)
-               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
-       oldldt = pc->ldt;
-       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
-       /*
-        * Publish in order: table contents, then the pointer, then the
-        * size.  NOTE(review): presumably so that an observer never pairs
-        * the new size with the old table - confirm against the readers.
-        */
-       wmb();
-       pc->ldt = newldt;
-       wmb();
-       pc->size = mincount;
-       wmb();
-       if (reload) {
-#ifdef CONFIG_SMP
-               cpumask_t mask;
-
-               preempt_disable();
-               mask = cpumask_of_cpu(smp_processor_id());
-               load_LDT(pc);
-               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
-                       smp_call_function(flush_ldt, NULL, 1, 1);
-               preempt_enable();
-#else
-               load_LDT(pc);
-#endif
-       }
-       if (oldsize) {
-               /* Free with the allocator that matches the old table's size. */
-               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
-                       vfree(oldldt);
-               else
-                       kfree(oldldt);
-       }
-       return 0;
-}
-
-/*
- * Give context new a table big enough for old's entries and copy them
- * over.  Returns 0 on success or the negative error from alloc_ldt().
- */
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
-{
-       int rc;
-
-       rc = alloc_ldt(new, old->size, 0);
-       if (rc >= 0) {
-               memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
-               rc = 0;
-       }
-       return rc;
-}
-
-/*
- * Set up the LDT state of a new mm: initialise its semaphore, start
- * with an empty table and, if the current mm has LDT entries, copy them
- * while holding the current mm's semaphore.
- *
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-       struct mm_struct *parent;
-       int rc;
-
-       init_MUTEX(&mm->context.sem);
-       mm->context.size = 0;
-
-       parent = current->mm;
-       if (!parent || parent->context.size <= 0)
-               return 0;
-
-       down(&parent->context.sem);
-       rc = copy_ldt(&mm->context, &parent->context);
-       up(&parent->context.sem);
-
-       return rc;
-}
-
-/*
- * Free the LDT of an mm, using the allocator that matches its size, and
- * mark the context as empty.
- *
- * Don't touch the LDT register - we're already in the next thread.
- */
-void destroy_context(struct mm_struct *mm)
-{
-       if (!mm->context.size)
-               return;
-
-       if ((unsigned)mm->context.size*LDT_ENTRY_SIZE <= PAGE_SIZE)
-               kfree(mm->context.ldt);
-       else
-               vfree(mm->context.ldt);
-
-       mm->context.size = 0;
-}
-
-/*
- * Copy the current mm's LDT to user space.
- *
- * At most LDT_ENTRY_SIZE*LDT_ENTRIES bytes are transferred; any part of
- * the (clamped) request beyond the end of the table is zero-filled.
- * Returns 0 if the mm has no LDT, the clamped byte count on success, or
- * -EFAULT if the user buffer cannot be written.
- */
-static int read_ldt(void __user * ptr, unsigned long bytecount)
-{
-       struct mm_struct *mm = current->mm;
-       unsigned long ldt_bytes;
-       int fault;
-
-       if (!mm->context.size)
-               return 0;
-       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
-               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
-
-       down(&mm->context.sem);
-       ldt_bytes = mm->context.size*LDT_ENTRY_SIZE;
-       if (ldt_bytes > bytecount)
-               ldt_bytes = bytecount;
-       fault = copy_to_user(ptr, mm->context.ldt, ldt_bytes) != 0;
-       up(&mm->context.sem);
-
-       if (fault)
-               return -EFAULT;
-
-       /* zero-fill the rest */
-       if (ldt_bytes != bytecount &&
-           clear_user(ptr+ldt_bytes, bytecount-ldt_bytes) != 0)
-               return -EFAULT;
-
-       return bytecount;
-}
-
-/*
- * "Read" the default LDT, which on x86-64 is all zeros: clear up to 128
- * bytes (an arbitrary cap) of the user buffer.  Returns the number of
- * bytes cleared or -EFAULT.
- */
-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
-{
-       unsigned long len = bytecount;
-
-       if (len > 128)
-               len = 128;
-
-       if (clear_user(ptr, len))
-               return -EFAULT;
-
-       return len;
-}
-
-/*
- * Install one LDT entry described by a struct user_desc read from user
- * space.
- *
- * @ptr:       user pointer to the struct user_desc
- * @bytecount: must equal sizeof(struct user_desc)
- * @oldmode:   nonzero for modify_ldt function 1, zero for function 0x11
- *             (see sys_modify_ldt())
- *
- * Returns 0 on success, -EINVAL for a bad size, entry number or
- * descriptor type, -EFAULT if the descriptor cannot be read, or the
- * error from alloc_ldt() if the table cannot be grown.
- */
-static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
-{
-       struct task_struct *me = current;
-       struct mm_struct * mm = me->mm;
-       __u32 entry_1, entry_2, *lp;
-       int error;
-       struct user_desc ldt_info;
-
-       error = -EINVAL;
-
-       if (bytecount != sizeof(ldt_info))
-               goto out;
-       error = -EFAULT;        
-       if (copy_from_user(&ldt_info, ptr, bytecount))
-               goto out;
-
-       error = -EINVAL;
-       if (ldt_info.entry_number >= LDT_ENTRIES)
-               goto out;
-       /* contents == 3 is refused in old mode and must be "not present" otherwise. */
-       if (ldt_info.contents == 3) {
-               if (oldmode)
-                       goto out;
-               if (ldt_info.seg_not_present == 0)
-                       goto out;
-       }
-
-       down(&mm->context.sem);
-       /* Grow the table (and reload the LDT) if the entry lies beyond its end. */
-       if (ldt_info.entry_number >= (unsigned)mm->context.size) {
-               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
-               if (error < 0)
-                       goto out_unlock;
-       }
-
-       /* Entries are located at an 8-byte stride (entry_number << 3). */
-       lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
-
-       /* Allow LDTs to be cleared by the user. */
-       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
-               if (oldmode || LDT_empty(&ldt_info)) {
-                       entry_1 = 0;
-                       entry_2 = 0;
-                       goto install;
-               }
-       }
-
-       entry_1 = LDT_entry_a(&ldt_info);
-       entry_2 = LDT_entry_b(&ldt_info);
-       /* Old mode always clears bit 20 of the second descriptor word. */
-       if (oldmode)
-               entry_2 &= ~(1 << 20);
-
-       /* Install the new entry ...  */
-install:
-       *lp     = entry_1;
-       *(lp+1) = entry_2;
-       error = 0;
-
-out_unlock:
-       up(&mm->context.sem);
-out:
-       return error;
-}
-
-/*
- * modify_ldt system call: dispatch on func.
- *
- *   0     read the LDT
- *   1     write an entry, old mode
- *   2     read the default LDT
- *   0x11  write an entry, new mode
- *
- * Any other func yields -ENOSYS.
- */
-asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
-{
-       if (func == 0)
-               return read_ldt(ptr, bytecount);
-       if (func == 1)
-               return write_ldt(ptr, bytecount, 1);
-       if (func == 2)
-               return read_default_ldt(ptr, bytecount);
-       if (func == 0x11)
-               return write_ldt(ptr, bytecount, 0);
-
-       return -ENOSYS;
-}
diff --git a/arch/x86_64/kernel/machine_kexec_64.c b/arch/x86_64/kernel/machine_kexec_64.c
deleted file mode 100644 (file)
index c3a5547..0000000
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * machine_kexec.c - handle transition of Linux booting another kernel
- * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2.  See the file COPYING for more details.
- */
-
-#include <linux/mm.h>
-#include <linux/kexec.h>
-#include <linux/string.h>
-#include <linux/reboot.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/io.h>
-
-/* Force page alignment on the statically allocated tables below. */
-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-/*
- * Statically allocated, page-aligned tables of 512 u64 entries each.
- * machine_kexec() passes both their virtual and physical addresses to
- * relocate_kernel() through page_list[].
- */
-static u64 kexec_pgd[512] PAGE_ALIGNED;
-static u64 kexec_pud0[512] PAGE_ALIGNED;
-static u64 kexec_pmd0[512] PAGE_ALIGNED;
-static u64 kexec_pte0[512] PAGE_ALIGNED;
-static u64 kexec_pud1[512] PAGE_ALIGNED;
-static u64 kexec_pmd1[512] PAGE_ALIGNED;
-static u64 kexec_pte1[512] PAGE_ALIGNED;
-
-/*
- * Fill one level-2 (pmd) table with large-page, executable kernel
- * mappings covering PUD_SIZE bytes starting at the page-aligned addr;
- * each entry maps the address it is built from.
- */
-static void init_level2_page(pmd_t *level2p, unsigned long addr)
-{
-       unsigned long limit;
-
-       addr &= PAGE_MASK;
-       limit = addr + PUD_SIZE;
-
-       for (; addr < limit; addr += PMD_SIZE)
-               set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
-}
-
-/*
- * Fill one level-3 (pud) table covering PGDIR_SIZE bytes from addr.
- * Every entry below last_addr gets a freshly allocated control page
- * initialised by init_level2_page(); the remaining entries are cleared.
- * Returns 0, or -ENOMEM if a control page cannot be allocated.
- */
-static int init_level3_page(struct kimage *image, pud_t *level3p,
-                               unsigned long addr, unsigned long last_addr)
-{
-       unsigned long limit;
-
-       addr &= PAGE_MASK;
-       limit = addr + PGDIR_SIZE;
-
-       for (; (addr < last_addr) && (addr < limit); addr += PUD_SIZE) {
-               struct page *page;
-               pmd_t *level2p;
-
-               page = kimage_alloc_control_pages(image, 0);
-               if (!page)
-                       return -ENOMEM;
-
-               level2p = (pmd_t *)page_address(page);
-               init_level2_page(level2p, addr);
-               set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
-       }
-
-       /* clear the unused entries */
-       for (; addr < limit; addr += PUD_SIZE)
-               pud_clear(level3p++);
-
-       return 0;
-}
-
-
-/*
- * Fill the level-4 (pgd) table covering PTRS_PER_PGD * PGDIR_SIZE bytes
- * from addr.  Every entry below last_addr gets a freshly allocated
- * control page initialised by init_level3_page(); the remaining entries
- * are cleared.  Returns 0, -ENOMEM if a control page cannot be
- * allocated, or the error from init_level3_page().
- */
-static int init_level4_page(struct kimage *image, pgd_t *level4p,
-                               unsigned long addr, unsigned long last_addr)
-{
-       unsigned long limit;
-
-       addr &= PAGE_MASK;
-       limit = addr + (PTRS_PER_PGD * PGDIR_SIZE);
-
-       for (; (addr < last_addr) && (addr < limit); addr += PGDIR_SIZE) {
-               struct page *page;
-               pud_t *level3p;
-               int err;
-
-               page = kimage_alloc_control_pages(image, 0);
-               if (!page)
-                       return -ENOMEM;
-
-               level3p = (pud_t *)page_address(page);
-               err = init_level3_page(image, level3p, addr, last_addr);
-               if (err)
-                       return err;
-
-               set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
-       }
-
-       /* clear the unused entries */
-       for (; addr < limit; addr += PGDIR_SIZE)
-               pgd_clear(level4p++);
-
-       return 0;
-}
-
-
-/*
- * Build the page table rooted at physical address start_pgtable so that
- * it covers addresses 0 .. end_pfn << PAGE_SHIFT.  Returns the result
- * of init_level4_page().
- */
-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
-{
-       pgd_t *top = (pgd_t *)__va(start_pgtable);
-       unsigned long mem_end = end_pfn << PAGE_SHIFT;
-
-       return init_level4_page(image, top, 0, mem_end);
-}
-
-/*
- * Load the IDT register from a descriptor built on the stack.
- *
- * @newidt: base address to load
- * @limit:  table limit to load
- */
-static void set_idt(void *newidt, u16 limit)
-{
-       struct desc_ptr curidt;
-
-       /* x86-64 supports unaligned loads & stores */
-       curidt.size    = limit;
-       curidt.address = (unsigned long)newidt;
-
-       __asm__ __volatile__ (
-               "lidtq %0\n"
-               : : "m" (curidt)
-               );
-}
-
-
-/*
- * Load the GDT register from a descriptor built on the stack.
- *
- * @newgdt: base address to load
- * @limit:  table limit to load
- */
-static void set_gdt(void *newgdt, u16 limit)
-{
-       struct desc_ptr curgdt;
-
-       /* x86-64 supports unaligned loads & stores */
-       curgdt.size    = limit;
-       curgdt.address = (unsigned long)newgdt;
-
-       __asm__ __volatile__ (
-               "lgdtq %0\n"
-               : : "m" (curgdt)
-               );
-}
-
-/*
- * Load the __KERNEL_DS selector into %ds, %es, %ss, %fs and %gs.
- * machine_kexec() calls this before it invalidates the GDT.
- */
-static void load_segments(void)
-{
-       /* "a" constraint: the selector is supplied in %eax as operand 0. */
-       __asm__ __volatile__ (
-               "\tmovl %0,%%ds\n"
-               "\tmovl %0,%%es\n"
-               "\tmovl %0,%%ss\n"
-               "\tmovl %0,%%fs\n"
-               "\tmovl %0,%%gs\n"
-               : : "a" (__KERNEL_DS) : "memory"
-               );
-}
-
-/*
- * Prepare an image for kexec: build the identity-mapped 64bit page
- * table in the image's control code page.  Returns 0 on success or the
- * error from init_pgtable().
- */
-int machine_kexec_prepare(struct kimage *image)
-{
-       unsigned long pgtable_phys;
-
-       /* physical address of the control code page */
-       pgtable_phys = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
-
-       return init_pgtable(image, pgtable_phys);
-}
-
-/* Intentionally a no-op: this implementation does nothing with image. */
-void machine_kexec_cleanup(struct kimage *image)
-{
-}
-
-/*
- * Do not allocate memory (or fail in any way) in machine_kexec().
- * We are past the point of no return, committed to rebooting now.
- *
- * Sequence: disable interrupts, copy relocate_kernel into the page
- * following the control code page, fill page_list[] with the addresses
- * that code is given, load the segment registers, invalidate the GDT
- * and IDT, and finally call relocate_kernel().
- */
-NORET_TYPE void machine_kexec(struct kimage *image)
-{
-       unsigned long page_list[PAGES_NR];
-       void *control_page;
-
-       /* Interrupts aren't acceptable while we reboot */
-       local_irq_disable();
-
-       /* The relocation code goes into the page after the control code page. */
-       control_page = page_address(image->control_code_page) + PAGE_SIZE;
-       memcpy(control_page, relocate_kernel, PAGE_SIZE);
-
-       /* Each PA_x / VA_x pair is the physical / virtual address of one object. */
-       page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
-       page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
-       page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
-       page_list[VA_PGD] = (unsigned long)kexec_pgd;
-       page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
-       page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
-       page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
-       page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
-       page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
-       page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
-       page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
-       page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
-       page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
-       page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-       page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
-       page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
-
-       /* Physical address of the control code page itself. */
-       page_list[PA_TABLE_PAGE] =
-         (unsigned long)__pa(page_address(image->control_code_page));
-
-       /* The segment registers are funny things, they have both a
-        * visible and an invisible part.  Whenever the visible part is
-        * set to a specific selector, the invisible part is loaded
-        * from a table in memory.  At no other time is the
-        * descriptor table in memory accessed.
-        *
-        * I take advantage of this here by force loading the
-        * segments, before I zap the gdt with an invalid value.
-        */
-       load_segments();
-       /* The gdt & idt are now invalid.
-        * If you want to load them you must set up your own idt & gdt.
-        */
-       set_gdt(phys_to_virt(0),0);
-       set_idt(phys_to_virt(0),0);
-
-       /* now call it */
-       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-                       image->start);
-}
-
-/*
- * crashkernel=size@addr specifies the location to reserve for
- * a crash kernel.  By reserving this memory we guarantee
- * that linux never sets it up as a DMA target.
- * Useful for holding code to do something appropriate
- * after a kernel panic.
- *
- * Returns -EINVAL when the argument is missing or does not start with
- * a size, 0 otherwise.  crashk_res is only filled in when the "@addr"
- * part is present.
- */
-static int __init setup_crashkernel(char *arg)
-{
-       unsigned long size, base;
-       char *rest;
-
-       if (!arg)
-               return -EINVAL;
-
-       size = memparse(arg, &rest);
-       if (rest == arg)
-               return -EINVAL;
-
-       if (*rest != '@')
-               return 0;
-
-       base = memparse(rest+1, &rest);
-       /* FIXME: Do I want a sanity check to validate the
-        * memory range?  Yes you do, but it's too early for
-        * e820 -AK */
-       crashk_res.start = base;
-       crashk_res.end   = base + size - 1;
-
-       return 0;
-}
-early_param("crashkernel", setup_crashkernel);
-
diff --git a/arch/x86_64/kernel/mce_64.c b/arch/x86_64/kernel/mce_64.c
deleted file mode 100644 (file)
index a66d607..0000000
+++ /dev/null
@@ -1,875 +0,0 @@
-/*
- * Machine check handler.
- * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Rest from unknown author(s). 
- * 2004 Andi Kleen. Rewrote most of it. 
- */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/rcupdate.h>
-#include <linux/kallsyms.h>
-#include <linux/sysdev.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/percpu.h>
-#include <linux/poll.h>
-#include <linux/thread_info.h>
-#include <linux/ctype.h>
-#include <linux/kmod.h>
-#include <linux/kdebug.h>
-#include <asm/processor.h> 
-#include <asm/msr.h>
-#include <asm/mce.h>
-#include <asm/uaccess.h>
-#include <asm/smp.h>
-#include <asm/idle.h>
-
-#define MISC_MCELOG_MINOR 227
-#define NR_BANKS 6
-
-/* Number of do_machine_check() invocations currently in progress. */
-atomic_t mce_entry;
-
-/* Set by "nomce" / "mce=off"; makes mcheck_init() skip MCE setup. */
-static int mce_dont_init;
-
-/*
- * Tolerant levels:
- *   0: always panic on uncorrected errors, log corrected errors
- *   1: panic or SIGBUS on uncorrected errors, log corrected errors
- *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- *   3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant = 1;
-/* Banks in use: low byte of MCG_CAP, clamped to NR_BANKS by mce_init(). */
-static int banks;
-/*
- * Per-bank value written to MCi_CTL by mce_init(); a zero entry makes
- * do_machine_check() skip that bank.
- */
-static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
-/* Bit 0 is set by mce_log() and consumed by mce_notify_user(). */
-static unsigned long notify_user;
-/* MSR to read the RIP from; 0 means use pt_regs only (see mce_get_rip()). */
-static int rip_msr;
-/* Non-zero: log machine checks left over from before boot (see mce_init()). */
-static int mce_bootlog = 1;
-/* Incremented for every record handed to mce_log(). */
-static atomic_t mce_events;
-
-/* User-mode helper run by mce_notify_user() when non-empty; set via sysfs. */
-static char trigger[128];
-static char *trigger_argv[2] = { trigger, NULL };
-
-/* Woken by mce_notify_user(); mce_poll() waits on it. */
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
-
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-/*
- * Positional initializers; presumably the signature and length fields of
- * struct mce_log -- confirm against asm/mce.h.
- */
-struct mce_log mcelog = { 
-       MCE_LOG_SIGNATURE,
-       MCE_LOG_LEN,
-}; 
-
-/*
- * Append one record to the global mcelog buffer without taking a lock.
- *
- * A slot is reserved by advancing mcelog.next with cmpxchg(); the record is
- * then copied in and only afterwards marked finished, so a reader can tell
- * a reserved-but-incomplete slot from a complete one.  When the buffer is
- * full the record is dropped and MCE_OVERFLOW is set in mcelog.flags.
- *
- * Note that the caller's record is modified: mce->finished is cleared.
- */
-void mce_log(struct mce *mce)
-{
-       unsigned next, entry;
-       atomic_inc(&mce_events);
-       mce->finished = 0;
-       wmb();
-       for (;;) {
-               entry = rcu_dereference(mcelog.next);
-               /* The rmb forces the compiler to reload next in each
-                   iteration */
-               rmb();
-               for (;;) {
-                       /* When the buffer fills up discard new entries. Assume
-                          that the earlier errors are the more interesting. */
-                       if (entry >= MCE_LOG_LEN) {
-                               set_bit(MCE_OVERFLOW, &mcelog.flags);
-                               return;
-                       }
-                       /* Old left over entry. Skip. */
-                       if (mcelog.entry[entry].finished) {
-                               entry++;
-                               continue;
-                       }
-                       break;
-               }
-               smp_rmb();
-               next = entry + 1;
-               /* Claim the slot; retry from the top if another writer won. */
-               if (cmpxchg(&mcelog.next, entry, next) == entry)
-                       break;
-       }
-       memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
-       /* Publish the payload before flagging the slot as complete. */
-       wmb();
-       mcelog.entry[entry].finished = 1;
-       wmb();
-
-       /* Picked up later by mce_notify_user(). */
-       set_bit(0, &notify_user);
-}
-
-/* Dump one machine check record to the console at KERN_EMERG level. */
-static void print_mce(struct mce *m)
-{
-       printk(KERN_EMERG "\n"
-              KERN_EMERG "HARDWARE ERROR\n"
-              KERN_EMERG
-              "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
-              m->cpu, m->mcgstatus, m->bank, m->status);
-
-       if (m->rip) {
-               const char *exact = "";
-
-               /* Flag the RIP as inexact when EIPV is clear. */
-               if (!(m->mcgstatus & MCG_STATUS_EIPV))
-                       exact = " !INEXACT!";
-
-               printk(KERN_EMERG
-                      "RIP%s %02x:<%016Lx> ",
-                      exact, m->cs, m->rip);
-               if (m->cs == __KERNEL_CS)
-                       print_symbol("{%s}", m->rip);
-               printk("\n");
-       }
-
-       /* TSC is always printed; ADDR and MISC only when non-zero. */
-       printk(KERN_EMERG "TSC %Lx ", m->tsc);
-       if (m->addr)
-               printk("ADDR %Lx ", m->addr);
-       if (m->misc)
-               printk("MISC %Lx ", m->misc);
-       printk("\n");
-
-       printk(KERN_EMERG "This is not a software problem!\n");
-       printk(KERN_EMERG
-    "Run through mcelog --ascii to decode and contact your hardware vendor\n");
-}
-
-/*
- * Print every logged record whose TSC is not before @start, then panic.
- *
- * @msg:    panic message (printed verbatim).
- * @backup: record to print if it was not found in the log (matched by TSC);
- *          may be NULL.
- * @start:  TSC value taken when the current handler invocation began.
- */
-static void mce_panic(char *msg, struct mce *backup, unsigned long start)
-{
-       int i;
-
-       oops_begin();
-       for (i = 0; i < MCE_LOG_LEN; i++) {
-               unsigned long tsc = mcelog.entry[i].tsc;
-               if (time_before(tsc, start))
-                       continue;
-               print_mce(&mcelog.entry[i]);
-               /* Already printed from the log; don't print it twice. */
-               if (backup && mcelog.entry[i].tsc == backup->tsc)
-                       backup = NULL;
-       }
-       if (backup)
-               print_mce(backup);
-       /* Never hand a variable string to panic() as the format. */
-       panic("%s", msg);
-}
-
-/* Non-zero when the CPU advertises both the MCE and the MCA feature. */
-static int mce_available(struct cpuinfo_x86 *c)
-{
-       if (!cpu_has(c, X86_FEATURE_MCE))
-               return 0;
-       return cpu_has(c, X86_FEATURE_MCA) ? 1 : 0;
-}
-
-/*
- * Fill in m->rip / m->cs.  The values from @regs are used only when @regs
- * is non-NULL and MCG_STATUS_RIPV is set; a configured rip_msr overrides
- * them and marks the RIP as exact.
- */
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
-       int regs_usable = regs && (m->mcgstatus & MCG_STATUS_RIPV);
-
-       m->rip = regs_usable ? regs->rip : 0;
-       m->cs = regs_usable ? regs->cs : 0;
-
-       if (!rip_msr)
-               return;
-
-       /* Assume the RIP in the MSR is exact. Is this true? */
-       m->mcgstatus |= MCG_STATUS_EIPV;
-       rdmsrl(rip_msr, m->rip);
-       m->cs = 0;
-}
-
-/* 
- * The actual machine check handler
- */
-
-/*
- * Scan every enabled bank for a valid machine check record, log what is
- * found and decide whether to continue, kill the current task or panic.
- *
- * @regs:       register state at the exception; NULL when called from the
- *              polling timer or from mce_init(), in which case nothing
- *              final (panic/kill) is done.
- * @error_code: >= 0 stamps each record with the current TSC; -2 suppresses
- *              logging of the records found (mce_init() passes -1 or -2
- *              depending on mce_bootlog).
- */
-void do_machine_check(struct pt_regs * regs, long error_code)
-{
-       struct mce m, panicm;
-       u64 mcestart = 0;
-       int i;
-       int panicm_found = 0;
-       /*
-        * If no_way_out gets set, there is no safe way to recover from this
-        * MCE.  If tolerant is cranked up, we'll try anyway.
-        */
-       int no_way_out = 0;
-       /*
-        * If kill_it gets set, there might be a way to recover from this
-        * error.
-        */
-       int kill_it = 0;
-
-       atomic_inc(&mce_entry);
-
-       if (regs)
-               notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
-       /* banks is still 0 until mce_init() has read MCG_CAP. */
-       if (!banks)
-               goto out2;
-
-       memset(&m, 0, sizeof(struct mce));
-       m.cpu = smp_processor_id();
-       rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
-       /* if the restart IP is not valid, we're done for */
-       if (!(m.mcgstatus & MCG_STATUS_RIPV))
-               no_way_out = 1;
-       
-       /* Start stamp; mce_panic() prints only records not older than this. */
-       rdtscll(mcestart);
-       barrier();
-
-       for (i = 0; i < banks; i++) {
-               /* A zero control mask means the bank is switched off. */
-               if (!bank[i])
-                       continue;
-               
-               m.misc = 0; 
-               m.addr = 0;
-               m.bank = i;
-               m.tsc = 0;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
-               if ((m.status & MCI_STATUS_VAL) == 0)
-                       continue;
-
-               if (m.status & MCI_STATUS_EN) {
-                       /* if PCC was set, there's no way out */
-                       no_way_out |= !!(m.status & MCI_STATUS_PCC);
-                       /*
-                        * If this error was uncorrectable and there was
-                        * an overflow, we're in trouble.  If no overflow,
-                        * we might get away with just killing a task.
-                        */
-                       if (m.status & MCI_STATUS_UC) {
-                               if (tolerant < 1 || m.status & MCI_STATUS_OVER)
-                                       no_way_out = 1;
-                               kill_it = 1;
-                       }
-               }
-
-               if (m.status & MCI_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
-               if (m.status & MCI_STATUS_ADDRV)
-                       rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
-
-               mce_get_rip(&m, regs);
-               /* Negative error codes (boot-time scan) leave tsc at 0. */
-               if (error_code >= 0)
-                       rdtscll(m.tsc);
-               if (error_code != -2)
-                       mce_log(&m);
-
-               /* Did this bank cause the exception? */
-               /* Assume that the bank with uncorrectable errors did it,
-                  and that there is only a single one. */
-               if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
-                       panicm = m;
-                       panicm_found = 1;
-               }
-
-               add_taint(TAINT_MACHINE_CHECK);
-       }
-
-       /* Never do anything final in the polling timer */
-       if (!regs)
-               goto out;
-
-       /* If we didn't find an uncorrectable error, pick
-          the last one (shouldn't happen, just being safe). */
-       if (!panicm_found)
-               panicm = m;
-
-       /*
-        * If we have decided that we just CAN'T continue, and the user
-        *  has not set tolerant to an insane level, give up and die.
-        */
-       if (no_way_out && tolerant < 3)
-               mce_panic("Machine check", &panicm, mcestart);
-
-       /*
-        * If the error seems to be unrecoverable, something should be
-        * done.  Try to kill as little as possible.  If we can kill just
-        * one task, do that.  If the user has set the tolerance very
-        * high, don't try to do anything at all.
-        */
-       if (kill_it && tolerant < 3) {
-               int user_space = 0;
-
-               /*
-                * If the EIPV bit is set, it means the saved IP is the
-                * instruction which caused the MCE.
-                */
-               if (m.mcgstatus & MCG_STATUS_EIPV)
-                       user_space = panicm.rip && (panicm.cs & 3);
-
-               /*
-                * If we know that the error was in user space, send a
-                * SIGBUS.  Otherwise, panic if tolerance is low.
-                *
-                * do_exit() takes an awful lot of locks and has a slight
-                * risk of deadlocking.
-                */
-               if (user_space) {
-                       do_exit(SIGBUS);
-               } else if (panic_on_oops || tolerant < 2) {
-                       mce_panic("Uncorrected machine check",
-                               &panicm, mcestart);
-               }
-       }
-
-       /* notify userspace ASAP */
-       set_thread_flag(TIF_MCE_NOTIFY);
-
- out:
-       /* the last thing we do is clear state */
-       for (i = 0; i < banks; i++)
-               wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
-       wrmsrl(MSR_IA32_MCG_STATUS, 0);
- out2:
-       atomic_dec(&mce_entry);
-}
-
-#ifdef CONFIG_X86_MCE_INTEL
-/***
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * Meant to be called by the thermal interrupt once the event has been
- * processed and it has been decided that it should be logged further.
- *
- * @status is stored in the 'status' field of 'struct mce'; historically
- * it has been the register value of the MSR_IA32_THERMAL_STATUS (Intel)
- * msr.  The record is filed under MCE_THERMAL_BANK and stamped with the
- * current TSC.
- */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
-{
-       struct mce rec;
-
-       memset(&rec, 0, sizeof(struct mce));
-       rec.status = status;
-       rec.bank = MCE_THERMAL_BANK;
-       rec.cpu = cpu;
-       rdtscll(rec.tsc);
-       mce_log(&rec);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
-/*
- * Periodic polling timer for "silent" machine check errors.  If the
- * poller finds an MCE, poll 2x faster.  When the poller finds no more
- * errors, poll 2x slower (up to check_interval seconds).
- */
-
-/*
- * check_interval is in seconds (it is multiplied by HZ where used); a
- * value of 0 keeps the poller from being scheduled.
- */
-static int check_interval = 5 * 60; /* 5 minutes */
-static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
-
-/* Per-CPU poll: run the handler without register state (info is unused). */
-static void mcheck_check_cpu(void *info)
-{
-       if (!mce_available(&current_cpu_data))
-               return;
-       do_machine_check(NULL, 0);
-}
-
-/* Delayed-work body: poll all CPUs, adapt the interval and re-arm. */
-static void mcheck_timer(struct work_struct *work)
-{
-       int logged;
-
-       on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
-
-       /*
-        * Alert userspace if needed.  Something was logged: poll twice as
-        * fast (not below HZ/100).  Nothing logged: back off towards
-        * check_interval.
-        */
-       logged = mce_notify_user();
-       if (logged)
-               next_interval = max(next_interval/2, HZ/100);
-       else
-               next_interval = min(next_interval*2,
-                               (int)round_jiffies_relative(check_interval*HZ));
-
-       schedule_delayed_work(&mcheck_work, next_interval);
-}
-
-/*
- * This is only called from process context.  This is where we do
- * anything we need to alert userspace about new MCEs.  This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
- */
-/*
- * Returns 1 if new events were pending (readers woken, trigger helper
- * started, rate-limited log line printed), 0 otherwise.
- */
-int mce_notify_user(void)
-{
-       static unsigned long last_print;
-       unsigned long now;
-
-       clear_thread_flag(TIF_MCE_NOTIFY);
-       if (!test_and_clear_bit(0, &notify_user))
-               return 0;
-
-       now = jiffies;
-
-       wake_up_interruptible(&mce_wait);
-       if (trigger[0])
-               call_usermodehelper(trigger, trigger_argv, NULL,
-                                       UMH_NO_WAIT);
-
-       /* At most one console line per check_interval seconds. */
-       if (time_after_eq(now, last_print + (check_interval*HZ))) {
-               last_print = now;
-               printk(KERN_INFO "Machine check events logged\n");
-       }
-
-       return 1;
-}
-
-/* Idle notifier: on leaving idle, deliver a pending userspace notification */
-static int
-mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
-{
-       /* IDLE_END should be safe - interrupts are back on */
-       if (action != IDLE_END)
-               return NOTIFY_OK;
-
-       if (test_thread_flag(TIF_MCE_NOTIFY))
-               mce_notify_user();
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block mce_idle_notifier = {
-       .notifier_call = mce_idle_callback,
-};
-
-/* Arm the polling work (unless the interval is 0) and hook the idle path. */
-static __init int periodic_mcheck_init(void)
-{
-       next_interval = check_interval * HZ;
-       if (next_interval != 0) {
-               unsigned long delay = round_jiffies_relative(next_interval);
-
-               schedule_delayed_work(&mcheck_work, delay);
-       }
-       idle_notifier_register(&mce_idle_notifier);
-       return 0;
-}
-__initcall(periodic_mcheck_init);
-
-
-/* 
- * Initialize Machine Checks for a CPU.
- */
-/*
- * Program the machine check hardware of the calling CPU.
- *
- * @dummy: unused; present so the function can be passed to on_each_cpu().
- *
- * Reads the bank count from MCG_CAP (clamped to NR_BANKS), logs and clears
- * whatever the banks still hold, enables CR4.MCE and writes the per-bank
- * control masks from bank[].
- */
-static void mce_init(void *dummy)
-{
-       u64 cap;
-       int i;
-
-       rdmsrl(MSR_IA32_MCG_CAP, cap);
-       banks = cap & 0xff;
-       if (banks > NR_BANKS) {
-               /* Clamp first so the message reports the count really used. */
-               banks = NR_BANKS;
-               printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
-       }
-       /* Use accurate RIP reporting if available. */
-       if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
-               rip_msr = MSR_IA32_MCG_EIP;
-
-       /* Log the machine checks left over from the previous reset.
-          This also clears all registers */
-       do_machine_check(NULL, mce_bootlog ? -1 : -2);
-
-       set_in_cr4(X86_CR4_MCE);
-
-       if (cap & MCG_CTL_P)
-               wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
-
-       for (i = 0; i < banks; i++) {
-               wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-               wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
-       }
-}
-
-/* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
-{
-       /* The only quirks so far are for AMD family 15. */
-       if (c->x86_vendor != X86_VENDOR_AMD || c->x86 != 15)
-               return;
-
-       /* This should be disabled by the BIOS, but isn't always:
-          disable GART TBL walk error reporting, which trips off
-          incorrectly with the IOMMU & 3ware & Cerberus. */
-       clear_bit(10, &bank[4]);
-
-       /* Lots of broken BIOS around that don't clear them
-          by default and leave crap in there. Don't log. */
-       mce_bootlog = 0;
-}
-
-/* Hand over to the vendor specific feature setup; other vendors: no-op. */
-static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
-{
-       if (c->x86_vendor == X86_VENDOR_INTEL)
-               mce_intel_feature_init(c);
-       else if (c->x86_vendor == X86_VENDOR_AMD)
-               mce_amd_feature_init(c);
-}
-
-/* 
- * Called for each booted CPU to set up machine checks.
- * Must be called with preempt off. 
- */
-void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
-{
-       /* CPUs that already went through the setup below. */
-       static cpumask_t mce_cpus = CPU_MASK_NONE;
-
-       /* Quirks are applied even when the rest is skipped. */
-       mce_cpu_quirks(c);
-
-       if (mce_dont_init)
-               return;
-       if (cpu_test_and_set(smp_processor_id(), mce_cpus))
-               return;
-       if (!mce_available(c))
-               return;
-
-       mce_init(NULL);
-       mce_cpu_features(c);
-}
-
-/*
- * Character device to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count; /* #times opened */
-static int open_exclu; /* already open exclusive? */
-
-/*
- * open() handler.  Fails with -EBUSY if the device is already open
- * exclusively, or if O_EXCL is requested while it is open at all.
- */
-static int mce_open(struct inode *inode, struct file *file)
-{
-       int want_excl = file->f_flags & O_EXCL;
-       int busy;
-
-       spin_lock(&mce_state_lock);
-       busy = open_exclu || (open_count && want_excl);
-       if (!busy) {
-               if (want_excl)
-                       open_exclu = 1;
-               open_count++;
-       }
-       spin_unlock(&mce_state_lock);
-
-       if (busy)
-               return -EBUSY;
-
-       return nonseekable_open(inode, file);
-}
-
-/* release() handler: drop one opener and clear the exclusive marker. */
-static int mce_release(struct inode *inode, struct file *file)
-{
-       spin_lock(&mce_state_lock);
-       open_exclu = 0;
-       open_count--;
-       spin_unlock(&mce_state_lock);
-
-       return 0;
-}
-
-/* on_each_cpu() callback: store this CPU's TSC in its slot of the array. */
-static void collect_tscs(void *data)
-{
-       unsigned long *slots = data;
-
-       rdtscll(slots[smp_processor_id()]);
-}
-
-/*
- * read() handler for the mcelog device: copy out all logged records and
- * clear the log.
- *
- * Only a read from offset 0 with room for MCE_LOG_LEN records is accepted
- * (-EINVAL otherwise).  Returns the number of bytes copied, -ENOMEM if the
- * per-CPU TSC scratch array cannot be allocated, or -EFAULT if any copy to
- * user space failed.
- */
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
-{
-       unsigned long *cpu_tsc;
-       static DECLARE_MUTEX(mce_read_sem);
-       unsigned next;
-       char __user *buf = ubuf;
-       int i, err;
-
-       cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
-       if (!cpu_tsc)
-               return -ENOMEM;
-
-       /* Serializes concurrent readers. */
-       down(&mce_read_sem); 
-       next = rcu_dereference(mcelog.next);
-
-       /* Only supports full reads right now */
-       if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 
-               up(&mce_read_sem);
-               kfree(cpu_tsc);
-               return -EINVAL;
-       }
-
-       err = 0;
-       for (i = 0; i < next; i++) {            
-               unsigned long start = jiffies;
-               /*
-                * Slot reserved but not yet marked finished: wait up to two
-                * jiffies for the writer, then wipe the slot and skip it.
-                */
-               while (!mcelog.entry[i].finished) {
-                       if (time_after_eq(jiffies, start + 2)) {
-                               memset(mcelog.entry + i,0, sizeof(struct mce));
-                               goto timeout;
-                       }
-                       cpu_relax();
-               }
-               smp_rmb();
-               err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-               buf += sizeof(struct mce); 
- timeout:
-               ;
-       } 
-
-       /* Everything below 'next' has been handled; reset the log. */
-       memset(mcelog.entry, 0, next * sizeof(struct mce));
-       mcelog.next = 0;
-
-       synchronize_sched();
-
-       /* Collect entries that were still getting written before the synchronize. */
-
-       on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
-       for (i = next; i < MCE_LOG_LEN; i++) { 
-               /* Only records stamped before the TSC snapshot of their CPU. */
-               if (mcelog.entry[i].finished && 
-                   mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {  
-                       err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
-                       smp_rmb();
-                       buf += sizeof(struct mce);
-                       memset(&mcelog.entry[i], 0, sizeof(struct mce));
-               }
-       }       
-       up(&mce_read_sem);
-       kfree(cpu_tsc);
-       return err ? -EFAULT : buf - ubuf; 
-}
-
-/* poll() handler: readable as soon as mcelog.next is non-zero. */
-static unsigned int mce_poll(struct file *file, poll_table *wait)
-{
-       unsigned int mask = 0;
-
-       poll_wait(file, &mce_wait, wait);
-       if (rcu_dereference(mcelog.next))
-               mask = POLLIN | POLLRDNORM;
-
-       return mask;
-}
-
-/*
- * ioctl() handler (CAP_SYS_ADMIN only, else -EPERM).  Reports the record
- * size, the log length, or fetches and clears mcelog.flags; any other
- * command yields -ENOTTY.
- */
-static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
-{
-       int __user *p = (int __user *)arg;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if (cmd == MCE_GET_RECORD_LEN)
-               return put_user(sizeof(struct mce), p);
-
-       if (cmd == MCE_GET_LOG_LEN)
-               return put_user(MCE_LOG_LEN, p);
-
-       if (cmd == MCE_GETCLEAR_FLAGS) {
-               unsigned flags;
-
-               /* Swap the flags for 0, retrying if they changed meanwhile. */
-               for (;;) {
-                       flags = mcelog.flags;
-                       if (cmpxchg(&mcelog.flags, flags, 0) == flags)
-                               break;
-               }
-               return put_user(flags, p);
-       }
-
-       return -ENOTTY;
-}
-
-/* File operations of the mcelog character device. */
-static const struct file_operations mce_chrdev_ops = {
-       .open = mce_open,
-       .release = mce_release,
-       .read = mce_read,
-       .poll = mce_poll,
-       .ioctl = mce_ioctl,
-};
-
-/*
- * Misc device registered by mce_init_device().  Positional initializers;
- * presumably minor, name and fops of struct miscdevice -- confirm against
- * linux/miscdevice.h.
- */
-static struct miscdevice mce_log_device = {
-       MISC_MCELOG_MINOR,
-       "mcelog",
-       &mce_chrdev_ops,
-};
-
-/* CR4 value saved by stop_mce() for restart_mce(). */
-static unsigned long old_cr4 __initdata;
-
-/* Remember CR4 and turn CR4.MCE off. */
-void __init stop_mce(void)
-{
-       old_cr4 = read_cr4();
-       clear_in_cr4(X86_CR4_MCE);
-}
-
-/* Turn CR4.MCE back on, but only if it was set when stop_mce() ran. */
-void __init restart_mce(void)
-{
-       if (!(old_cr4 & X86_CR4_MCE))
-               return;
-       set_in_cr4(X86_CR4_MCE);
-}
-
-/* 
- * Old style boot options parsing. Only for compatibility. 
- */
-
-/* "nomce": skip machine check setup in mcheck_init(); str is unused. */
-static int __init mcheck_disable(char *str)
-{
-       mce_dont_init = 1;
-       return 1;
-}
-
-/* mce=off disables machine check. Note you can reenable it later
-   using sysfs.
-   mce=TOLERANCELEVEL (number, see above)
-   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
-   mce=nobootlog Don't log MCEs from before booting. */
-/*
- * Handler for the "mce" boot option; @str is the text after "mce" and may
- * start with '='.  Always returns 1.
- */
-static int __init mcheck_enable(char *str)
-{
-       if (*str == '=')
-               str++;
-       if (!strcmp(str, "off"))
-               mce_dont_init = 1;
-       else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
-               /* "bootlog" starts with 'b', "nobootlog" does not. */
-               mce_bootlog = str[0] == 'b';
-       else if (isdigit(str[0]))
-               get_option(&str, &tolerant);
-       else
-               /* Give the message a log level and terminate the line. */
-               printk(KERN_INFO
-                      "mce= argument %s ignored. Please use /sys\n", str);
-       return 1;
-}
-
-__setup("nomce", mcheck_disable);
-__setup("mce", mcheck_enable);
-
-/* 
- * Sysfs support
- */ 
-
-/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
-   Only one CPU is active at this time, the others get readded later using
-   CPU hotplug. */
-/* sysdev resume hook: reprogram the current CPU via mce_init(); returns 0. */
-static int mce_resume(struct sys_device *dev)
-{
-       mce_init(NULL);
-       return 0;
-}
-
-/* Reinit MCEs after user configuration changes */
-static void mce_restart(void)
-{
-       /* The work is only ever queued while next_interval is non-zero. */
-       if (next_interval)
-               cancel_delayed_work(&mcheck_work);
-
-       /* Timer race is harmless here */
-       on_each_cpu(mce_init, NULL, 1, 1);
-
-       next_interval = check_interval * HZ;
-       if (!next_interval)
-               return;
-
-       schedule_delayed_work(&mcheck_work,
-                             round_jiffies_relative(next_interval));
-}
-
-/* sysdev class "machinecheck"; its only callback is the resume hook. */
-static struct sysdev_class mce_sysclass = {
-       .resume = mce_resume,
-       set_kset_name("machinecheck"),
-};
-
-/* One sys_device per CPU, set up by mce_create_device(). */
-DEFINE_PER_CPU(struct sys_device, device_mce);
-
-/* Why are there no generic functions for this? */
-/*
- * ACCESSOR(name, var, start) generates show_<name>() and set_<name>() plus
- * the sysdev attribute attr_<name> (mode 0644):
- *   - show prints var as hex followed by a newline;
- *   - set parses a number with simple_strtoul() (base auto-detected),
- *     returns -EINVAL if nothing could be parsed, stores it in var, then
- *     runs the statement 'start' and returns the number of characters
- *     consumed.
- */
-#define ACCESSOR(name, var, start) \
-       static ssize_t show_ ## name(struct sys_device *s, char *buf) {                    \
-               return sprintf(buf, "%lx\n", (unsigned long)var);                  \
-       }                                                                          \
-       static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
-               char *end;                                                         \
-               unsigned long new = simple_strtoul(buf, &end, 0);                  \
-               if (end == buf) return -EINVAL;                                    \
-               var = new;                                                         \
-               start;                                                             \
-               return end-buf;                                                    \
-       }                                                                          \
-       static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-
-/* TBD should generate these dynamically based on number of available banks */
-/* Writing any bankNctl reprograms all CPUs through mce_restart(). */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
-
-/* sysfs show: the trigger path followed by a newline; returns its length. */
-static ssize_t show_trigger(struct sys_device *s, char *buf)
-{
-       return sprintf(buf, "%s\n", trigger);
-}
-
-/*
- * sysfs store: copy @buf into trigger (truncated to fit, always
- * NUL-terminated) and cut it at the first newline, if any.  Returns the
- * length of the stored string measured before the newline is removed.
- */
-static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
-{
-       char *p;
-       int len;
-       strncpy(trigger, buf, sizeof(trigger));
-       trigger[sizeof(trigger)-1] = 0;
-       len = strlen(trigger);
-       p = strchr(trigger, '\n');
-       /* strchr() returns NULL when the input carries no newline. */
-       if (p)
-               *p = 0;
-       return len;
-}
-
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-/* tolerant takes effect immediately; no restart statement is needed. */
-ACCESSOR(tolerant,tolerant,)
-ACCESSOR(check_interval,check_interval,mce_restart())
-/* NULL-terminated list of the attributes created for every CPU device. */
-static struct sysdev_attribute *mce_attributes[] = {
-       &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-       &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
-       &attr_tolerant, &attr_check_interval, &attr_trigger,
-       NULL
-};
-
-/* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
-/*
- * Register the sysdev of @cpu and create its attribute files.  Returns
- * -EIO if the CPU has no machine check support, otherwise the result of
- * sysdev_register().
- */
-static __cpuinit int mce_create_device(unsigned int cpu)
-{
-       struct sys_device *dev;
-       struct sysdev_attribute **attr;
-       int err;
-
-       if (!mce_available(&cpu_data[cpu]))
-               return -EIO;
-
-       dev = &per_cpu(device_mce, cpu);
-       dev->id = cpu;
-       dev->cls = &mce_sysclass;
-
-       err = sysdev_register(dev);
-       if (err)
-               return err;
-
-       for (attr = mce_attributes; *attr; attr++)
-               sysdev_create_file(dev, *attr);
-
-       return 0;
-}
-
-/* Undo mce_create_device(): remove files, unregister, wipe the kobject. */
-static void mce_remove_device(unsigned int cpu)
-{
-       struct sys_device *dev = &per_cpu(device_mce, cpu);
-       struct sysdev_attribute **attr;
-
-       for (attr = mce_attributes; *attr; attr++)
-               sysdev_remove_file(dev, *attr);
-
-       sysdev_unregister(dev);
-       memset(&dev->kobj, 0, sizeof(struct kobject));
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (unsigned long)hcpu;
-
-       /* Create the device when a CPU comes up, remove it once it is dead. */
-       if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
-               mce_create_device(cpu);
-       else if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
-               mce_remove_device(cpu);
-
-       return NOTIFY_OK;
-}
-
-static struct notifier_block mce_cpu_notifier = {
-       .notifier_call = mce_cpu_callback,
-};
-
-/*
- * Register the "machinecheck" sysdev class, one device per online CPU, the
- * CPU hotplug notifier and the mcelog misc device.
- *
- * Returns -EIO if the boot CPU has no machine check support, or the error
- * from sysdev_class_register(); in both cases nothing else is registered.
- */
-static __init int mce_init_device(void)
-{
-       int err;
-       int i = 0;
-
-       if (!mce_available(&boot_cpu_data))
-               return -EIO;
-       err = sysdev_class_register(&mce_sysclass);
-       /* Without the class there is nothing to hang the devices on. */
-       if (err)
-               return err;
-
-       for_each_online_cpu(i) {
-               mce_create_device(i);
-       }
-
-       register_hotcpu_notifier(&mce_cpu_notifier);
-       misc_register(&mce_log_device);
-       return err;
-}
-
-device_initcall(mce_init_device);
diff --git a/arch/x86_64/kernel/mce_amd_64.c b/arch/x86_64/kernel/mce_amd_64.c
deleted file mode 100644 (file)
index 2f8a7f1..0000000
+++ /dev/null
@@ -1,689 +0,0 @@
-/*
- *  (c) 2005, 2006 Advanced Micro Devices, Inc.
- *  Your use of this code is subject to the terms and conditions of the
- *  GNU general public license version 2. See "COPYING" or
- *  http://www.gnu.org/licenses/gpl.html
- *
- *  Written by Jacob Shin - AMD, Inc.
- *
- *  Support : jacob.shin@amd.com
- *
- *  April 2006
- *     - added support for AMD Family 0x10 processors
- *
- *  All MC4_MISCi registers are shared between multi-cores
- */
-
-#include <linux/cpu.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kobject.h>
-#include <linux/notifier.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/sysdev.h>
-#include <linux/sysfs.h>
-#include <asm/apic.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-#include <asm/percpu.h>
-#include <asm/idle.h>
-
-#define PFX               "mce_threshold: "
-#define VERSION           "version 1.1.1"
-#define NR_BANKS          6
-#define NR_BLOCKS         9
-#define THRESHOLD_MAX     0xFFF
-#define INT_TYPE_APIC     0x00020000
-#define MASK_VALID_HI     0x80000000
-#define MASK_CNTP_HI      0x40000000
-#define MASK_LOCKED_HI    0x20000000
-#define MASK_LVTOFF_HI    0x00F00000
-#define MASK_COUNT_EN_HI  0x00080000
-#define MASK_INT_TYPE_HI  0x00060000
-#define MASK_OVERFLOW_HI  0x00010000
-#define MASK_ERR_COUNT_HI 0x00000FFF
-#define MASK_BLKPTR_LO    0xFF000000
-#define MCG_XBLK_ADDR     0xC0000400
-
-struct threshold_block {
-       unsigned int block;
-       unsigned int bank;
-       unsigned int cpu;
-       u32 address;
-       u16 interrupt_enable;
-       u16 threshold_limit;
-       struct kobject kobj;
-       struct list_head miscj;
-};
-
-/* defaults used early on boot */
-static struct threshold_block threshold_defaults = {
-       .interrupt_enable = 0,
-       .threshold_limit = THRESHOLD_MAX,
-};
-
-struct threshold_bank {
-       struct kobject kobj;
-       struct threshold_block *blocks;
-       cpumask_t cpus;
-};
-static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
-
-#ifdef CONFIG_SMP
-static unsigned char shared_bank[NR_BANKS] = {
-       0, 0, 0, 0, 1
-};
-#endif
-
-static DEFINE_PER_CPU(unsigned char, bank_map);        /* see which banks are on */
-
-/*
- * CPU Initialization
- */
-
-/* must be called with correct cpu affinity */
-static void threshold_restart_bank(struct threshold_block *b,
-                                  int reset, u16 old_limit)
-{
-       u32 mci_misc_hi, mci_misc_lo;
-
-       rdmsr(b->address, mci_misc_lo, mci_misc_hi);
-
-       if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
-               reset = 1;      /* limit cannot be lower than err count */
-
-       if (reset) {            /* reset err count and overflow bit */
-               mci_misc_hi =
-                   (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
-                   (THRESHOLD_MAX - b->threshold_limit);
-       } else if (old_limit) { /* change limit w/o reset */
-               int new_count = (mci_misc_hi & THRESHOLD_MAX) +
-                   (old_limit - b->threshold_limit);
-               mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
-                   (new_count & THRESHOLD_MAX);
-       }
-
-       b->interrupt_enable ?
-           (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
-           (mci_misc_hi &= ~MASK_INT_TYPE_HI);
-
-       mci_misc_hi |= MASK_COUNT_EN_HI;
-       wrmsr(b->address, mci_misc_lo, mci_misc_hi);
-}
-
-/* cpu init entry point, called from mce.c with preempt off */
-void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
-{
-       unsigned int bank, block;
-       unsigned int cpu = smp_processor_id();
-       u32 low = 0, high = 0, address = 0;
-
-       for (bank = 0; bank < NR_BANKS; ++bank) {
-               for (block = 0; block < NR_BLOCKS; ++block) {
-                       if (block == 0)
-                               address = MSR_IA32_MC0_MISC + bank * 4;
-                       else if (block == 1) {
-                               address = (low & MASK_BLKPTR_LO) >> 21;
-                               if (!address)
-                                       break;
-                               address += MCG_XBLK_ADDR;
-                       }
-                       else
-                               ++address;
-
-                       if (rdmsr_safe(address, &low, &high))
-                               break;
-
-                       if (!(high & MASK_VALID_HI)) {
-                               if (block)
-                                       continue;
-                               else
-                                       break;
-                       }
-
-                       if (!(high & MASK_CNTP_HI)  ||
-                            (high & MASK_LOCKED_HI))
-                               continue;
-
-                       if (!block)
-                               per_cpu(bank_map, cpu) |= (1 << bank);
-#ifdef CONFIG_SMP
-                       if (shared_bank[bank] && c->cpu_core_id)
-                               break;
-#endif
-                       high &= ~MASK_LVTOFF_HI;
-                       high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
-                       wrmsr(address, low, high);
-
-                       setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
-                                               THRESHOLD_APIC_VECTOR,
-                                               K8_APIC_EXT_INT_MSG_FIX, 0);
-
-                       threshold_defaults.address = address;
-                       threshold_restart_bank(&threshold_defaults, 0, 0);
-               }
-       }
-}
-
-/*
- * APIC Interrupt Handler
- */
-
-/*
- * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
- * the interrupt goes off when error_count reaches threshold_limit.
- * the handler will simply log mcelog w/ software defined bank number.
- */
-asmlinkage void mce_threshold_interrupt(void)
-{
-       unsigned int bank, block;
-       struct mce m;
-       u32 low = 0, high = 0, address = 0;
-
-       ack_APIC_irq();
-       exit_idle();
-       irq_enter();
-
-       memset(&m, 0, sizeof(m));
-       rdtscll(m.tsc);
-       m.cpu = smp_processor_id();
-
-       /* assume first bank caused it */
-       for (bank = 0; bank < NR_BANKS; ++bank) {
-               if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
-                       continue;
-               for (block = 0; block < NR_BLOCKS; ++block) {
-                       if (block == 0)
-                               address = MSR_IA32_MC0_MISC + bank * 4;
-                       else if (block == 1) {
-                               address = (low & MASK_BLKPTR_LO) >> 21;
-                               if (!address)
-                                       break;
-                               address += MCG_XBLK_ADDR;
-                       }
-                       else
-                               ++address;
-
-                       if (rdmsr_safe(address, &low, &high))
-                               break;
-
-                       if (!(high & MASK_VALID_HI)) {
-                               if (block)
-                                       continue;
-                               else
-                                       break;
-                       }
-
-                       if (!(high & MASK_CNTP_HI)  ||
-                            (high & MASK_LOCKED_HI))
-                               continue;
-
-                       /* Log the machine check that caused the threshold
-                          event. */
-                       do_machine_check(NULL, 0);
-
-                       if (high & MASK_OVERFLOW_HI) {
-                               rdmsrl(address, m.misc);
-                               rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
-                                      m.status);
-                               m.bank = K8_MCE_THRESHOLD_BASE
-                                      + bank * NR_BLOCKS
-                                      + block;
-                               mce_log(&m);
-                               goto out;
-                       }
-               }
-       }
-out:
-       irq_exit();
-}
-
-/*
- * Sysfs Interface
- */
-
-struct threshold_attr {
-       struct attribute attr;
-       ssize_t(*show) (struct threshold_block *, char *);
-       ssize_t(*store) (struct threshold_block *, const char *, size_t count);
-};
-
-static cpumask_t affinity_set(unsigned int cpu)
-{
-       cpumask_t oldmask = current->cpus_allowed;
-       cpumask_t newmask = CPU_MASK_NONE;
-       cpu_set(cpu, newmask);
-       set_cpus_allowed(current, newmask);
-       return oldmask;
-}
-
-static void affinity_restore(cpumask_t oldmask)
-{
-       set_cpus_allowed(current, oldmask);
-}
-
-#define SHOW_FIELDS(name)                                           \
-static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
-{                                                                   \
-        return sprintf(buf, "%lx\n", (unsigned long) b->name);      \
-}
-SHOW_FIELDS(interrupt_enable)
-SHOW_FIELDS(threshold_limit)
-
-static ssize_t store_interrupt_enable(struct threshold_block *b,
-                                     const char *buf, size_t count)
-{
-       char *end;
-       cpumask_t oldmask;
-       unsigned long new = simple_strtoul(buf, &end, 0);
-       if (end == buf)
-               return -EINVAL;
-       b->interrupt_enable = !!new;
-
-       oldmask = affinity_set(b->cpu);
-       threshold_restart_bank(b, 0, 0);
-       affinity_restore(oldmask);
-
-       return end - buf;
-}
-
-static ssize_t store_threshold_limit(struct threshold_block *b,
-                                    const char *buf, size_t count)
-{
-       char *end;
-       cpumask_t oldmask;
-       u16 old;
-       unsigned long new = simple_strtoul(buf, &end, 0);
-       if (end == buf)
-               return -EINVAL;
-       if (new > THRESHOLD_MAX)
-               new = THRESHOLD_MAX;
-       if (new < 1)
-               new = 1;
-       old = b->threshold_limit;
-       b->threshold_limit = new;
-
-       oldmask = affinity_set(b->cpu);
-       threshold_restart_bank(b, 0, old);
-       affinity_restore(oldmask);
-
-       return end - buf;
-}
-
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
-{
-       u32 high, low;
-       cpumask_t oldmask;
-       oldmask = affinity_set(b->cpu);
-       rdmsr(b->address, low, high);
-       affinity_restore(oldmask);
-       return sprintf(buf, "%x\n",
-                      (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
-}
-
-static ssize_t store_error_count(struct threshold_block *b,
-                                const char *buf, size_t count)
-{
-       cpumask_t oldmask;
-       oldmask = affinity_set(b->cpu);
-       threshold_restart_bank(b, 1, 0);
-       affinity_restore(oldmask);
-       return 1;
-}
-
-#define THRESHOLD_ATTR(_name,_mode,_show,_store) {            \
-        .attr = {.name = __stringify(_name), .mode = _mode }, \
-        .show = _show,                                        \
-        .store = _store,                                      \
-};
-
-#define RW_ATTR(name)                                           \
-static struct threshold_attr name =                             \
-        THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
-
-RW_ATTR(interrupt_enable);
-RW_ATTR(threshold_limit);
-RW_ATTR(error_count);
-
-static struct attribute *default_attrs[] = {
-       &interrupt_enable.attr,
-       &threshold_limit.attr,
-       &error_count.attr,
-       NULL
-};
-
-#define to_block(k) container_of(k, struct threshold_block, kobj)
-#define to_attr(a) container_of(a, struct threshold_attr, attr)
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
-       struct threshold_block *b = to_block(kobj);
-       struct threshold_attr *a = to_attr(attr);
-       ssize_t ret;
-       ret = a->show ? a->show(b, buf) : -EIO;
-       return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
-                    const char *buf, size_t count)
-{
-       struct threshold_block *b = to_block(kobj);
-       struct threshold_attr *a = to_attr(attr);
-       ssize_t ret;
-       ret = a->store ? a->store(b, buf, count) : -EIO;
-       return ret;
-}
-
-static struct sysfs_ops threshold_ops = {
-       .show = show,
-       .store = store,
-};
-
-static struct kobj_type threshold_ktype = {
-       .sysfs_ops = &threshold_ops,
-       .default_attrs = default_attrs,
-};
-
-static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
-                                              unsigned int bank,
-                                              unsigned int block,
-                                              u32 address)
-{
-       int err;
-       u32 low, high;
-       struct threshold_block *b = NULL;
-
-       if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
-               return 0;
-
-       if (rdmsr_safe(address, &low, &high))
-               return 0;
-
-       if (!(high & MASK_VALID_HI)) {
-               if (block)
-                       goto recurse;
-               else
-                       return 0;
-       }
-
-       if (!(high & MASK_CNTP_HI)  ||
-            (high & MASK_LOCKED_HI))
-               goto recurse;
-
-       b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
-       if (!b)
-               return -ENOMEM;
-
-       b->block = block;
-       b->bank = bank;
-       b->cpu = cpu;
-       b->address = address;
-       b->interrupt_enable = 0;
-       b->threshold_limit = THRESHOLD_MAX;
-
-       INIT_LIST_HEAD(&b->miscj);
-
-       if (per_cpu(threshold_banks, cpu)[bank]->blocks)
-               list_add(&b->miscj,
-                        &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
-       else
-               per_cpu(threshold_banks, cpu)[bank]->blocks = b;
-
-       kobject_set_name(&b->kobj, "misc%i", block);
-       b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj;
-       b->kobj.ktype = &threshold_ktype;
-       err = kobject_register(&b->kobj);
-       if (err)
-               goto out_free;
-recurse:
-       if (!block) {
-               address = (low & MASK_BLKPTR_LO) >> 21;
-               if (!address)
-                       return 0;
-               address += MCG_XBLK_ADDR;
-       } else
-               ++address;
-
-       err = allocate_threshold_blocks(cpu, bank, ++block, address);
-       if (err)
-               goto out_free;
-
-       return err;
-
-out_free:
-       if (b) {
-               kobject_unregister(&b->kobj);
-               kfree(b);
-       }
-       return err;
-}
-
-/* symlinks sibling shared banks to first core.  first core owns dir/files. */
-static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
-{
-       int i, err = 0;
-       struct threshold_bank *b = NULL;
-       cpumask_t oldmask = CPU_MASK_NONE;
-       char name[32];
-
-       sprintf(name, "threshold_bank%i", bank);
-
-#ifdef CONFIG_SMP
-       if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) {   /* symlink */
-               i = first_cpu(cpu_core_map[cpu]);
-
-               /* first core not up yet */
-               if (cpu_data[i].cpu_core_id)
-                       goto out;
-
-               /* already linked */
-               if (per_cpu(threshold_banks, cpu)[bank])
-                       goto out;
-
-               b = per_cpu(threshold_banks, i)[bank];
-
-               if (!b)
-                       goto out;
-
-               err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
-                                       &b->kobj, name);
-               if (err)
-                       goto out;
-
-               b->cpus = cpu_core_map[cpu];
-               per_cpu(threshold_banks, cpu)[bank] = b;
-               goto out;
-       }
-#endif
-
-       b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
-       if (!b) {
-               err = -ENOMEM;
-               goto out;
-       }
-
-       kobject_set_name(&b->kobj, "threshold_bank%i", bank);
-       b->kobj.parent = &per_cpu(device_mce, cpu).kobj;
-#ifndef CONFIG_SMP
-       b->cpus = CPU_MASK_ALL;
-#else
-       b->cpus = cpu_core_map[cpu];
-#endif
-       err = kobject_register(&b->kobj);
-       if (err)
-               goto out_free;
-
-       per_cpu(threshold_banks, cpu)[bank] = b;
-
-       oldmask = affinity_set(cpu);
-       err = allocate_threshold_blocks(cpu, bank, 0,
-                                       MSR_IA32_MC0_MISC + bank * 4);
-       affinity_restore(oldmask);
-
-       if (err)
-               goto out_free;
-
-       for_each_cpu_mask(i, b->cpus) {
-               if (i == cpu)
-                       continue;
-
-               err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
-                                       &b->kobj, name);
-               if (err)
-                       goto out;
-
-               per_cpu(threshold_banks, i)[bank] = b;
-       }
-
-       goto out;
-
-out_free:
-       per_cpu(threshold_banks, cpu)[bank] = NULL;
-       kfree(b);
-out:
-       return err;
-}
-
-/* create dir/files for all valid threshold banks */
-static __cpuinit int threshold_create_device(unsigned int cpu)
-{
-       unsigned int bank;
-       int err = 0;
-
-       for (bank = 0; bank < NR_BANKS; ++bank) {
-               if (!(per_cpu(bank_map, cpu) & 1 << bank))
-                       continue;
-               err = threshold_create_bank(cpu, bank);
-               if (err)
-                       goto out;
-       }
-out:
-       return err;
-}
-
-/*
- * let's be hotplug friendly.
- * in case of multiple core processors, the first core always takes ownership
- *   of shared sysfs dir/files, and rest of the cores will be symlinked to it.
- */
-
-static void deallocate_threshold_block(unsigned int cpu,
-                                                unsigned int bank)
-{
-       struct threshold_block *pos = NULL;
-       struct threshold_block *tmp = NULL;
-       struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
-
-       if (!head)
-               return;
-
-       list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
-               kobject_unregister(&pos->kobj);
-               list_del(&pos->miscj);
-               kfree(pos);
-       }
-
-       kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
-       per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
-}
-
-static void threshold_remove_bank(unsigned int cpu, int bank)
-{
-       int i = 0;
-       struct threshold_bank *b;
-       char name[32];
-
-       b = per_cpu(threshold_banks, cpu)[bank];
-
-       if (!b)
-               return;
-
-       if (!b->blocks)
-               goto free_out;
-
-       sprintf(name, "threshold_bank%i", bank);
-
-#ifdef CONFIG_SMP
-       /* sibling symlink */
-       if (shared_bank[bank] && b->blocks->cpu != cpu) {
-               sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
-               per_cpu(threshold_banks, cpu)[bank] = NULL;
-               return;
-       }
-#endif
-
-       /* remove all sibling symlinks before unregistering */
-       for_each_cpu_mask(i, b->cpus) {
-               if (i == cpu)
-                       continue;
-
-               sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
-               per_cpu(threshold_banks, i)[bank] = NULL;
-       }
-
-       deallocate_threshold_block(cpu, bank);
-
-free_out:
-       kobject_unregister(&b->kobj);
-       kfree(b);
-       per_cpu(threshold_banks, cpu)[bank] = NULL;
-}
-
-static void threshold_remove_device(unsigned int cpu)
-{
-       unsigned int bank;
-
-       for (bank = 0; bank < NR_BANKS; ++bank) {
-               if (!(per_cpu(bank_map, cpu) & 1 << bank))
-                       continue;
-               threshold_remove_bank(cpu, bank);
-       }
-}
-
-/* get notified when a cpu comes on/off */
-static int threshold_cpu_callback(struct notifier_block *nfb,
-                                           unsigned long action, void *hcpu)
-{
-       /* cpu was unsigned int to begin with */
-       unsigned int cpu = (unsigned long)hcpu;
-
-       if (cpu >= NR_CPUS)
-               goto out;
-
-       switch (action) {
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               threshold_create_device(cpu);
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               threshold_remove_device(cpu);
-               break;
-       default:
-               break;
-       }
-      out:
-       return NOTIFY_OK;
-}
-
-static struct notifier_block threshold_cpu_notifier = {
-       .notifier_call = threshold_cpu_callback,
-};
-
-static __init int threshold_init_device(void)
-{
-       unsigned lcpu = 0;
-
-       /* to hit CPUs online before the notifier is up */
-       for_each_online_cpu(lcpu) {
-               int err = threshold_create_device(lcpu);
-               if (err)
-                       return err;
-       }
-       register_hotcpu_notifier(&threshold_cpu_notifier);
-       return 0;
-}
-
-device_initcall(threshold_init_device);
diff --git a/arch/x86_64/kernel/mce_intel_64.c b/arch/x86_64/kernel/mce_intel_64.c
deleted file mode 100644 (file)
index 6551505..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Intel specific MCE features.
- * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/percpu.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/mce.h>
-#include <asm/hw_irq.h>
-#include <asm/idle.h>
-#include <asm/therm_throt.h>
-
-asmlinkage void smp_thermal_interrupt(void)
-{
-       __u64 msr_val;
-
-       ack_APIC_irq();
-
-       exit_idle();
-       irq_enter();
-
-       rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-       if (therm_throt_process(msr_val & 1))
-               mce_log_therm_throt_event(smp_processor_id(), msr_val);
-
-       irq_exit();
-}
-
-static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
-{
-       u32 l, h;
-       int tm2 = 0;
-       unsigned int cpu = smp_processor_id();
-
-       if (!cpu_has(c, X86_FEATURE_ACPI))
-               return;
-
-       if (!cpu_has(c, X86_FEATURE_ACC))
-               return;
-
-       /* first check if TM1 is already enabled by the BIOS, in which
-        * case there might be some SMM goo which handles it, so we can't even
-        * put a handler since it might be delivered via SMI already.
-        */
-       rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-       h = apic_read(APIC_LVTTHMR);
-       if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
-               printk(KERN_DEBUG
-                      "CPU%d: Thermal monitoring handled by SMI\n", cpu);
-               return;
-       }
-
-       if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
-               tm2 = 1;
-
-       if (h & APIC_VECTOR_MASK) {
-               printk(KERN_DEBUG
-                      "CPU%d: Thermal LVT vector (%#x) already "
-                      "installed\n", cpu, (h & APIC_VECTOR_MASK));
-               return;
-       }
-
-       h = THERMAL_APIC_VECTOR;
-       h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
-       apic_write(APIC_LVTTHMR, h);
-
-       rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
-       wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
-
-       rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-       wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
-
-       l = apic_read(APIC_LVTTHMR);
-       apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-       printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
-               cpu, tm2 ? "TM2" : "TM1");
-
-       /* enable thermal throttle processing */
-       atomic_set(&therm_throt_en, 1);
-       return;
-}
-
-void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
-{
-       intel_init_thermal(c);
-}
diff --git a/arch/x86_64/kernel/module_64.c b/arch/x86_64/kernel/module_64.c
deleted file mode 100644 (file)
index a888e67..0000000
+++ /dev/null
@@ -1,185 +0,0 @@
-/*  Kernel module help for x86-64
-    Copyright (C) 2001 Rusty Russell.
-    Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-*/
-#include <linux/moduleloader.h>
-#include <linux/elf.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/bug.h>
-
-#include <asm/system.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#define DEBUGP(fmt...) 
-
-#ifndef CONFIG_UML
-void module_free(struct module *mod, void *module_region)
-{
-       vfree(module_region);
-       /* FIXME: If module_region == mod->init_region, trim exception
-           table entries. */
-}
-
-void *module_alloc(unsigned long size)
-{
-       struct vm_struct *area;
-
-       if (!size)
-               return NULL;
-       size = PAGE_ALIGN(size);
-       if (size > MODULES_LEN)
-               return NULL;
-
-       area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
-       if (!area)
-               return NULL;
-
-       return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
-}
-#endif
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
-                             Elf_Shdr *sechdrs,
-                             char *secstrings,
-                             struct module *mod)
-{
-       return 0;
-}
-
-int apply_relocate_add(Elf64_Shdr *sechdrs,
-                  const char *strtab,
-                  unsigned int symindex,
-                  unsigned int relsec,
-                  struct module *me)
-{
-       unsigned int i;
-       Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
-       Elf64_Sym *sym;
-       void *loc;
-       u64 val; 
-
-       DEBUGP("Applying relocate section %u to %u\n", relsec,
-              sechdrs[relsec].sh_info);
-       for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
-               /* This is where to make the change */
-               loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
-                       + rel[i].r_offset;
-
-               /* This is the symbol it is referring to.  Note that all
-                  undefined symbols have been resolved.  */
-               sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
-                       + ELF64_R_SYM(rel[i].r_info);
-
-               DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
-                      (int)ELF64_R_TYPE(rel[i].r_info), 
-                      sym->st_value, rel[i].r_addend, (u64)loc);
-
-               val = sym->st_value + rel[i].r_addend; 
-
-               switch (ELF64_R_TYPE(rel[i].r_info)) {
-               case R_X86_64_NONE:
-                       break;
-               case R_X86_64_64:
-                       *(u64 *)loc = val;
-                       break;
-               case R_X86_64_32:
-                       *(u32 *)loc = val;
-                       if (val != *(u32 *)loc)
-                               goto overflow;
-                       break;
-               case R_X86_64_32S:
-                       *(s32 *)loc = val;
-                       if ((s64)val != *(s32 *)loc)
-                               goto overflow;
-                       break;
-               case R_X86_64_PC32: 
-                       val -= (u64)loc;
-                       *(u32 *)loc = val;
-#if 0
-                       if ((s64)val != *(s32 *)loc)
-                               goto overflow; 
-#endif
-                       break;
-               default:
-                       printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n",
-                              me->name, ELF64_R_TYPE(rel[i].r_info));
-                       return -ENOEXEC;
-               }
-       }
-       return 0;
-
-overflow:
-       printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 
-              (int)ELF64_R_TYPE(rel[i].r_info), val);
-       printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
-              me->name);
-       return -ENOEXEC;
-}
-
-int apply_relocate(Elf_Shdr *sechdrs,
-                  const char *strtab,
-                  unsigned int symindex,
-                  unsigned int relsec,
-                  struct module *me)
-{
-       printk("non add relocation not supported\n");
-       return -ENOSYS;
-} 
-
-int module_finalize(const Elf_Ehdr *hdr,
-                    const Elf_Shdr *sechdrs,
-                    struct module *me)
-{
-       const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
-       char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
-       for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
-               if (!strcmp(".text", secstrings + s->sh_name))
-                       text = s;
-               if (!strcmp(".altinstructions", secstrings + s->sh_name))
-                       alt = s;
-               if (!strcmp(".smp_locks", secstrings + s->sh_name))
-                       locks= s;
-       }
-
-       if (alt) {
-               /* patch .altinstructions */
-               void *aseg = (void *)alt->sh_addr;
-               apply_alternatives(aseg, aseg + alt->sh_size);
-       }
-       if (locks && text) {
-               void *lseg = (void *)locks->sh_addr;
-               void *tseg = (void *)text->sh_addr;
-               alternatives_smp_module_add(me, me->name,
-                                           lseg, lseg + locks->sh_size,
-                                           tseg, tseg + text->sh_size);
-       }
-
-       return module_bug_finalize(hdr, sechdrs, me);
-}
-
-void module_arch_cleanup(struct module *mod)
-{
-       alternatives_smp_module_del(mod);
-       module_bug_cleanup(mod);
-}
diff --git a/arch/x86_64/kernel/mpparse_64.c b/arch/x86_64/kernel/mpparse_64.c
deleted file mode 100644 (file)
index 8bf0ca0..0000000
+++ /dev/null
@@ -1,852 +0,0 @@
-/*
- *     Intel Multiprocessor Specification 1.1 and 1.4
- *     compliant MP-table parsing routines.
- *
- *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- *     Fixes
- *             Erich Boleyn    :       MP v1.4 and additional changes.
- *             Alan Cox        :       Added EBDA scanning
- *             Ingo Molnar     :       various cleanups and rewrites
- *             Maciej W. Rozycki:      Bits for default MP configurations
- *             Paul Diefenbaugh:       Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/acpi.h>
-#include <linux/module.h>
-
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/pgalloc.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/acpi.h>
-
-/* Have we found an MP table */
-int smp_found_config;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
-
-static int mp_current_pci_id = 0;
-/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
-
-/* # of MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* MP IRQ source entries */
-int mp_irq_entries;
-
-int nr_ioapics;
-unsigned long mp_lapic_addr = 0;
-
-
-
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_id = -1U;
-/* Internal processor count */
-unsigned int num_processors __cpuinitdata = 0;
-
-unsigned disabled_cpus __cpuinitdata;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
-
-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
-/*
- * Checksum an MP configuration block.
- */
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
-       int sum = 0;
-
-       while (len--)
-               sum += *mp++;
-
-       return sum & 0xFF;
-}
-
-static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
-{
-       int cpu;
-       cpumask_t tmp_map;
-       char *bootup_cpu = "";
-
-       if (!(m->mpc_cpuflag & CPU_ENABLED)) {
-               disabled_cpus++;
-               return;
-       }
-       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
-               bootup_cpu = " (Bootup-CPU)";
-               boot_cpu_id = m->mpc_apicid;
-       }
-
-       printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
-
-       if (num_processors >= NR_CPUS) {
-               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
-                       " Processor ignored.\n", NR_CPUS);
-               return;
-       }
-
-       num_processors++;
-       cpus_complement(tmp_map, cpu_present_map);
-       cpu = first_cpu(tmp_map);
-
-       physid_set(m->mpc_apicid, phys_cpu_present_map);
-       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
-               /*
-                * bios_cpu_apicid is required to have processors listed
-                * in same order as logical cpu numbers. Hence the first
-                * entry is BSP, and so on.
-                */
-               cpu = 0;
-       }
-       bios_cpu_apicid[cpu] = m->mpc_apicid;
-       x86_cpu_to_apicid[cpu] = m->mpc_apicid;
-
-       cpu_set(cpu, cpu_possible_map);
-       cpu_set(cpu, cpu_present_map);
-}
-
-static void __init MP_bus_info (struct mpc_config_bus *m)
-{
-       char str[7];
-
-       memcpy(str, m->mpc_bustype, 6);
-       str[6] = 0;
-       Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
-
-       if (strncmp(str, "ISA", 3) == 0) {
-               set_bit(m->mpc_busid, mp_bus_not_pci);
-       } else if (strncmp(str, "PCI", 3) == 0) {
-               clear_bit(m->mpc_busid, mp_bus_not_pci);
-               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
-               mp_current_pci_id++;
-       } else {
-               printk(KERN_ERR "Unknown bustype %s\n", str);
-       }
-}
-
-static int bad_ioapic(unsigned long address)
-{
-       if (nr_ioapics >= MAX_IO_APICS) {
-               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-       }
-       if (!address) {
-               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-                       " found in table, skipping!\n");
-               return 1;
-       }
-       return 0;
-}
-
-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
-{
-       if (!(m->mpc_flags & MPC_APIC_USABLE))
-               return;
-
-       printk("I/O APIC #%d at 0x%X.\n",
-               m->mpc_apicid, m->mpc_apicaddr);
-
-       if (bad_ioapic(m->mpc_apicaddr))
-               return;
-
-       mp_ioapics[nr_ioapics] = *m;
-       nr_ioapics++;
-}
-
-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
-{
-       mp_irqs [mp_irq_entries] = *m;
-       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
-               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
-                       m->mpc_irqtype, m->mpc_irqflag & 3,
-                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
-                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-       if (++mp_irq_entries >= MAX_IRQ_SOURCES)
-               panic("Max # of irq sources exceeded!!\n");
-}
-
-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
-{
-       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
-               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
-                       m->mpc_irqtype, m->mpc_irqflag & 3,
-                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
-                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
-}
-
-/*
- * Read/parse the MPC
- */
-
-static int __init smp_read_mpc(struct mp_config_table *mpc)
-{
-       char str[16];
-       int count=sizeof(*mpc);
-       unsigned char *mpt=((unsigned char *)mpc)+count;
-
-       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
-               printk("MPTABLE: bad signature [%c%c%c%c]!\n",
-                       mpc->mpc_signature[0],
-                       mpc->mpc_signature[1],
-                       mpc->mpc_signature[2],
-                       mpc->mpc_signature[3]);
-               return 0;
-       }
-       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
-               printk("MPTABLE: checksum error!\n");
-               return 0;
-       }
-       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
-               printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
-                       mpc->mpc_spec);
-               return 0;
-       }
-       if (!mpc->mpc_lapic) {
-               printk(KERN_ERR "MPTABLE: null local APIC address!\n");
-               return 0;
-       }
-       memcpy(str,mpc->mpc_oem,8);
-       str[8] = 0;
-       printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
-
-       memcpy(str,mpc->mpc_productid,12);
-       str[12] = 0;
-       printk("MPTABLE: Product ID: %s ",str);
-
-       printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
-
-       /* save the local APIC address, it might be non-default */
-       if (!acpi_lapic)
-               mp_lapic_addr = mpc->mpc_lapic;
-
-       /*
-        *      Now process the configuration blocks.
-        */
-       while (count < mpc->mpc_length) {
-               switch(*mpt) {
-                       case MP_PROCESSOR:
-                       {
-                               struct mpc_config_processor *m=
-                                       (struct mpc_config_processor *)mpt;
-                               if (!acpi_lapic)
-                                       MP_processor_info(m);
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
-                       case MP_BUS:
-                       {
-                               struct mpc_config_bus *m=
-                                       (struct mpc_config_bus *)mpt;
-                               MP_bus_info(m);
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
-                       case MP_IOAPIC:
-                       {
-                               struct mpc_config_ioapic *m=
-                                       (struct mpc_config_ioapic *)mpt;
-                               MP_ioapic_info(m);
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
-                       case MP_INTSRC:
-                       {
-                               struct mpc_config_intsrc *m=
-                                       (struct mpc_config_intsrc *)mpt;
-
-                               MP_intsrc_info(m);
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
-                       case MP_LINTSRC:
-                       {
-                               struct mpc_config_lintsrc *m=
-                                       (struct mpc_config_lintsrc *)mpt;
-                               MP_lintsrc_info(m);
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
-               }
-       }
-       setup_apic_routing();
-       if (!num_processors)
-               printk(KERN_ERR "MPTABLE: no processors registered!\n");
-       return num_processors;
-}
-
-static int __init ELCR_trigger(unsigned int irq)
-{
-       unsigned int port;
-
-       port = 0x4d0 + (irq >> 3);
-       return (inb(port) >> (irq & 7)) & 1;
-}
-
-static void __init construct_default_ioirq_mptable(int mpc_default_type)
-{
-       struct mpc_config_intsrc intsrc;
-       int i;
-       int ELCR_fallback = 0;
-
-       intsrc.mpc_type = MP_INTSRC;
-       intsrc.mpc_irqflag = 0;                 /* conforming */
-       intsrc.mpc_srcbus = 0;
-       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
-
-       intsrc.mpc_irqtype = mp_INT;
-
-       /*
-        *  If true, we have an ISA/PCI system with no IRQ entries
-        *  in the MP table. To prevent the PCI interrupts from being set up
-        *  incorrectly, we try to use the ELCR. The sanity check to see if
-        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
-        *  never be level sensitive, so we simply see if the ELCR agrees.
-        *  If it does, we assume it's valid.
-        */
-       if (mpc_default_type == 5) {
-               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
-
-               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
-                       printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
-               else {
-                       printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
-                       ELCR_fallback = 1;
-               }
-       }
-
-       for (i = 0; i < 16; i++) {
-               switch (mpc_default_type) {
-               case 2:
-                       if (i == 0 || i == 13)
-                               continue;       /* IRQ0 & IRQ13 not connected */
-                       /* fall through */
-               default:
-                       if (i == 2)
-                               continue;       /* IRQ2 is never connected */
-               }
-
-               if (ELCR_fallback) {
-                       /*
-                        *  If the ELCR indicates a level-sensitive interrupt, we
-                        *  copy that information over to the MP table in the
-                        *  irqflag field (level sensitive, active high polarity).
-                        */
-                       if (ELCR_trigger(i))
-                               intsrc.mpc_irqflag = 13;
-                       else
-                               intsrc.mpc_irqflag = 0;
-               }
-
-               intsrc.mpc_srcbusirq = i;
-               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
-               MP_intsrc_info(&intsrc);
-       }
-
-       intsrc.mpc_irqtype = mp_ExtINT;
-       intsrc.mpc_srcbusirq = 0;
-       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
-       MP_intsrc_info(&intsrc);
-}
-
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
-{
-       struct mpc_config_processor processor;
-       struct mpc_config_bus bus;
-       struct mpc_config_ioapic ioapic;
-       struct mpc_config_lintsrc lintsrc;
-       int linttypes[2] = { mp_ExtINT, mp_NMI };
-       int i;
-
-       /*
-        * local APIC has default address
-        */
-       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-       /*
-        * 2 CPUs, numbered 0 & 1.
-        */
-       processor.mpc_type = MP_PROCESSOR;
-       processor.mpc_apicver = 0;
-       processor.mpc_cpuflag = CPU_ENABLED;
-       processor.mpc_cpufeature = 0;
-       processor.mpc_featureflag = 0;
-       processor.mpc_reserved[0] = 0;
-       processor.mpc_reserved[1] = 0;
-       for (i = 0; i < 2; i++) {
-               processor.mpc_apicid = i;
-               MP_processor_info(&processor);
-       }
-
-       bus.mpc_type = MP_BUS;
-       bus.mpc_busid = 0;
-       switch (mpc_default_type) {
-               default:
-                       printk(KERN_ERR "???\nUnknown standard configuration %d\n",
-                               mpc_default_type);
-                       /* fall through */
-               case 1:
-               case 5:
-                       memcpy(bus.mpc_bustype, "ISA   ", 6);
-                       break;
-       }
-       MP_bus_info(&bus);
-       if (mpc_default_type > 4) {
-               bus.mpc_busid = 1;
-               memcpy(bus.mpc_bustype, "PCI   ", 6);
-               MP_bus_info(&bus);
-       }
-
-       ioapic.mpc_type = MP_IOAPIC;
-       ioapic.mpc_apicid = 2;
-       ioapic.mpc_apicver = 0;
-       ioapic.mpc_flags = MPC_APIC_USABLE;
-       ioapic.mpc_apicaddr = 0xFEC00000;
-       MP_ioapic_info(&ioapic);
-
-       /*
-        * We set up most of the low 16 IO-APIC pins according to MPS rules.
-        */
-       construct_default_ioirq_mptable(mpc_default_type);
-
-       lintsrc.mpc_type = MP_LINTSRC;
-       lintsrc.mpc_irqflag = 0;                /* conforming */
-       lintsrc.mpc_srcbusid = 0;
-       lintsrc.mpc_srcbusirq = 0;
-       lintsrc.mpc_destapic = MP_APIC_ALL;
-       for (i = 0; i < 2; i++) {
-               lintsrc.mpc_irqtype = linttypes[i];
-               lintsrc.mpc_destapiclint = i;
-               MP_lintsrc_info(&lintsrc);
-       }
-}
-
-static struct intel_mp_floating *mpf_found;
-
-/*
- * Scan the memory blocks for an SMP configuration block.
- */
-void __init get_smp_config (void)
-{
-       struct intel_mp_floating *mpf = mpf_found;
-
-       /*
-        * ACPI supports both logical (e.g. Hyper-Threading) and physical 
-        * processors, where MPS only supports physical.
-        */
-       if (acpi_lapic && acpi_ioapic) {
-               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
-               return;
-       }
-       else if (acpi_lapic)
-               printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
-
-       printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
-
-       /*
-        * Now see if we need to read further.
-        */
-       if (mpf->mpf_feature1 != 0) {
-
-               printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
-               construct_default_ISA_mptable(mpf->mpf_feature1);
-
-       } else if (mpf->mpf_physptr) {
-
-               /*
-                * Read the physical hardware table.  Anything here will
-                * override the defaults.
-                */
-               if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
-                       smp_found_config = 0;
-                       printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
-                       printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
-                       return;
-               }
-               /*
-                * If there are no explicit MP IRQ entries, then we are
-                * broken.  We set up most of the low 16 IO-APIC pins to
-                * ISA defaults and hope it will work.
-                */
-               if (!mp_irq_entries) {
-                       struct mpc_config_bus bus;
-
-                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
-
-                       bus.mpc_type = MP_BUS;
-                       bus.mpc_busid = 0;
-                       memcpy(bus.mpc_bustype, "ISA   ", 6);
-                       MP_bus_info(&bus);
-
-                       construct_default_ioirq_mptable(0);
-               }
-
-       } else
-               BUG();
-
-       printk(KERN_INFO "Processors: %d\n", num_processors);
-       /*
-        * Only use the first configuration found.
-        */
-}
-
-static int __init smp_scan_config (unsigned long base, unsigned long length)
-{
-       extern void __bad_mpf_size(void); 
-       unsigned int *bp = phys_to_virt(base);
-       struct intel_mp_floating *mpf;
-
-       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
-       if (sizeof(*mpf) != 16)
-               __bad_mpf_size();
-
-       while (length > 0) {
-               mpf = (struct intel_mp_floating *)bp;
-               if ((*bp == SMP_MAGIC_IDENT) &&
-                       (mpf->mpf_length == 1) &&
-                       !mpf_checksum((unsigned char *)bp, 16) &&
-                       ((mpf->mpf_specification == 1)
-                               || (mpf->mpf_specification == 4)) ) {
-
-                       smp_found_config = 1;
-                       reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
-                       if (mpf->mpf_physptr)
-                               reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
-                       mpf_found = mpf;
-                       return 1;
-               }
-               bp += 4;
-               length -= 16;
-       }
-       return 0;
-}
-
-void __init find_smp_config(void)
-{
-       unsigned int address;
-
-       /*
-        * FIXME: Linux assumes you have 640K of base ram..
-        * this continues the error...
-        *
-        * 1) Scan the bottom 1K for a signature
-        * 2) Scan the top 1K of base RAM
-        * 3) Scan the 64K of bios
-        */
-       if (smp_scan_config(0x0,0x400) ||
-               smp_scan_config(639*0x400,0x400) ||
-                       smp_scan_config(0xF0000,0x10000))
-               return;
-       /*
-        * If it is an SMP machine we should know now.
-        *
-        * there is a real-mode segmented pointer pointing to the
-        * 4K EBDA area at 0x40E, calculate and scan it here.
-        *
-        * NOTE! There are Linux loaders that will corrupt the EBDA
-        * area, and as such this kind of SMP config may be less
-        * trustworthy, simply because the SMP table may have been
-        * stomped on during early boot. These loaders are buggy and
-        * should be fixed.
-        */
-
-       address = *(unsigned short *)phys_to_virt(0x40E);
-       address <<= 4;
-       if (smp_scan_config(address, 0x1000))
-               return;
-
-       /* If we have come this far, we did not find an MP table  */
-        printk(KERN_INFO "No mptable found.\n");
-}
-
-/* --------------------------------------------------------------------------
-                            ACPI-based MP Configuration
-   -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-void __init mp_register_lapic_address(u64 address)
-{
-       mp_lapic_addr = (unsigned long) address;
-       set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-       if (boot_cpu_id == -1U)
-               boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-}
-
-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
-{
-       struct mpc_config_processor processor;
-       int                     boot_cpu = 0;
-       
-       if (id == boot_cpu_id)
-               boot_cpu = 1;
-
-       processor.mpc_type = MP_PROCESSOR;
-       processor.mpc_apicid = id;
-       processor.mpc_apicver = 0;
-       processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
-       processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
-       processor.mpc_cpufeature = 0;
-       processor.mpc_featureflag = 0;
-       processor.mpc_reserved[0] = 0;
-       processor.mpc_reserved[1] = 0;
-
-       MP_processor_info(&processor);
-}
-
-#define MP_ISA_BUS             0
-#define MP_MAX_IOAPIC_PIN      127
-
-static struct mp_ioapic_routing {
-       int                     apic_id;
-       int                     gsi_start;
-       int                     gsi_end;
-       u32                     pin_programmed[4];
-} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic(int gsi)
-{
-       int i = 0;
-
-       /* Find the IOAPIC that manages this GSI. */
-       for (i = 0; i < nr_ioapics; i++) {
-               if ((gsi >= mp_ioapic_routing[i].gsi_start)
-                       && (gsi <= mp_ioapic_routing[i].gsi_end))
-                       return i;
-       }
-
-       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-       return -1;
-}
-
-static u8 uniq_ioapic_id(u8 id)
-{
-       int i;
-       DECLARE_BITMAP(used, 256);
-       bitmap_zero(used, 256);
-       for (i = 0; i < nr_ioapics; i++) {
-               struct mpc_config_ioapic *ia = &mp_ioapics[i];
-               __set_bit(ia->mpc_apicid, used);
-       }
-       if (!test_bit(id, used))
-               return id;
-       return find_first_zero_bit(used, 256);
-}
-
-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
-{
-       int idx = 0;
-
-       if (bad_ioapic(address))
-               return;
-
-       idx = nr_ioapics;
-
-       mp_ioapics[idx].mpc_type = MP_IOAPIC;
-       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-       mp_ioapics[idx].mpc_apicaddr = address;
-
-       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-       mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-       mp_ioapics[idx].mpc_apicver = 0;
-       
-       /* 
-        * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
-        * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
-        */
-       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
-       mp_ioapic_routing[idx].gsi_start = gsi_base;
-       mp_ioapic_routing[idx].gsi_end = gsi_base + 
-               io_apic_get_redir_entries(idx);
-
-       printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
-               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
-               mp_ioapics[idx].mpc_apicaddr,
-               mp_ioapic_routing[idx].gsi_start,
-               mp_ioapic_routing[idx].gsi_end);
-
-       nr_ioapics++;
-}
-
-void __init
-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32        gsi)
-{
-       struct mpc_config_intsrc intsrc;
-       int                     ioapic = -1;
-       int                     pin = -1;
-
-       /* 
-        * Convert 'gsi' to 'ioapic.pin'.
-        */
-       ioapic = mp_find_ioapic(gsi);
-       if (ioapic < 0)
-               return;
-       pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
-
-       /*
-        * TBD: This check is for faulty timer entries, where the override
-        *      erroneously sets the trigger to level, resulting in a HUGE 
-        *      increase of timer interrupts!
-        */
-       if ((bus_irq == 0) && (trigger == 3))
-               trigger = 1;
-
-       intsrc.mpc_type = MP_INTSRC;
-       intsrc.mpc_irqtype = mp_INT;
-       intsrc.mpc_irqflag = (trigger << 2) | polarity;
-       intsrc.mpc_srcbus = MP_ISA_BUS;
-       intsrc.mpc_srcbusirq = bus_irq;                                /* IRQ */
-       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;        /* APIC ID */
-       intsrc.mpc_dstirq = pin;                                    /* INTIN# */
-
-       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", 
-               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
-               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
-               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
-
-       mp_irqs[mp_irq_entries] = intsrc;
-       if (++mp_irq_entries == MAX_IRQ_SOURCES)
-               panic("Max # of irq sources exceeded!\n");
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
-{
-       struct mpc_config_intsrc intsrc;
-       int i = 0;
-       int ioapic = -1;
-
-       /* 
-        * Fabricate the legacy ISA bus (bus #31).
-        */
-       set_bit(MP_ISA_BUS, mp_bus_not_pci);
-
-       /* 
-        * Locate the IOAPIC that manages the ISA IRQs (0-15). 
-        */
-       ioapic = mp_find_ioapic(0);
-       if (ioapic < 0)
-               return;
-
-       intsrc.mpc_type = MP_INTSRC;
-       intsrc.mpc_irqflag = 0;                                 /* Conforming */
-       intsrc.mpc_srcbus = MP_ISA_BUS;
-       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
-
-       /* 
-        * Use the default configuration for the IRQs 0-15.  Unless
-        * overridden by (MADT) interrupt source override entries.
-        */
-       for (i = 0; i < 16; i++) {
-               int idx;
-
-               for (idx = 0; idx < mp_irq_entries; idx++) {
-                       struct mpc_config_intsrc *irq = mp_irqs + idx;
-
-                       /* Do we already have a mapping for this ISA IRQ? */
-                       if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
-                               break;
-
-                       /* Do we already have a mapping for this IOAPIC pin */
-                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
-                               (irq->mpc_dstirq == i))
-                               break;
-               }
-
-               if (idx != mp_irq_entries) {
-                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
-                       continue;                       /* IRQ already used */
-               }
-
-               intsrc.mpc_irqtype = mp_INT;
-               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
-               intsrc.mpc_dstirq = i;
-
-               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
-                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, 
-                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, 
-                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, 
-                       intsrc.mpc_dstirq);
-
-               mp_irqs[mp_irq_entries] = intsrc;
-               if (++mp_irq_entries == MAX_IRQ_SOURCES)
-                       panic("Max # of irq sources exceeded!\n");
-       }
-}
-
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
-{
-       int ioapic = -1;
-       int ioapic_pin = 0;
-       int idx, bit = 0;
-
-       if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-               return gsi;
-
-       /* Don't set up the ACPI SCI because it's already set up */
-       if (acpi_gbl_FADT.sci_interrupt == gsi)
-               return gsi;
-
-       ioapic = mp_find_ioapic(gsi);
-       if (ioapic < 0) {
-               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-               return gsi;
-       }
-
-       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
-
-       /* 
-        * Avoid pin reprogramming.  PRTs typically include entries  
-        * with redundant pin->gsi mappings (but unique PCI devices);
-        * we only program the IOAPIC on the first.
-        */
-       bit = ioapic_pin % 32;
-       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
-       if (idx > 3) {
-               printk(KERN_ERR "Invalid reference to IOAPIC pin "
-                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, 
-                       ioapic_pin);
-               return gsi;
-       }
-       if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
-               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-               return gsi;
-       }
-
-       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
-
-       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-               triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-               polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-       return gsi;
-}
-#endif /*CONFIG_ACPI*/
diff --git a/arch/x86_64/kernel/nmi_64.c b/arch/x86_64/kernel/nmi_64.c
deleted file mode 100644 (file)
index 0ec6d2d..0000000
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- *  linux/arch/x86_64/nmi.c
- *
- *  NMI watchdog support on APIC systems
- *
- *  Started by Ingo Molnar <mingo@redhat.com>
- *
- *  Fixes:
- *  Mikael Pettersson  : AMD K7 support for local APIC NMI watchdog.
- *  Mikael Pettersson  : Power Management for local APIC NMI watchdog.
- *  Pavel Machek and
- *  Mikael Pettersson  : PM converted to driver model. Disable/enable API.
- */
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kdebug.h>
-
-#include <asm/smp.h>
-#include <asm/nmi.h>
-#include <asm/proto.h>
-#include <asm/mce.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-int panic_on_unrecovered_nmi;
-
-static cpumask_t backtrace_mask = CPU_MASK_NONE;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- *     be enabled
- *  0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0);          /* oprofile uses this */
-int panic_on_timeout;
-
-unsigned int nmi_watchdog = NMI_DEFAULT;
-static unsigned int nmi_hz = HZ;
-
-static DEFINE_PER_CPU(short, wd_enabled);
-
-/* local prototypes */
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
-
-/* Run after command line and cpu_init init, but before all other checks */
-void nmi_watchdog_default(void)
-{
-       if (nmi_watchdog != NMI_DEFAULT)
-               return;
-       nmi_watchdog = NMI_NONE;
-}
-
-static int endflag __initdata = 0;
-
-#ifdef CONFIG_SMP
-/* The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
-       local_irq_enable_in_hardirq();
-       /* Intentionally don't use cpu_relax here. This is
-          to make sure that the performance counter really ticks,
-          even if there is a simulator or similar that catches the
-          pause instruction. On a real HT machine this is fine because
-          all other CPUs are busy with "useless" delay loops and don't
-          care if they get somewhat less cycles. */
-       while (endflag == 0)
-               mb();
-}
-#endif
-
-int __init check_nmi_watchdog (void)
-{
-       int *counts;
-       int cpu;
-
-       if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) 
-               return 0;
-
-       if (!atomic_read(&nmi_active))
-               return 0;
-
-       counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
-       if (!counts)
-               return -1;
-
-       printk(KERN_INFO "testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
-#endif
-
-       for (cpu = 0; cpu < NR_CPUS; cpu++)
-               counts[cpu] = cpu_pda(cpu)->__nmi_count;
-       local_irq_enable();
-       mdelay((20*1000)/nmi_hz); // wait 20 ticks
-
-       for_each_online_cpu(cpu) {
-               if (!per_cpu(wd_enabled, cpu))
-                       continue;
-               if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
-                       printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
-                              cpu,
-                              counts[cpu],
-                              cpu_pda(cpu)->__nmi_count);
-                       per_cpu(wd_enabled, cpu) = 0;
-                       atomic_dec(&nmi_active);
-               }
-       }
-       if (!atomic_read(&nmi_active)) {
-               kfree(counts);
-               atomic_set(&nmi_active, -1);
-               endflag = 1;
-               return -1;
-       }
-       endflag = 1;
-       printk("OK.\n");
-
-       /* now that we know it works we can reduce NMI frequency to
-          something more reasonable; makes a difference in some configs */
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               nmi_hz = lapic_adjust_nmi_hz(1);
-
-       kfree(counts);
-       return 0;
-}
-
-int __init setup_nmi_watchdog(char *str)
-{
-       int nmi;
-
-       if (!strncmp(str,"panic",5)) {
-               panic_on_timeout = 1;
-               str = strchr(str, ',');
-               if (!str)
-                       return 1;
-               ++str;
-       }
-
-       get_option(&str, &nmi);
-
-       if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
-               return 0;
-
-       nmi_watchdog = nmi;
-       return 1;
-}
-
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-
-static void __acpi_nmi_disable(void *__unused)
-{
-       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
-}
-
-static void __acpi_nmi_enable(void *__unused)
-{
-       apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
-}
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
-       /* only CPU0 goes here, other CPUs should be offline */
-       nmi_pm_active = atomic_read(&nmi_active);
-       stop_apic_nmi_watchdog(NULL);
-       BUG_ON(atomic_read(&nmi_active) != 0);
-       return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
-       /* only CPU0 goes here, other CPUs should be offline */
-       if (nmi_pm_active > 0) {
-               setup_apic_nmi_watchdog(NULL);
-               touch_nmi_watchdog();
-       }
-       return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
-       set_kset_name("lapic_nmi"),
-       .resume         = lapic_nmi_resume,
-       .suspend        = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
-       .id             = 0,
-       .cls    = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
-       int error;
-
-       /* should really be a BUG_ON but b/c this is an
-        * init call, it just doesn't work.  -dcz
-        */
-       if (nmi_watchdog != NMI_LOCAL_APIC)
-               return 0;
-
-       if ( atomic_read(&nmi_active) < 0 )
-               return 0;
-
-       error = sysdev_class_register(&nmi_sysclass);
-       if (!error)
-               error = sysdev_register(&device_lapic_nmi);
-       return error;
-}
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-void setup_apic_nmi_watchdog(void *unused)
-{
-       if (__get_cpu_var(wd_enabled) == 1)
-               return;
-
-       /* cheap hack to support suspend/resume */
-       /* if cpu0 is not active neither should the other cpus */
-       if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
-               return;
-
-       switch (nmi_watchdog) {
-       case NMI_LOCAL_APIC:
-               __get_cpu_var(wd_enabled) = 1;
-               if (lapic_watchdog_init(nmi_hz) < 0) {
-                       __get_cpu_var(wd_enabled) = 0;
-                       return;
-               }
-               /* FALL THROUGH */
-       case NMI_IO_APIC:
-               __get_cpu_var(wd_enabled) = 1;
-               atomic_inc(&nmi_active);
-       }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
-       /* only support LOCAL and IO APICs for now */
-       if ((nmi_watchdog != NMI_LOCAL_APIC) &&
-           (nmi_watchdog != NMI_IO_APIC))
-               return;
-       if (__get_cpu_var(wd_enabled) == 0)
-               return;
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               lapic_watchdog_stop();
-       __get_cpu_var(wd_enabled) = 0;
-       atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(local_t, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
-       if (nmi_watchdog > 0) {
-               unsigned cpu;
-
-               /*
-                * Tell other CPUs to reset their alert counters. We cannot
-                * do it ourselves because the alert count increase is not
-                * atomic.
-                */
-               for_each_present_cpu(cpu) {
-                       if (per_cpu(nmi_touch, cpu) != 1)
-                               per_cpu(nmi_touch, cpu) = 1;
-               }
-       }
-
-       touch_softlockup_watchdog();
-}
-
-int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
-{
-       int sum;
-       int touched = 0;
-       int cpu = smp_processor_id();
-       int rc = 0;
-
-       /* check for other users first */
-       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-                       == NOTIFY_STOP) {
-               rc = 1;
-               touched = 1;
-       }
-
-       sum = read_pda(apic_timer_irqs);
-       if (__get_cpu_var(nmi_touch)) {
-               __get_cpu_var(nmi_touch) = 0;
-               touched = 1;
-       }
-
-       if (cpu_isset(cpu, backtrace_mask)) {
-               static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
-
-               spin_lock(&lock);
-               printk("NMI backtrace for cpu %d\n", cpu);
-               dump_stack();
-               spin_unlock(&lock);
-               cpu_clear(cpu, backtrace_mask);
-       }
-
-#ifdef CONFIG_X86_MCE
-       /* Could check oops_in_progress here too, but it's safer
-          not too */
-       if (atomic_read(&mce_entry) > 0)
-               touched = 1;
-#endif
-       /* if the apic timer isn't firing, this cpu isn't doing much */
-       if (!touched && __get_cpu_var(last_irq_sum) == sum) {
-               /*
-                * Ayiee, looks like this CPU is stuck ...
-                * wait a few IRQs (5 seconds) before doing the oops ...
-                */
-               local_inc(&__get_cpu_var(alert_counter));
-               if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
-                       die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
-                               panic_on_timeout);
-       } else {
-               __get_cpu_var(last_irq_sum) = sum;
-               local_set(&__get_cpu_var(alert_counter), 0);
-       }
-
-       /* see if the nmi watchdog went off */
-       if (!__get_cpu_var(wd_enabled))
-               return rc;
-       switch (nmi_watchdog) {
-       case NMI_LOCAL_APIC:
-               rc |= lapic_wd_event(nmi_hz);
-               break;
-       case NMI_IO_APIC:
-               /* don't know how to accurately check for this.
-                * just assume it was a watchdog timer interrupt
-                * This matches the old behaviour.
-                */
-               rc = 1;
-               break;
-       }
-       return rc;
-}
-
-static unsigned ignore_nmis;
-
-asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
-{
-       nmi_enter();
-       add_pda(__nmi_count,1);
-       if (!ignore_nmis)
-               default_do_nmi(regs);
-       nmi_exit();
-}
-
-int do_nmi_callback(struct pt_regs * regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
-       if (unknown_nmi_panic)
-               return unknown_nmi_panic_callback(regs, cpu);
-#endif
-       return 0;
-}
-
-void stop_nmi(void)
-{
-       acpi_nmi_disable();
-       ignore_nmis++;
-}
-
-void restart_nmi(void)
-{
-       ignore_nmis--;
-       acpi_nmi_enable();
-}
-
-#ifdef CONFIG_SYSCTL
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
-       unsigned char reason = get_nmi_reason();
-       char buf[64];
-
-       sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-       die_nmi(buf, regs, 1);  /* Always panic here */
-       return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
-                       void __user *buffer, size_t *length, loff_t *ppos)
-{
-       int old_state;
-
-       nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
-       old_state = nmi_watchdog_enabled;
-       proc_dointvec(table, write, file, buffer, length, ppos);
-       if (!!old_state == !!nmi_watchdog_enabled)
-               return 0;
-
-       if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) {
-               printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
-               return -EIO;
-       }
-
-       /* if nmi_watchdog is not set yet, then set it */
-       nmi_watchdog_default();
-
-       if (nmi_watchdog == NMI_LOCAL_APIC) {
-               if (nmi_watchdog_enabled)
-                       enable_lapic_nmi_watchdog();
-               else
-                       disable_lapic_nmi_watchdog();
-       } else {
-               printk( KERN_WARNING
-                       "NMI watchdog doesn't know what hardware to touch\n");
-               return -EIO;
-       }
-       return 0;
-}
-
-#endif
-
-void __trigger_all_cpu_backtrace(void)
-{
-       int i;
-
-       backtrace_mask = cpu_online_map;
-       /* Wait for up to 10 seconds for all CPUs to do the backtrace */
-       for (i = 0; i < 10 * 1000; i++) {
-               if (cpus_empty(backtrace_mask))
-                       break;
-               mdelay(1);
-       }
-}
-
-EXPORT_SYMBOL(nmi_active);
-EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-calgary_64.c b/arch/x86_64/kernel/pci-calgary_64.c
deleted file mode 100644 (file)
index 71da01e..0000000
+++ /dev/null
@@ -1,1578 +0,0 @@
-/*
- * Derived from arch/powerpc/kernel/iommu.c
- *
- * Copyright IBM Corporation, 2006-2007
- * Copyright (C) 2006  Jon Mason <jdmason@kudzu.us>
- *
- * Author: Jon Mason <jdmason@kudzu.us>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
-
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/dma-mapping.h>
-#include <linux/init.h>
-#include <linux/bitops.h>
-#include <linux/pci_ids.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <asm/iommu.h>
-#include <asm/calgary.h>
-#include <asm/tce.h>
-#include <asm/pci-direct.h>
-#include <asm/system.h>
-#include <asm/dma.h>
-#include <asm/rio.h>
-
-#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
-int use_calgary __read_mostly = 1;
-#else
-int use_calgary __read_mostly = 0;
-#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */
-
-#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
-#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
-
-/* register offsets inside the host bridge space */
-#define CALGARY_CONFIG_REG     0x0108
-#define PHB_CSR_OFFSET         0x0110 /* Channel Status */
-#define PHB_PLSSR_OFFSET       0x0120
-#define PHB_CONFIG_RW_OFFSET   0x0160
-#define PHB_IOBASE_BAR_LOW     0x0170
-#define PHB_IOBASE_BAR_HIGH    0x0180
-#define PHB_MEM_1_LOW          0x0190
-#define PHB_MEM_1_HIGH         0x01A0
-#define PHB_IO_ADDR_SIZE       0x01B0
-#define PHB_MEM_1_SIZE         0x01C0
-#define PHB_MEM_ST_OFFSET      0x01D0
-#define PHB_AER_OFFSET         0x0200
-#define PHB_CONFIG_0_HIGH      0x0220
-#define PHB_CONFIG_0_LOW       0x0230
-#define PHB_CONFIG_0_END       0x0240
-#define PHB_MEM_2_LOW          0x02B0
-#define PHB_MEM_2_HIGH         0x02C0
-#define PHB_MEM_2_SIZE_HIGH    0x02D0
-#define PHB_MEM_2_SIZE_LOW     0x02E0
-#define PHB_DOSHOLE_OFFSET     0x08E0
-
-/* CalIOC2 specific */
-#define PHB_SAVIOR_L2          0x0DB0
-#define PHB_PAGE_MIG_CTRL      0x0DA8
-#define PHB_PAGE_MIG_DEBUG     0x0DA0
-#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
-
-/* PHB_CONFIG_RW */
-#define PHB_TCE_ENABLE         0x20000000
-#define PHB_SLOT_DISABLE       0x1C000000
-#define PHB_DAC_DISABLE                0x01000000
-#define PHB_MEM2_ENABLE                0x00400000
-#define PHB_MCSR_ENABLE                0x00100000
-/* TAR (Table Address Register) */
-#define TAR_SW_BITS            0x0000ffffffff800fUL
-#define TAR_VALID              0x0000000000000008UL
-/* CSR (Channel/DMA Status Register) */
-#define CSR_AGENT_MASK         0xffe0ffff
-/* CCR (Calgary Configuration Register) */
-#define CCR_2SEC_TIMEOUT       0x000000000000000EUL
-/* PMCR/PMDR (Page Migration Control/Debug Registers */
-#define PMR_SOFTSTOP           0x80000000
-#define PMR_SOFTSTOPFAULT      0x40000000
-#define PMR_HARDSTOP           0x20000000
-
-#define MAX_NUM_OF_PHBS                8 /* how many PHBs in total? */
-#define MAX_NUM_CHASSIS                8 /* max number of chassis */
-/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
-#define MAX_PHB_BUS_NUM                (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
-#define PHBS_PER_CALGARY       4
-
-/* register offsets in Calgary's internal register space */
-static const unsigned long tar_offsets[] = {
-       0x0580 /* TAR0 */,
-       0x0588 /* TAR1 */,
-       0x0590 /* TAR2 */,
-       0x0598 /* TAR3 */
-};
-
-static const unsigned long split_queue_offsets[] = {
-       0x4870 /* SPLIT QUEUE 0 */,
-       0x5870 /* SPLIT QUEUE 1 */,
-       0x6870 /* SPLIT QUEUE 2 */,
-       0x7870 /* SPLIT QUEUE 3 */
-};
-
-static const unsigned long phb_offsets[] = {
-       0x8000 /* PHB0 */,
-       0x9000 /* PHB1 */,
-       0xA000 /* PHB2 */,
-       0xB000 /* PHB3 */
-};
-
-/* PHB debug registers */
-
-static const unsigned long phb_debug_offsets[] = {
-       0x4000  /* PHB 0 DEBUG */,
-       0x5000  /* PHB 1 DEBUG */,
-       0x6000  /* PHB 2 DEBUG */,
-       0x7000  /* PHB 3 DEBUG */
-};
-
-/*
- * STUFF register for each debug PHB,
- * byte 1 = start bus number, byte 2 = end bus number
- */
-
-#define PHB_DEBUG_STUFF_OFFSET 0x0020
-
-#define EMERGENCY_PAGES 32 /* = 128KB */
-
-unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
-static int translate_empty_slots __read_mostly = 0;
-static int calgary_detected __read_mostly = 0;
-
-static struct rio_table_hdr    *rio_table_hdr __initdata;
-static struct scal_detail      *scal_devs[MAX_NUMNODES] __initdata;
-static struct rio_detail       *rio_devs[MAX_NUMNODES * 4] __initdata;
-
-struct calgary_bus_info {
-       void *tce_space;
-       unsigned char translation_disabled;
-       signed char phbid;
-       void __iomem *bbar;
-};
-
-static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calgary_tce_cache_blast(struct iommu_table *tbl);
-static void calgary_dump_error_regs(struct iommu_table *tbl);
-static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
-static void calioc2_tce_cache_blast(struct iommu_table *tbl);
-static void calioc2_dump_error_regs(struct iommu_table *tbl);
-
-static struct cal_chipset_ops calgary_chip_ops = {
-       .handle_quirks = calgary_handle_quirks,
-       .tce_cache_blast = calgary_tce_cache_blast,
-       .dump_error_regs = calgary_dump_error_regs
-};
-
-static struct cal_chipset_ops calioc2_chip_ops = {
-       .handle_quirks = calioc2_handle_quirks,
-       .tce_cache_blast = calioc2_tce_cache_blast,
-       .dump_error_regs = calioc2_dump_error_regs
-};
-
-static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
-
-/* enable this to stress test the chip's TCE cache */
-#ifdef CONFIG_IOMMU_DEBUG
-int debugging __read_mostly = 1;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-       int expected, unsigned long start, unsigned long end)
-{
-       unsigned long idx = start;
-
-       BUG_ON(start >= end);
-
-       while (idx < end) {
-               if (!!test_bit(idx, bitmap) != expected)
-                       return idx;
-               ++idx;
-       }
-
-       /* all bits have the expected value */
-       return ~0UL;
-}
-#else /* debugging is disabled */
-int debugging __read_mostly = 0;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-       int expected, unsigned long start, unsigned long end)
-{
-       return ~0UL;
-}
-
-#endif /* CONFIG_IOMMU_DEBUG */
-
-static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
-{
-       unsigned int npages;
-
-       npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
-       npages >>= PAGE_SHIFT;
-
-       return npages;
-}
-
-static inline int translate_phb(struct pci_dev* dev)
-{
-       int disabled = bus_info[dev->bus->number].translation_disabled;
-       return !disabled;
-}
-
-static void iommu_range_reserve(struct iommu_table *tbl,
-       unsigned long start_addr, unsigned int npages)
-{
-       unsigned long index;
-       unsigned long end;
-       unsigned long badbit;
-       unsigned long flags;
-
-       index = start_addr >> PAGE_SHIFT;
-
-       /* bail out if we're asked to reserve a region we don't cover */
-       if (index >= tbl->it_size)
-               return;
-
-       end = index + npages;
-       if (end > tbl->it_size) /* don't go off the table */
-               end = tbl->it_size;
-
-       spin_lock_irqsave(&tbl->it_lock, flags);
-
-       badbit = verify_bit_range(tbl->it_map, 0, index, end);
-       if (badbit != ~0UL) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "Calgary: entry already allocated at "
-                              "0x%lx tbl %p dma 0x%lx npages %u\n",
-                              badbit, tbl, start_addr, npages);
-       }
-
-       set_bit_string(tbl->it_map, index, npages);
-
-       spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static unsigned long iommu_range_alloc(struct iommu_table *tbl,
-       unsigned int npages)
-{
-       unsigned long flags;
-       unsigned long offset;
-
-       BUG_ON(npages == 0);
-
-       spin_lock_irqsave(&tbl->it_lock, flags);
-
-       offset = find_next_zero_string(tbl->it_map, tbl->it_hint,
-                                      tbl->it_size, npages);
-       if (offset == ~0UL) {
-               tbl->chip_ops->tce_cache_blast(tbl);
-               offset = find_next_zero_string(tbl->it_map, 0,
-                                              tbl->it_size, npages);
-               if (offset == ~0UL) {
-                       printk(KERN_WARNING "Calgary: IOMMU full.\n");
-                       spin_unlock_irqrestore(&tbl->it_lock, flags);
-                       if (panic_on_overflow)
-                               panic("Calgary: fix the allocator.\n");
-                       else
-                               return bad_dma_address;
-               }
-       }
-
-       set_bit_string(tbl->it_map, offset, npages);
-       tbl->it_hint = offset + npages;
-       BUG_ON(tbl->it_hint > tbl->it_size);
-
-       spin_unlock_irqrestore(&tbl->it_lock, flags);
-
-       return offset;
-}
-
-static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
-       unsigned int npages, int direction)
-{
-       unsigned long entry;
-       dma_addr_t ret = bad_dma_address;
-
-       entry = iommu_range_alloc(tbl, npages);
-
-       if (unlikely(entry == bad_dma_address))
-               goto error;
-
-       /* set the return dma address */
-       ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
-
-       /* put the TCEs in the HW table */
-       tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
-                 direction);
-
-       return ret;
-
-error:
-       printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
-              "iommu %p\n", npages, tbl);
-       return bad_dma_address;
-}
-
-static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
-       unsigned int npages)
-{
-       unsigned long entry;
-       unsigned long badbit;
-       unsigned long badend;
-       unsigned long flags;
-
-       /* were we called with bad_dma_address? */
-       badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
-       if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
-               printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
-                      "address 0x%Lx\n", dma_addr);
-               WARN_ON(1);
-               return;
-       }
-
-       entry = dma_addr >> PAGE_SHIFT;
-
-       BUG_ON(entry + npages > tbl->it_size);
-
-       tce_free(tbl, entry, npages);
-
-       spin_lock_irqsave(&tbl->it_lock, flags);
-
-       badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
-       if (badbit != ~0UL) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "Calgary: bit is off at 0x%lx "
-                              "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
-                              badbit, tbl, dma_addr, entry, npages);
-       }
-
-       __clear_bit_string(tbl->it_map, entry, npages);
-
-       spin_unlock_irqrestore(&tbl->it_lock, flags);
-}
-
-static inline struct iommu_table *find_iommu_table(struct device *dev)
-{
-       struct pci_dev *pdev;
-       struct pci_bus *pbus;
-       struct iommu_table *tbl;
-
-       pdev = to_pci_dev(dev);
-
-       pbus = pdev->bus;
-
-       /* is the device behind a bridge? Look for the root bus */
-       while (pbus->parent)
-               pbus = pbus->parent;
-
-       tbl = pci_iommu(pbus);
-
-       BUG_ON(tbl && (tbl->it_busno != pbus->number));
-
-       return tbl;
-}
-
-static void calgary_unmap_sg(struct device *dev,
-       struct scatterlist *sglist, int nelems, int direction)
-{
-       struct iommu_table *tbl = find_iommu_table(dev);
-
-       if (!translate_phb(to_pci_dev(dev)))
-               return;
-
-       while (nelems--) {
-               unsigned int npages;
-               dma_addr_t dma = sglist->dma_address;
-               unsigned int dmalen = sglist->dma_length;
-
-               if (dmalen == 0)
-                       break;
-
-               npages = num_dma_pages(dma, dmalen);
-               iommu_free(tbl, dma, npages);
-               sglist++;
-       }
-}
-
-static int calgary_nontranslate_map_sg(struct device* dev,
-       struct scatterlist *sg, int nelems, int direction)
-{
-       int i;
-
-       for (i = 0; i < nelems; i++ ) {
-               struct scatterlist *s = &sg[i];
-               BUG_ON(!s->page);
-               s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
-               s->dma_length = s->length;
-       }
-       return nelems;
-}
-
-static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
-       int nelems, int direction)
-{
-       struct iommu_table *tbl = find_iommu_table(dev);
-       unsigned long vaddr;
-       unsigned int npages;
-       unsigned long entry;
-       int i;
-
-       if (!translate_phb(to_pci_dev(dev)))
-               return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
-
-       for (i = 0; i < nelems; i++ ) {
-               struct scatterlist *s = &sg[i];
-               BUG_ON(!s->page);
-
-               vaddr = (unsigned long)page_address(s->page) + s->offset;
-               npages = num_dma_pages(vaddr, s->length);
-
-               entry = iommu_range_alloc(tbl, npages);
-               if (entry == bad_dma_address) {
-                       /* makes sure unmap knows to stop */
-                       s->dma_length = 0;
-                       goto error;
-               }
-
-               s->dma_address = (entry << PAGE_SHIFT) | s->offset;
-
-               /* insert into HW table */
-               tce_build(tbl, entry, npages, vaddr & PAGE_MASK,
-                         direction);
-
-               s->dma_length = s->length;
-       }
-
-       return nelems;
-error:
-       calgary_unmap_sg(dev, sg, nelems, direction);
-       for (i = 0; i < nelems; i++) {
-               sg[i].dma_address = bad_dma_address;
-               sg[i].dma_length = 0;
-       }
-       return 0;
-}
-
-static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
-       size_t size, int direction)
-{
-       dma_addr_t dma_handle = bad_dma_address;
-       unsigned long uaddr;
-       unsigned int npages;
-       struct iommu_table *tbl = find_iommu_table(dev);
-
-       uaddr = (unsigned long)vaddr;
-       npages = num_dma_pages(uaddr, size);
-
-       if (translate_phb(to_pci_dev(dev)))
-               dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
-       else
-               dma_handle = virt_to_bus(vaddr);
-
-       return dma_handle;
-}
-
-static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
-       size_t size, int direction)
-{
-       struct iommu_table *tbl = find_iommu_table(dev);
-       unsigned int npages;
-
-       if (!translate_phb(to_pci_dev(dev)))
-               return;
-
-       npages = num_dma_pages(dma_handle, size);
-       iommu_free(tbl, dma_handle, npages);
-}
-
-static void* calgary_alloc_coherent(struct device *dev, size_t size,
-       dma_addr_t *dma_handle, gfp_t flag)
-{
-       void *ret = NULL;
-       dma_addr_t mapping;
-       unsigned int npages, order;
-       struct iommu_table *tbl = find_iommu_table(dev);
-
-       size = PAGE_ALIGN(size); /* size rounded up to full pages */
-       npages = size >> PAGE_SHIFT;
-       order = get_order(size);
-
-       /* alloc enough pages (and possibly more) */
-       ret = (void *)__get_free_pages(flag, order);
-       if (!ret)
-               goto error;
-       memset(ret, 0, size);
-
-       if (translate_phb(to_pci_dev(dev))) {
-               /* set up tces to cover the allocated range */
-               mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
-               if (mapping == bad_dma_address)
-                       goto free;
-
-               *dma_handle = mapping;
-       } else /* non translated slot */
-               *dma_handle = virt_to_bus(ret);
-
-       return ret;
-
-free:
-       free_pages((unsigned long)ret, get_order(size));
-       ret = NULL;
-error:
-       return ret;
-}
-
-static const struct dma_mapping_ops calgary_dma_ops = {
-       .alloc_coherent = calgary_alloc_coherent,
-       .map_single = calgary_map_single,
-       .unmap_single = calgary_unmap_single,
-       .map_sg = calgary_map_sg,
-       .unmap_sg = calgary_unmap_sg,
-};
-
-static inline void __iomem * busno_to_bbar(unsigned char num)
-{
-       return bus_info[num].bbar;
-}
-
-static inline int busno_to_phbid(unsigned char num)
-{
-       return bus_info[num].phbid;
-}
-
-static inline unsigned long split_queue_offset(unsigned char num)
-{
-       size_t idx = busno_to_phbid(num);
-
-       return split_queue_offsets[idx];
-}
-
-static inline unsigned long tar_offset(unsigned char num)
-{
-       size_t idx = busno_to_phbid(num);
-
-       return tar_offsets[idx];
-}
-
-/* Base register offset of the PHB that bus @num belongs to. */
-static inline unsigned long phb_offset(unsigned char num)
-{
-       return phb_offsets[(size_t)busno_to_phbid(num)];
-}
-
-/*
- * Form a register address by OR-ing @offset into the mapped base @bar.
- */
-static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
-{
-       return (void __iomem*)(((unsigned long)bar) | offset);
-}
-
-/* Non-zero when @device is the CalIOC2 PCI device id. */
-static inline int is_calioc2(unsigned short device)
-{
-       return device == PCI_DEVICE_ID_IBM_CALIOC2;
-}
-
-/* Non-zero when @device is the Calgary PCI device id. */
-static inline int is_calgary(unsigned short device)
-{
-       return device == PCI_DEVICE_ID_IBM_CALGARY;
-}
-
-/* Non-zero when @device is either of the two supported chip ids. */
-static inline int is_cal_pci_dev(unsigned short device)
-{
-       if (is_calgary(device))
-               return 1;
-
-       return is_calioc2(device);
-}
-
-/*
- * Flush the TCE cache of the Calgary PHB described by @tbl.
- *
- * Bus arbitration is switched off, the split queues are polled (at
- * most 100 reads) until they report idle, the TAR is rewritten with
- * tbl->tar_val to invalidate the cache, and finally the saved
- * arbitration setting is restored.
- */
-static void calgary_tce_cache_blast(struct iommu_table *tbl)
-{
-       u64 val;
-       u32 aer;
-       int i = 0;
-       void __iomem *bbar = tbl->bbar;
-       void __iomem *target;
-
-       /* disable arbitration on the bus */
-       target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
-       aer = readl(target);
-       writel(0, target);
-
-       /* read plssr to ensure it got there */
-       target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
-       val = readl(target);
-
-       /* poll split queues until all DMA activity is done */
-       target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
-       do {
-               val = readq(target);
-               i++;
-       } while ((val & 0xff) != 0xff && i < 100);
-       /*
-        * Judge by the queue status, not by the iteration count: the
-        * bus may go idle on the very last read, which must not warn.
-        */
-       if ((val & 0xff) != 0xff)
-               printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
-                      "continuing anyway\n");
-
-       /* invalidate TCE cache */
-       target = calgary_reg(bbar, tar_offset(tbl->it_busno));
-       writeq(tbl->tar_val, target);
-
-       /* enable arbitration */
-       target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
-       writel(aer, target);
-       (void)readl(target); /* flush */
-}
-
-/*
- * Flush the TCE cache of the CalIOC2 PHB described by @tbl.
- *
- * Follows the numbered sequence below: request SoftStop, wait for the
- * split queues to drain, retry from the top (up to 100 attempts) if a
- * SoftStopFault is reported, then enter HardStop, rewrite the TAR with
- * tbl->tar_val to invalidate the cache and finally clear the Page
- * Migration Control register again.
- */
-static void calioc2_tce_cache_blast(struct iommu_table *tbl)
-{
-       void __iomem *bbar = tbl->bbar;
-       void __iomem *target;
-       u64 val64;
-       u32 val;
-       int i;
-       int count = 1;
-       unsigned char bus = tbl->it_busno;
-
-begin:
-       /* every attempt gets the full split queue polling budget */
-       i = 0;
-
-       printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
-              "sequence - count %d\n", bus, count);
-
-       /* 1. using the Page Migration Control reg set SoftStop */
-       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
-       val = be32_to_cpu(readl(target));
-       printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
-       val |= PMR_SOFTSTOP;
-       printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
-       writel(cpu_to_be32(val), target);
-
-       /* 2. poll split queues until all DMA activity is done */
-       printk(KERN_DEBUG "2a. starting to poll split queues\n");
-       target = calgary_reg(bbar, split_queue_offset(bus));
-       do {
-               val64 = readq(target);
-               i++;
-       } while ((val64 & 0xff) != 0xff && i < 100);
-       /* warn only if the queues are still busy after the last read */
-       if ((val64 & 0xff) != 0xff)
-               printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
-                      "continuing anyway\n");
-
-       /* 3. poll Page Migration DEBUG for SoftStopFault */
-       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
-       val = be32_to_cpu(readl(target));
-       printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
-
-       /* 4. if SoftStopFault - goto (1) */
-       if (val & PMR_SOFTSTOPFAULT) {
-               if (++count < 100)
-                       goto begin;
-               else {
-                       printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
-                              "aborting TCE cache flush sequence!\n");
-                       return; /* pray for the best */
-               }
-       }
-
-       /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
-       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
-       printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
-       val = be32_to_cpu(readl(target));
-       printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
-       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
-       val = be32_to_cpu(readl(target));
-       printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
-
-       /* 6. invalidate TCE cache */
-       printk(KERN_DEBUG "6. invalidating TCE cache\n");
-       target = calgary_reg(bbar, tar_offset(bus));
-       writeq(tbl->tar_val, target);
-
-       /* 7. Re-read PMCR */
-       printk(KERN_DEBUG "7a. Re-reading PMCR\n");
-       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
-       val = be32_to_cpu(readl(target));
-       printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
-
-       /* 8. Remove HardStop */
-       printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
-       target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
-       val = 0;
-       printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
-       writel(cpu_to_be32(val), target);
-       val = be32_to_cpu(readl(target));
-       printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
-}
-
-/*
- * Reserve the IO address range starting at @start in the IOMMU table
- * of @dev's bus. @limit is first rounded up to the next 1MB boundary
- * (low 20 bits set, plus one); the page count is the distance from
- * @start to that rounded value.
- */
-static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
-       u64 limit)
-{
-       u64 end = (limit | 0xfffff) + 1;
-       unsigned int numpages = ((end - start) >> PAGE_SHIFT);
-
-       iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
-}
-
-/*
- * Read the peripheral MEM_1 window registers (low, high, size) of
- * @dev's PHB and reserve that region in the IOMMU table.
- */
-static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
-{
-       struct iommu_table *tbl = pci_iommu(dev->bus);
-       void __iomem *bbar = tbl->bbar;
-       unsigned long phb = phb_offset(dev->bus->number);
-       u64 low, high, sizelow;
-
-       /* peripheral MEM_1 region */
-       low = be32_to_cpu(readl(calgary_reg(bbar, phb | PHB_MEM_1_LOW)));
-       high = be32_to_cpu(readl(calgary_reg(bbar, phb | PHB_MEM_1_HIGH)));
-       sizelow = be32_to_cpu(readl(calgary_reg(bbar, phb | PHB_MEM_1_SIZE)));
-
-       calgary_reserve_mem_region(dev, (high << 32) | low, sizelow);
-}
-
-/*
- * Reserve the peripheral MEM_2 window of @dev's PHB in the IOMMU
- * table. Nothing is done unless PHB_MEM2_ENABLE is set in the PHB
- * config register.
- */
-static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
-{
-       struct iommu_table *tbl = pci_iommu(dev->bus);
-       void __iomem *bbar = tbl->bbar;
-       unsigned long phb = phb_offset(dev->bus->number);
-       u64 low, high, sizelow, sizehigh;
-       u32 cfg;
-
-       /* is it enabled? */
-       cfg = be32_to_cpu(readl(calgary_reg(bbar, phb | PHB_CONFIG_RW_OFFSET)));
-       if (!(cfg & PHB_MEM2_ENABLE))
-               return;
-
-       low = be32_to_cpu(readl(calgary_reg(bbar, phb | PHB_MEM_2_LOW)));
-       high = be32_to_cpu(readl(calgary_reg(bbar, phb | PHB_MEM_2_HIGH)));
-       sizelow = be32_to_cpu(readl(calgary_reg(bbar,
-                                               phb | PHB_MEM_2_SIZE_LOW)));
-       sizehigh = be32_to_cpu(readl(calgary_reg(bbar,
-                                                phb | PHB_MEM_2_SIZE_HIGH)));
-
-       calgary_reserve_mem_region(dev, (high << 32) | low,
-                                  (sizehigh << 32) | sizelow);
-}
-
-/*
- * Some regions of the IO address space are not translated, so devices
- * must never be handed IO addresses that fall inside them. These are
- * the 640KB-1MB region and the two PCI peripheral memory holes.
- * Reserve all of them in the IOMMU bitmap up front so they are never
- * given out later.
- */
-static void __init calgary_reserve_regions(struct pci_dev *dev)
-{
-       struct iommu_table *tbl = pci_iommu(dev->bus);
-       unsigned int npages;
-       u64 start;
-
-       /* reserve EMERGENCY_PAGES from bad_dma_address and up */
-       iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
-
-       if (!is_calgary(dev->device)) {
-               /* calioc2 - avoid the entire first MB */
-               start = 0;
-               npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
-       } else {
-               /* calgary - avoid the BIOS/VGA 640KB-1MB region */
-               start = (640 * 1024);
-               npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
-       }
-       iommu_range_reserve(tbl, start, npages);
-
-       /* reserve the two PCI peripheral memory regions in IO space */
-       calgary_reserve_peripheral_mem_1(dev);
-       calgary_reserve_peripheral_mem_2(dev);
-}
-
-/*
- * Build the TCE table for @dev's bus and program the PHB's TAR
- * register with it.
- *
- * The table is cleared, the chip-specific ops are selected from the
- * PCI device id, the untranslated regions are reserved, and the TAR
- * is written with the table's physical address and size code. The
- * value written is also cached in tbl->tar_val.
- *
- * Returns 0 on success or the error from build_tce_table().
- */
-static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
-{
-       u64 val64;
-       u64 table_phys;
-       void __iomem *target;
-       int ret;
-       struct iommu_table *tbl;
-
-       /* build TCE tables for each PHB */
-       ret = build_tce_table(dev, bbar);
-       if (ret)
-               return ret;
-
-       /* point the table at the TCE space recorded for this bus and clear it */
-       tbl = pci_iommu(dev->bus);
-       tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
-       tce_free(tbl, 0, tbl->it_size);
-
-       /* only the two known chip ids are supported; anything else is a bug */
-       if (is_calgary(dev->device))
-               tbl->chip_ops = &calgary_chip_ops;
-       else if (is_calioc2(dev->device))
-               tbl->chip_ops = &calioc2_chip_ops;
-       else
-               BUG();
-
-       calgary_reserve_regions(dev);
-
-       /* set TARs for each PHB */
-       target = calgary_reg(bbar, tar_offset(dev->bus->number));
-       val64 = be64_to_cpu(readq(target));
-
-       /* zero out all TAR bits under sw control */
-       val64 &= ~TAR_SW_BITS;
-       table_phys = (u64)__pa(tbl->it_base);
-
-       val64 |= table_phys;
-
-       /* the size code is OR-ed into the TAR value as well */
-       BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
-       val64 |= (u64) specified_table_size;
-
-       /* keep the big-endian register image for later rewrites of the TAR */
-       tbl->tar_val = cpu_to_be64(val64);
-
-       writeq(tbl->tar_val, target);
-       readq(target); /* flush */
-
-       return 0;
-}
-
-/*
- * Undo the per-bus setup for @dev: clear the software-controlled TAR
- * bits, free the allocation bitmap and the iommu_table itself, detach
- * the table from the bus and forget the bus's TCE space pointer.
- */
-static void __init calgary_free_bus(struct pci_dev *dev)
-{
-       u64 val64;
-       struct iommu_table *tbl = pci_iommu(dev->bus);
-       void __iomem *target;
-       unsigned int bitmapsz;
-
-       /* clear the sw-controlled TAR bits */
-       target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
-       val64 = be64_to_cpu(readq(target));
-       val64 &= ~TAR_SW_BITS;
-       writeq(cpu_to_be64(val64), target);
-       readq(target); /* flush */
-
-       /* the bitmap holds one bit per table entry */
-       bitmapsz = tbl->it_size / BITS_PER_BYTE;
-       free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
-       tbl->it_map = NULL;
-
-       kfree(tbl);
-       
-       set_pci_iommu(dev->bus, NULL);
-
-       /* Can't free bootmem allocated memory after system is up :-( */
-       bus_info[dev->bus->number].tce_space = NULL;
-}
-
-/*
- * Log the CSR and PLSSR registers of the Calgary PHB described by
- * @tbl at KERN_EMERG level.
- */
-static void calgary_dump_error_regs(struct iommu_table *tbl)
-{
-       void __iomem *bbar = tbl->bbar;
-       unsigned long phb = phb_offset(tbl->it_busno);
-       u32 csr, plssr;
-
-       csr = be32_to_cpu(readl(calgary_reg(bbar, phb | PHB_CSR_OFFSET)));
-       plssr = be32_to_cpu(readl(calgary_reg(bbar, phb | PHB_PLSSR_OFFSET)));
-
-       /* If no error, the agent ID in the CSR is not valid */
-       printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
-              "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
-}
-
-/*
- * Log the error-related registers of the CalIOC2 PHB described by
- * @tbl at KERN_EMERG level: CSR, PLSSR, CSMR, MCK, the seven error
- * registers at 0x810 - 0x870 and the root complex status.
- */
-static void calioc2_dump_error_regs(struct iommu_table *tbl)
-{
-       void __iomem *bbar = tbl->bbar;
-       u32 csr, csmr, plssr, mck, rcstat;
-       void __iomem *target;
-       unsigned long phboff = phb_offset(tbl->it_busno);
-       unsigned long erroff;
-       u32 errregs[7];
-       int i;
-
-       /* dump CSR */
-       target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
-       csr = be32_to_cpu(readl(target));
-       /* dump PLSSR */
-       target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
-       plssr = be32_to_cpu(readl(target));
-       /* dump CSMR */
-       target = calgary_reg(bbar, phboff | 0x290);
-       csmr = be32_to_cpu(readl(target));
-       /* dump mck */
-       target = calgary_reg(bbar, phboff | 0x800);
-       mck = be32_to_cpu(readl(target));
-
-       printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
-              tbl->it_busno);
-
-       printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
-              csr, plssr, csmr, mck);
-
-       /* dump rest of error regs */
-       printk(KERN_EMERG "Calgary: ");
-       for (i = 0; i < ARRAY_SIZE(errregs); i++) {
-               /* err regs are at 0x810 - 0x870 */
-               erroff = (0x810 + (i * 0x10));
-               target = calgary_reg(bbar, phboff | erroff);
-               errregs[i] = be32_to_cpu(readl(target));
-               /* continuation of the "Calgary: " line started above */
-               printk("0x%08x@0x%lx ", errregs[i], erroff);
-       }
-       printk("\n");
-
-       /* root complex status */
-       target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
-       rcstat = be32_to_cpu(readl(target));
-       printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
-              PHB_ROOT_COMPLEX_STATUS);
-}
-
-/*
- * Timer callback that checks a PHB for DMA errors.
- *
- * @data is the struct pci_dev pointer stored by
- * calgary_enable_translation(). When the CSR shows no agent id the
- * timer is re-armed for two seconds later. Otherwise the error
- * registers are dumped, the CSR is cleared and PHB_SLOT_DISABLE is set
- * in the PHB config register; the timer is not re-armed in that case.
- */
-static void calgary_watchdog(unsigned long data)
-{
-       struct pci_dev *dev = (struct pci_dev *)data;
-       struct iommu_table *tbl = pci_iommu(dev->bus);
-       void __iomem *bbar = tbl->bbar;
-       void __iomem *csr_reg;
-       void __iomem *cfg_reg;
-       u32 val32;
-
-       csr_reg = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
-       val32 = be32_to_cpu(readl(csr_reg));
-
-       /* If no error, the agent ID in the CSR is not valid */
-       if (!(val32 & CSR_AGENT_MASK)) {
-               /* Reset the timer */
-               mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
-               return;
-       }
-
-       tbl->chip_ops->dump_error_regs(tbl);
-
-       /* reset error */
-       writel(0, csr_reg);
-
-       /* Disable bus that caused the error */
-       cfg_reg = calgary_reg(bbar, phb_offset(tbl->it_busno) |
-                             PHB_CONFIG_RW_OFFSET);
-       val32 = be32_to_cpu(readl(cfg_reg));
-       val32 |= PHB_SLOT_DISABLE;
-       writel(cpu_to_be32(val32), cfg_reg);
-       readl(cfg_reg); /* flush */
-}
-
-/*
- * Program the 4-bit split completion timeout field of the PHB that
- * @busnum belongs to in CALGARY_CONFIG_REG. The position of the field
- * depends on the PHB id (0-3); any other id is a bug.
- */
-static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
-       unsigned char busnum, unsigned long timeout)
-{
-       void __iomem *target = calgary_reg(bbar, CALGARY_CONFIG_REG);
-       unsigned int phb_shift = ~0; /* silence gcc */
-       int phbid = busno_to_phbid(busnum);
-       u64 val64;
-
-       switch (phbid) {
-       case 0:
-               phb_shift = (63 - 19);
-               break;
-       case 1:
-               phb_shift = (63 - 23);
-               break;
-       case 2:
-               phb_shift = (63 - 27);
-               break;
-       case 3:
-               phb_shift = (63 - 35);
-               break;
-       default:
-               BUG_ON(phbid);
-       }
-
-       val64 = be64_to_cpu(readq(target));
-
-       /* zero out this PHB's timer bits, then install the new value */
-       val64 &= ~(0xFUL << phb_shift);
-       val64 |= (timeout << phb_shift);
-       writeq(cpu_to_be64(val64), target);
-       readq(target); /* flush */
-}
-
-/*
- * Apply CalIOC2-specific register tweaks for @dev's PHB: set the
- * 0x00800000 bit in the PHB_SAVIOR_L2 register.
- */
-static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
-       unsigned char busnum = dev->bus->number;
-       void __iomem *bbar = tbl->bbar;
-       void __iomem *target;
-       u32 val;
-
-       /*
-        * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
-        */
-       target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
-       /* register contents are big-endian: convert to cpu order on read */
-       val = be32_to_cpu(readl(target));
-       val |= 0x00800000;
-       writel(cpu_to_be32(val), target);
-}
-
-/*
- * Apply Calgary-specific tweaks for @dev's bus. Only bus 1 of a
- * Calgary chip is touched: it gets a longer split completion timeout.
- */
-static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
-{
-       unsigned char busnum = dev->bus->number;
-
-       if (!is_calgary(dev->device) || busnum != 1)
-               return;
-
-       /*
-        * Give split completion a longer timeout on bus 1 for aic94xx
-        * http://bugzilla.kernel.org/show_bug.cgi?id=7180
-        */
-       calgary_set_split_completion_timeout(tbl->bbar, busnum,
-                                            CCR_2SEC_TIMEOUT);
-}
-
-/*
- * Turn on TCE translation for @dev's PHB by setting PHB_TCE_ENABLE,
- * PHB_DAC_DISABLE and PHB_MCSR_ENABLE in the PHB config register, then
- * start the per-table watchdog timer so that it fires immediately.
- */
-static void __init calgary_enable_translation(struct pci_dev *dev)
-{
-       struct iommu_table *tbl = pci_iommu(dev->bus);
-       unsigned char busnum = dev->bus->number;
-       void __iomem *cfg_reg;
-       const char *chip;
-       u32 val32;
-
-       /* enable TCE in PHB Config Register */
-       cfg_reg = calgary_reg(tbl->bbar,
-                             phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
-       val32 = be32_to_cpu(readl(cfg_reg));
-       val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
-
-       if (dev->device == PCI_DEVICE_ID_IBM_CALGARY)
-               chip = "Calgary";
-       else
-               chip = "CalIOC2";
-
-       printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
-              chip, busnum);
-       printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
-              "bus.\n");
-
-       writel(cpu_to_be32(val32), cfg_reg);
-       readl(cfg_reg); /* flush */
-
-       /* the watchdog gets the pci_dev back through its data word */
-       init_timer(&tbl->watchdog_timer);
-       tbl->watchdog_timer.function = &calgary_watchdog;
-       tbl->watchdog_timer.data = (unsigned long)dev;
-       mod_timer(&tbl->watchdog_timer, jiffies);
-}
-
-/*
- * Turn off TCE translation for @dev's PHB by clearing PHB_TCE_ENABLE,
- * PHB_DAC_DISABLE and PHB_MCSR_ENABLE in the PHB config register, then
- * stop the watchdog timer.
- */
-static void __init calgary_disable_translation(struct pci_dev *dev)
-{
-       struct iommu_table *tbl = pci_iommu(dev->bus);
-       unsigned char busnum = dev->bus->number;
-       void __iomem *cfg_reg;
-       u32 val32;
-
-       /* disable TCE in PHB Config Register */
-       cfg_reg = calgary_reg(tbl->bbar,
-                             phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
-       val32 = be32_to_cpu(readl(cfg_reg));
-       val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
-
-       printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
-       writel(cpu_to_be32(val32), cfg_reg);
-       readl(cfg_reg); /* flush */
-
-       del_timer_sync(&tbl->watchdog_timer);
-}
-
-/*
- * Set up a PHB whose translation is disabled: take a reference on
- * @dev, leave the bus without an IOMMU table and record @dev as the
- * "self" device of the parent bus if there is one, else of its own bus.
- */
-static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
-{
-       pci_dev_get(dev);
-       set_pci_iommu(dev->bus, NULL);
-
-       /* not behind a bridge: the device represents its own bus */
-       if (!dev->bus->parent) {
-               dev->bus->self = dev;
-               return;
-       }
-
-       dev->bus->parent->self = dev;
-}
-
-/*
- * Bring up translation for one PHB: program its TAR, take a reference
- * on @dev, record @dev as the bus's "self" device, apply chip quirks
- * and enable translation.
- *
- * Returns 0 on success or the error from calgary_setup_tar().
- */
-static int __init calgary_init_one(struct pci_dev *dev)
-{
-       struct iommu_table *tbl;
-       void __iomem *bbar;
-       int ret;
-
-       BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
-
-       bbar = busno_to_bbar(dev->bus->number);
-       ret = calgary_setup_tar(dev, bbar);
-       if (ret)
-               return ret;
-
-       pci_dev_get(dev);
-
-       if (!dev->bus->parent) {
-               dev->bus->self = dev;
-       } else {
-               /* complain if the parent already has a device recorded */
-               if (dev->bus->parent->self)
-                       printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
-                              "bus->parent->self!\n", dev);
-               dev->bus->parent->self = dev;
-       }
-
-       tbl = pci_iommu(dev->bus);
-       tbl->chip_ops->handle_quirks(tbl, dev);
-
-       calgary_enable_translation(dev);
-
-       return 0;
-}
-
-/*
- * Map the config space of every Calgary-type entry in the Rio Grande
- * table and record, for each PCI bus behind each of its PHBs, the
- * mapping and the PHB id in bus_info[].
- *
- * Returns 0 on success. If a mapping fails, every mapping recorded so
- * far is released and -ENODATA is returned.
- */
-static int __init calgary_locate_bbars(void)
-{
-       int ret;
-       int rioidx, phb, bus;
-       void __iomem *bbar;
-       void __iomem *target;
-       unsigned long offset;
-       u8 start_bus, end_bus;
-       u32 val;
-
-       ret = -ENODATA;
-       for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
-               struct rio_detail *rio = rio_devs[rioidx];
-
-               if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
-                       continue;
-
-               /* map entire 1MB of Calgary config space */
-               bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
-               if (!bbar)
-                       goto error;
-
-               for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
-                       offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
-                       target = calgary_reg(bbar, offset);
-
-                       val = be32_to_cpu(readl(target));
-
-                       /* bus range of this PHB: start in bits 16-23, end in 8-15 */
-                       start_bus = (u8)((val & 0x00FF0000) >> 16);
-                       end_bus = (u8)((val & 0x0000FF00) >> 8);
-
-                       if (end_bus) {
-                               for (bus = start_bus; bus <= end_bus; bus++) {
-                                       bus_info[bus].bbar = bbar;
-                                       bus_info[bus].phbid = phb;
-                               }
-                       } else {
-                               bus_info[start_bus].bbar = bbar;
-                               bus_info[start_bus].phbid = phb;
-                       }
-               }
-       }
-
-       return 0;
-
-error:
-       /* scan bus_info and iounmap any bbars we previously ioremap'd */
-       for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) {
-               int alias;
-
-               bbar = bus_info[bus].bbar;
-               if (!bbar)
-                       continue;
-
-               iounmap(bbar);
-
-               /*
-                * All buses of one chip were given the same mapping above.
-                * Clear every entry that refers to it so that it is
-                * unmapped exactly once and no stale pointer remains.
-                */
-               for (alias = bus; alias < ARRAY_SIZE(bus_info); alias++)
-                       if (bus_info[alias].bbar == bbar)
-                               bus_info[alias].bbar = NULL;
-       }
-
-       return ret;
-}
-
-/*
- * Initialise every Calgary/CalIOC2 PHB found on the PCI bus.
- *
- * Untranslated PHBs get the minimal setup; translated ones are fully
- * initialised unless they have no TCE space and empty slots are not
- * being translated. If one PHB fails, the devices before it are walked
- * in reverse order and their setup is undone.
- *
- * Returns 0 on success or the first error encountered.
- */
-static int __init calgary_init(void)
-{
-       struct pci_dev *dev = NULL;
-       int ret;
-
-       ret = calgary_locate_bbars();
-       if (ret)
-               return ret;
-
-       while ((dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev))) {
-               if (!is_cal_pci_dev(dev->device))
-                       continue;
-               if (!translate_phb(dev)) {
-                       calgary_init_one_nontraslated(dev);
-                       continue;
-               }
-               if (!bus_info[dev->bus->number].tce_space &&
-                   !translate_empty_slots)
-                       continue;
-
-               ret = calgary_init_one(dev);
-               if (ret)
-                       goto error;
-       }
-
-       return ret;
-
-error:
-       /* walk backwards from the failing device, applying the same filters */
-       while ((dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
-                                            PCI_ANY_ID, dev))) {
-               if (!is_cal_pci_dev(dev->device))
-                       continue;
-               if (!translate_phb(dev)) {
-                       pci_dev_put(dev);
-                       continue;
-               }
-               if (!bus_info[dev->bus->number].tce_space &&
-                   !translate_empty_slots)
-                       continue;
-
-               calgary_disable_translation(dev);
-               calgary_free_bus(dev);
-               pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
-       }
-
-       return ret;
-}
-
-/*
- * Pick the TCE table size code for a machine whose highest RAM
- * address is @ram. A size given on the command line always wins.
- */
-static inline int __init determine_tce_table_size(u64 ram)
-{
-       int order;
-
-       if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
-               return specified_table_size;
-
-       /*
-        * Table sizes run from 0 to 7 (TCE_TABLE_SIZE_64K to
-        * TCE_TABLE_SIZE_8M). Size 0 has 8K entries and every larger
-        * size doubles that, so shift the max ram address by 13 to
-        * divide by 8K and use the order of the result, capped at the
-        * largest size.
-        */
-       order = get_order(ram >> 13);
-
-       return (order > TCE_TABLE_SIZE_8M) ? TCE_TABLE_SIZE_8M : order;
-}
-
-/*
- * Fill scal_devs[] and rio_devs[] with pointers to the detail records
- * that follow the Rio Grande table header. The record sizes depend on
- * the table version (2 or 3).
- *
- * Returns 0 on success, -ENODEV when the table lists more scalability
- * devices than MAX_NUMNODES, or -EPROTO for an unknown table version.
- */
-static int __init build_detail_arrays(void)
-{
-       int i, scal_detail_size, rio_detail_size;
-       unsigned long ptr;
-
-       if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
-               printk(KERN_WARNING
-                       "Calgary: MAX_NUMNODES too low! Defined as %d, "
-                       "but system has %d nodes.\n",
-                       MAX_NUMNODES, rio_table_hdr->num_scal_dev);
-               return -ENODEV;
-       }
-
-       if (rio_table_hdr->version == 2) {
-               scal_detail_size = 11;
-               rio_detail_size = 13;
-       } else if (rio_table_hdr->version == 3) {
-               scal_detail_size = 12;
-               rio_detail_size = 15;
-       } else {
-               printk(KERN_WARNING
-                      "Calgary: Invalid Rio Grande Table Version: %d\n",
-                      rio_table_hdr->version);
-               return -EPROTO;
-       }
-
-       /* the detail records start 3 bytes into the header */
-       ptr = ((unsigned long)rio_table_hdr) + 3;
-
-       for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
-               scal_devs[i] = (struct scal_detail *)ptr;
-               ptr += scal_detail_size;
-       }
-
-       /* the rio records follow directly after the scal records */
-       for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
-               rio_devs[i] = (struct rio_detail *)ptr;
-               ptr += rio_detail_size;
-       }
-
-       return 0;
-}
-
-/*
- * Report whether any device is present on @bus. For CalIOC2 this is
- * always 1; for Calgary, slots 1-7 are probed and a slot counts as
- * populated when its first config dword is not all ones.
- */
-static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
-{
-       int slot;
-
-       if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
-               /*
-                * FIXME: properly scan for devices across the
-                * PCI-to-PCI bridge on every CalIOC2 port.
-                */
-               return 1;
-       }
-
-       for (slot = 1; slot < 8; slot++) {
-               if (read_pci_config(bus, slot, 0, 0) != 0xffffffff)
-                       return 1;
-       }
-
-       return 0;
-}
-
-/*
- * Early detection of the Calgary IOMMU.
- *
- * Locates the Rio Grande table in the BIOS EBDA, builds the detail
- * arrays, picks the TCE table size and allocates a TCE table for every
- * Calgary/CalIOC2 bus that has devices (or for all of them when
- * translate_empty_slots is set). On success iommu_detected and
- * calgary_detected are set. If a table allocation fails, the tables
- * allocated so far are freed and nothing is flagged as detected.
- */
-void __init detect_calgary(void)
-{
-       int bus;
-       void *tbl;
-       int calgary_found = 0;
-       unsigned long ptr;
-       unsigned int offset, prev_offset;
-       int ret;
-
-       /*
-        * if the user specified iommu=off or iommu=soft or we found
-        * another HW IOMMU already, bail out.
-        */
-       if (swiotlb || no_iommu || iommu_detected)
-               return;
-
-       if (!use_calgary)
-               return;
-
-       if (!early_pci_allowed())
-               return;
-
-       printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
-
-       ptr = (unsigned long)phys_to_virt(get_bios_ebda());
-
-       rio_table_hdr = NULL;
-       prev_offset = 0;
-       offset = 0x180;
-       /*
-        * The next offset is stored in the 1st word.
-        * Only parse up until the offset increases:
-        */
-       while (offset > prev_offset) {
-               /* The block id is stored in the 2nd word */
-               if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
-                       /* set the pointer past the offset & block id */
-                       rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
-                       break;
-               }
-               prev_offset = offset;
-               offset = *((unsigned short *)(ptr + offset));
-       }
-       if (!rio_table_hdr) {
-               printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
-                      "in EBDA - bailing!\n");
-               return;
-       }
-
-       ret = build_detail_arrays();
-       if (ret) {
-               printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
-               return;
-       }
-
-       specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
-
-       for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
-               struct calgary_bus_info *info = &bus_info[bus];
-               unsigned short pci_device;
-               u32 val;
-
-               /* the device id is the upper half of the first config dword */
-               val = read_pci_config(bus, 0, 0, 0);
-               pci_device = (val & 0xFFFF0000) >> 16;
-
-               if (!is_cal_pci_dev(pci_device))
-                       continue;
-
-               if (info->translation_disabled)
-                       continue;
-
-               if (calgary_bus_has_devices(bus, pci_device) ||
-                   translate_empty_slots) {
-                       tbl = alloc_tce_table();
-                       if (!tbl)
-                               goto cleanup;
-                       info->tce_space = tbl;
-                       calgary_found = 1;
-               }
-       }
-
-       printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
-              calgary_found ? "found" : "not found");
-
-       if (calgary_found) {
-               iommu_detected = 1;
-               calgary_detected = 1;
-               printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
-               printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
-                      "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
-                      debugging ? "enabled" : "disabled");
-       }
-       return;
-
-cleanup:
-       /* the current bus got no table; release those of the earlier buses */
-       for (--bus; bus >= 0; --bus) {
-               struct calgary_bus_info *info = &bus_info[bus];
-
-               if (info->tce_space) {
-                       free_tce_table(info->tce_space);
-                       /* don't leave a stale pointer behind in bus_info */
-                       info->tce_space = NULL;
-               }
-       }
-}
-
-/*
- * Switch the system over to the Calgary IOMMU if one was detected.
- *
- * Returns 0 on success, -ENODEV when the IOMMU is disabled, swiotlb is
- * in use or no Calgary was detected, or the error from calgary_init().
- */
-int __init calgary_iommu_init(void)
-{
-       int ret;
-
-       if (no_iommu || swiotlb || !calgary_detected)
-               return -ENODEV;
-
-       /* ok, we're trying to use Calgary - let's roll */
-       printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
-
-       ret = calgary_init();
-       if (!ret) {
-               force_iommu = 1;
-               bad_dma_address = 0x0;
-               dma_ops = &calgary_dma_ops;
-               return 0;
-       }
-
-       printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
-              "falling back to no_iommu\n", ret);
-       if (end_pfn > MAX_DMA32_PFN)
-               printk(KERN_ERR "WARNING more than 4GB of memory, "
-                               "32bit PCI may malfunction.\n");
-       return ret;
-}
-
-/*
- * Parse the comma separated "calgary=" boot option.
- *
- * Recognised items: a TCE table size ("64k" ... "8M"),
- * "translate_empty_slots", and "disable=<bus>" which turns translation
- * off for the given PHB bus number. Items are matched by prefix.
- * Always returns 1.
- */
-static int __init calgary_parse_options(char *p)
-{
-       static const struct {
-               const char *name;
-               int size;
-       } sizes[] = {
-               { "64k",  TCE_TABLE_SIZE_64K },
-               { "128k", TCE_TABLE_SIZE_128K },
-               { "256k", TCE_TABLE_SIZE_256K },
-               { "512k", TCE_TABLE_SIZE_512K },
-               { "1M",   TCE_TABLE_SIZE_1M },
-               { "2M",   TCE_TABLE_SIZE_2M },
-               { "4M",   TCE_TABLE_SIZE_4M },
-               { "8M",   TCE_TABLE_SIZE_8M },
-       };
-       unsigned int bridge;
-       size_t len;
-       size_t k;
-       char* endp;
-
-       while (*p) {
-               /* first matching size prefix wins */
-               for (k = 0; k < ARRAY_SIZE(sizes); k++) {
-                       if (!strncmp(p, sizes[k].name, strlen(sizes[k].name))) {
-                               specified_table_size = sizes[k].size;
-                               break;
-                       }
-               }
-
-               len = strlen("translate_empty_slots");
-               if (!strncmp(p, "translate_empty_slots", len))
-                       translate_empty_slots = 1;
-
-               len = strlen("disable");
-               if (!strncmp(p, "disable", len)) {
-                       p += len;
-                       if (*p == '=')
-                               ++p;
-                       if (*p == '\0')
-                               break;
-                       bridge = simple_strtol(p, &endp, 0);
-                       /* no digits consumed: give up on the whole option */
-                       if (p == endp)
-                               break;
-
-                       if (bridge < MAX_PHB_BUS_NUM) {
-                               printk(KERN_INFO "Calgary: disabling "
-                                      "translation for PHB %#x\n", bridge);
-                               bus_info[bridge].translation_disabled = 1;
-                       }
-               }
-
-               /* move on to the item after the next ',' if there is one */
-               p = strpbrk(p, ",");
-               if (!p)
-                       break;
-
-               p++; /* skip ',' */
-       }
-       return 1;
-}
-__setup("calgary=", calgary_parse_options);
-
-/*
- * Reserve, in the IOMMU table of @dev's bus, the address ranges of
- * the four bridge resources of @dev that are memory resources with a
- * non-zero start.
- */
-static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
-{
-       struct iommu_table *tbl = pci_iommu(dev->bus);
-       unsigned int npages;
-       int i;
-
-       for (i = PCI_BRIDGE_RESOURCES; i < PCI_BRIDGE_RESOURCES + 4; i++) {
-               struct resource *r = &dev->resource[i];
-
-               /*
-                * Don't give out TCEs that map MEM resources. 0-based
-                * ones are skipped: we reserve the whole 1st MB anyway.
-                */
-               if (!(r->flags & IORESOURCE_MEM) || !r->start)
-                       continue;
-
-               /* cover the whole region */
-               npages = ((r->end - r->start) >> PAGE_SHIFT) + 1;
-
-               iommu_range_reserve(tbl, r->start, npages);
-       }
-}
-
-/*
- * Initcall: for every translated Calgary/CalIOC2 PHB that has a TCE
- * space, reserve its bridge memory resources in the IOMMU table.
- *
- * Returns -ENODEV when Calgary is not in use, 0 otherwise.
- */
-static int __init calgary_fixup_tce_spaces(void)
-{
-       struct pci_dev *dev = NULL;
-
-       if (no_iommu || swiotlb || !calgary_detected)
-               return -ENODEV;
-
-       printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
-
-       while ((dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev))) {
-               if (!is_cal_pci_dev(dev->device))
-                       continue;
-               if (!translate_phb(dev))
-                       continue;
-               if (!bus_info[dev->bus->number].tce_space)
-                       continue;
-
-               calgary_fixup_one_tce_space(dev);
-       }
-
-       return 0;
-}
-
-/*
- * This must be called after pcibios_assign_resources (fs_initcall level)
- * and before device_initcall.
- */
-rootfs_initcall(calgary_fixup_tce_spaces);
diff --git a/arch/x86_64/kernel/pci-dma_64.c b/arch/x86_64/kernel/pci-dma_64.c
deleted file mode 100644 (file)
index 2971144..0000000
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Dynamic DMA mapping support.
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/module.h>
-#include <asm/io.h>
-#include <asm/iommu.h>
-#include <asm/calgary.h>
-
-int iommu_merge __read_mostly = 0;
-EXPORT_SYMBOL(iommu_merge);
-
-dma_addr_t bad_dma_address __read_mostly;
-EXPORT_SYMBOL(bad_dma_address);
-
-/* This tells the BIO block layer to assume merging. Default to off
-   because we cannot guarantee merging later. */
-int iommu_bio_merge __read_mostly = 0;
-EXPORT_SYMBOL(iommu_bio_merge);
-
-static int iommu_sac_force __read_mostly = 0;
-
-int no_iommu __read_mostly;
-#ifdef CONFIG_IOMMU_DEBUG
-int panic_on_overflow __read_mostly = 1;
-int force_iommu __read_mostly = 1;
-#else
-int panic_on_overflow __read_mostly = 0;
-int force_iommu __read_mostly= 0;
-#endif
-
-/* Set this to 1 if there is a HW IOMMU in the system */
-int iommu_detected __read_mostly = 0;
-
-/* Dummy device used for NULL arguments (normally ISA). Better would
-   be probably a smaller DMA mask, but this is bug-to-bug compatible
-   to i386. */
-struct device fallback_dev = {
-       .bus_id = "fallback device",
-       .coherent_dma_mask = DMA_32BIT_MASK,
-       .dma_mask = &fallback_dev.coherent_dma_mask,
-};
-
-/* Allocate DMA memory on node near device */
-noinline static void *
-dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
-{
-       struct page *page;
-       int node;
-#ifdef CONFIG_PCI
-       if (dev->bus == &pci_bus_type)
-               node = pcibus_to_node(to_pci_dev(dev)->bus);
-       else
-#endif
-               node = numa_node_id();
-
-       if (node < first_node(node_online_map))
-               node = first_node(node_online_map);
-
-       page = alloc_pages_node(node, gfp, order);
-       return page ? page_address(page) : NULL;
-}
-
-/*
- * Allocate memory for a coherent mapping.
- */
-void *
-dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-                  gfp_t gfp)
-{
-       void *memory;
-       unsigned long dma_mask = 0;
-       u64 bus;
-
-       if (!dev)
-               dev = &fallback_dev;
-       dma_mask = dev->coherent_dma_mask;
-       if (dma_mask == 0)
-               dma_mask = DMA_32BIT_MASK;
-
-       /* Device not DMA able */
-       if (dev->dma_mask == NULL)
-               return NULL;
-
-       /* Don't invoke OOM killer */
-       gfp |= __GFP_NORETRY;
-
-       /* Kludge to make it bug-to-bug compatible with i386. i386
-          uses the normal dma_mask for alloc_coherent. */
-       dma_mask &= *dev->dma_mask;
-
-       /* Why <=? Even when the mask is smaller than 4GB it is often
-          larger than 16MB and in this case we have a chance of
-          finding fitting memory in the next higher zone first. If
-          not retry with true GFP_DMA. -AK */
-       if (dma_mask <= DMA_32BIT_MASK)
-               gfp |= GFP_DMA32;
-
- again:
-       memory = dma_alloc_pages(dev, gfp, get_order(size));
-       if (memory == NULL)
-               return NULL;
-
-       {
-               int high, mmu;
-               bus = virt_to_bus(memory);
-               high = (bus + size) >= dma_mask;
-               mmu = high;
-               if (force_iommu && !(gfp & GFP_DMA))
-                       mmu = 1;
-               else if (high) {
-                       free_pages((unsigned long)memory,
-                                  get_order(size));
-
-                       /* Don't use the 16MB ZONE_DMA unless absolutely
-                          needed. It's better to use remapping first. */
-                       if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
-                               gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
-                               goto again;
-                       }
-
-                       /* Let low level make its own zone decisions */
-                       gfp &= ~(GFP_DMA32|GFP_DMA);
-
-                       if (dma_ops->alloc_coherent)
-                               return dma_ops->alloc_coherent(dev, size,
-                                                          dma_handle, gfp);
-                       return NULL;
-               }
-
-               memset(memory, 0, size);
-               if (!mmu) {
-                       *dma_handle = virt_to_bus(memory);
-                       return memory;
-               }
-       }
-
-       if (dma_ops->alloc_coherent) {
-               free_pages((unsigned long)memory, get_order(size));
-               gfp &= ~(GFP_DMA|GFP_DMA32);
-               return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
-       }
-
-       if (dma_ops->map_simple) {
-               *dma_handle = dma_ops->map_simple(dev, memory,
-                                             size,
-                                             PCI_DMA_BIDIRECTIONAL);
-               if (*dma_handle != bad_dma_address)
-                       return memory;
-       }
-
-       if (panic_on_overflow)
-               panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
-       free_pages((unsigned long)memory, get_order(size));
-       return NULL;
-}
-EXPORT_SYMBOL(dma_alloc_coherent);
-
-/*
- * Unmap coherent memory.
- * The caller must ensure that the device has finished accessing the mapping.
- */
-void dma_free_coherent(struct device *dev, size_t size,
-                        void *vaddr, dma_addr_t bus)
-{
-       if (dma_ops->unmap_single)
-               dma_ops->unmap_single(dev, bus, size, 0);
-       free_pages((unsigned long)vaddr, get_order(size));
-}
-EXPORT_SYMBOL(dma_free_coherent);
-
-static int forbid_dac __read_mostly;
-
-int dma_supported(struct device *dev, u64 mask)
-{
-#ifdef CONFIG_PCI
-       if (mask > 0xffffffff && forbid_dac > 0) {
-
-
-
-               printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id);
-               return 0;
-       }
-#endif
-
-       if (dma_ops->dma_supported)
-               return dma_ops->dma_supported(dev, mask);
-
-       /* Copied from i386. Doesn't make much sense, because it will
-          only work for pci_alloc_coherent.
-          The caller just has to use GFP_DMA in this case. */
-        if (mask < DMA_24BIT_MASK)
-                return 0;
-
-       /* Tell the device to use SAC when IOMMU force is on.  This
-          allows the driver to use cheaper accesses in some cases.
-
-          Problem with this is that if we overflow the IOMMU area and
-          return DAC as fallback address the device may not handle it
-          correctly.
-
-          As a special case some controllers have a 39bit address
-          mode that is as efficient as 32bit (aic79xx). Don't force
-          SAC for these.  Assume all masks <= 40 bits are of this
-          type. Normally this doesn't make any difference, but gives
-          more gentle handling of IOMMU overflow. */
-       if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
-               printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
-               return 0;
-       }
-
-       return 1;
-}
-EXPORT_SYMBOL(dma_supported);
-
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       *dev->dma_mask = mask;
-       return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
-/*
- * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
- * documentation.
- */
-__init int iommu_setup(char *p)
-{
-       iommu_merge = 1;
-
-       if (!p)
-               return -EINVAL;
-
-       while (*p) {
-               if (!strncmp(p,"off",3))
-                       no_iommu = 1;
-               /* gart_parse_options has more force support */
-               if (!strncmp(p,"force",5))
-                       force_iommu = 1;
-               if (!strncmp(p,"noforce",7)) {
-                       iommu_merge = 0;
-                       force_iommu = 0;
-               }
-
-               if (!strncmp(p, "biomerge",8)) {
-                       iommu_bio_merge = 4096;
-                       iommu_merge = 1;
-                       force_iommu = 1;
-               }
-               if (!strncmp(p, "panic",5))
-                       panic_on_overflow = 1;
-               if (!strncmp(p, "nopanic",7))
-                       panic_on_overflow = 0;
-               if (!strncmp(p, "merge",5)) {
-                       iommu_merge = 1;
-                       force_iommu = 1;
-               }
-               if (!strncmp(p, "nomerge",7))
-                       iommu_merge = 0;
-               if (!strncmp(p, "forcesac",8))
-                       iommu_sac_force = 1;
-               if (!strncmp(p, "allowdac", 8))
-                       forbid_dac = 0;
-               if (!strncmp(p, "nodac", 5))
-                       forbid_dac = -1;
-
-#ifdef CONFIG_SWIOTLB
-               if (!strncmp(p, "soft",4))
-                       swiotlb = 1;
-#endif
-
-#ifdef CONFIG_IOMMU
-               gart_parse_options(p);
-#endif
-
-#ifdef CONFIG_CALGARY_IOMMU
-               if (!strncmp(p, "calgary", 7))
-                       use_calgary = 1;
-#endif /* CONFIG_CALGARY_IOMMU */
-
-               p += strcspn(p, ",");
-               if (*p == ',')
-                       ++p;
-       }
-       return 0;
-}
-early_param("iommu", iommu_setup);
-
-void __init pci_iommu_alloc(void)
-{
-       /*
-        * The order of these functions is important for
-        * fall-back/fail-over reasons
-        */
-#ifdef CONFIG_IOMMU
-       iommu_hole_init();
-#endif
-
-#ifdef CONFIG_CALGARY_IOMMU
-       detect_calgary();
-#endif
-
-#ifdef CONFIG_SWIOTLB
-       pci_swiotlb_init();
-#endif
-}
-
-static int __init pci_iommu_init(void)
-{
-#ifdef CONFIG_CALGARY_IOMMU
-       calgary_iommu_init();
-#endif
-
-#ifdef CONFIG_IOMMU
-       gart_iommu_init();
-#endif
-
-       no_iommu_init();
-       return 0;
-}
-
-void pci_iommu_shutdown(void)
-{
-       gart_iommu_shutdown();
-}
-
-#ifdef CONFIG_PCI
-/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-
-static __devinit void via_no_dac(struct pci_dev *dev)
-{
-       if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
-               printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
-               forbid_dac = 1;
-       }
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
-#endif
-/* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
diff --git a/arch/x86_64/kernel/pci-gart_64.c b/arch/x86_64/kernel/pci-gart_64.c
deleted file mode 100644 (file)
index 4918c57..0000000
+++ /dev/null
@@ -1,740 +0,0 @@
-/*
- * Dynamic DMA mapping support for AMD Hammer.
- * 
- * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
- * This allows to use PCI devices that only support 32bit addresses on systems
- * with more than 4GB. 
- *
- * See Documentation/DMA-mapping.txt for the interface specification.
- * 
- * Copyright 2002 Andi Kleen, SuSE Labs.
- */
-
-#include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/agp_backend.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/module.h>
-#include <linux/topology.h>
-#include <linux/interrupt.h>
-#include <linux/bitops.h>
-#include <linux/kdebug.h>
-#include <asm/atomic.h>
-#include <asm/io.h>
-#include <asm/mtrr.h>
-#include <asm/pgtable.h>
-#include <asm/proto.h>
-#include <asm/iommu.h>
-#include <asm/cacheflush.h>
-#include <asm/swiotlb.h>
-#include <asm/dma.h>
-#include <asm/k8.h>
-
-unsigned long iommu_bus_base;  /* GART remapping area (physical) */
-static unsigned long iommu_size;       /* size of remapping area bytes */
-static unsigned long iommu_pages;      /* .. and in pages */
-
-u32 *iommu_gatt_base;          /* Remapping table */
-
-/* If this is disabled the IOMMU will use an optimized flushing strategy
-   of only flushing when an mapping is reused. With it true the GART is flushed 
-   for every mapping. Problem is that doing the lazy flush seems to trigger
-   bugs with some popular PCI cards, in particular 3ware (but has been also
-   also seen with Qlogic at least). */
-int iommu_fullflush = 1;
-
-/* Allocation bitmap for the remapping area */ 
-static DEFINE_SPINLOCK(iommu_bitmap_lock);
-static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
-
-static u32 gart_unmapped_entry; 
-
-#define GPTE_VALID    1
-#define GPTE_COHERENT 2
-#define GPTE_ENCODE(x) \
-       (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
-#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
-
-#define to_pages(addr,size) \
-       (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
-
-#define EMERGENCY_PAGES 32 /* = 128KB */ 
-
-#ifdef CONFIG_AGP
-#define AGPEXTERN extern
-#else
-#define AGPEXTERN
-#endif
-
-/* backdoor interface to AGP driver */
-AGPEXTERN int agp_memory_reserved;
-AGPEXTERN __u32 *agp_gatt_table;
-
-static unsigned long next_bit;  /* protected by iommu_bitmap_lock */
-static int need_flush;                 /* global flush state. set for each gart wrap */
-
-static unsigned long alloc_iommu(int size) 
-{      
-       unsigned long offset, flags;
-
-       spin_lock_irqsave(&iommu_bitmap_lock, flags);   
-       offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
-       if (offset == -1) {
-               need_flush = 1;
-               offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
-       }
-       if (offset != -1) { 
-               set_bit_string(iommu_gart_bitmap, offset, size); 
-               next_bit = offset+size; 
-               if (next_bit >= iommu_pages) { 
-                       next_bit = 0;
-                       need_flush = 1;
-               } 
-       } 
-       if (iommu_fullflush)
-               need_flush = 1;
-       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);      
-       return offset;
-} 
-
-static void free_iommu(unsigned long offset, int size)
-{ 
-       unsigned long flags;
-       spin_lock_irqsave(&iommu_bitmap_lock, flags);
-       __clear_bit_string(iommu_gart_bitmap, offset, size);
-       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
-} 
-
-/* 
- * Use global flush state to avoid races with multiple flushers.
- */
-static void flush_gart(void)
-{ 
-       unsigned long flags;
-       spin_lock_irqsave(&iommu_bitmap_lock, flags);
-       if (need_flush) {
-               k8_flush_garts();
-               need_flush = 0;
-       } 
-       spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
-} 
-
-#ifdef CONFIG_IOMMU_LEAK
-
-#define SET_LEAK(x) if (iommu_leak_tab) \
-                       iommu_leak_tab[x] = __builtin_return_address(0);
-#define CLEAR_LEAK(x) if (iommu_leak_tab) \
-                       iommu_leak_tab[x] = NULL;
-
-/* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab; 
-static int leak_trace;
-int iommu_leak_pages = 20; 
-void dump_leak(void)
-{
-       int i;
-       static int dump; 
-       if (dump || !iommu_leak_tab) return;
-       dump = 1;
-       show_stack(NULL,NULL);
-       /* Very crude. dump some from the end of the table too */ 
-       printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); 
-       for (i = 0; i < iommu_leak_pages; i+=2) {
-               printk("%lu: ", iommu_pages-i);
-               printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
-               printk("%c", (i+1)%2 == 0 ? '\n' : ' '); 
-       } 
-       printk("\n");
-}
-#else
-#define SET_LEAK(x)
-#define CLEAR_LEAK(x)
-#endif
-
-static void iommu_full(struct device *dev, size_t size, int dir)
-{
-       /* 
-        * Ran out of IOMMU space for this operation. This is very bad.
-        * Unfortunately the drivers cannot handle this operation properly.
-        * Return some non mapped prereserved space in the aperture and 
-        * let the Northbridge deal with it. This will result in garbage
-        * in the IO operation. When the size exceeds the prereserved space
-        * memory corruption will occur or random memory will be DMAed 
-        * out. Hopefully no network devices use single mappings that big.
-        */ 
-       
-       printk(KERN_ERR 
-  "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
-              size, dev->bus_id);
-
-       if (size > PAGE_SIZE*EMERGENCY_PAGES) {
-               if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
-                       panic("PCI-DMA: Memory would be corrupted\n");
-               if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 
-                       panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n");
-       } 
-
-#ifdef CONFIG_IOMMU_LEAK
-       dump_leak(); 
-#endif
-} 
-
-static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
-{ 
-       u64 mask = *dev->dma_mask;
-       int high = addr + size > mask;
-       int mmu = high;
-       if (force_iommu) 
-               mmu = 1; 
-       return mmu; 
-}
-
-static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
-{ 
-       u64 mask = *dev->dma_mask;
-       int high = addr + size > mask;
-       int mmu = high;
-       return mmu; 
-}
-
-/* Map a single continuous physical area into the IOMMU.
- * Caller needs to check if the iommu is needed and flush.
- */
-static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
-                               size_t size, int dir)
-{ 
-       unsigned long npages = to_pages(phys_mem, size);
-       unsigned long iommu_page = alloc_iommu(npages);
-       int i;
-       if (iommu_page == -1) {
-               if (!nonforced_iommu(dev, phys_mem, size))
-                       return phys_mem; 
-               if (panic_on_overflow)
-                       panic("dma_map_area overflow %lu bytes\n", size);
-               iommu_full(dev, size, dir);
-               return bad_dma_address;
-       }
-
-       for (i = 0; i < npages; i++) {
-               iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
-               SET_LEAK(iommu_page + i);
-               phys_mem += PAGE_SIZE;
-       }
-       return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
-}
-
-static dma_addr_t gart_map_simple(struct device *dev, char *buf,
-                                size_t size, int dir)
-{
-       dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
-       flush_gart();
-       return map;
-}
-
-/* Map a single area into the IOMMU */
-static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
-{
-       unsigned long phys_mem, bus;
-
-       if (!dev)
-               dev = &fallback_dev;
-
-       phys_mem = virt_to_phys(addr); 
-       if (!need_iommu(dev, phys_mem, size))
-               return phys_mem; 
-
-       bus = gart_map_simple(dev, addr, size, dir);
-       return bus; 
-}
-
-/*
- * Free a DMA mapping.
- */
-static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
-                     size_t size, int direction)
-{
-       unsigned long iommu_page;
-       int npages;
-       int i;
-
-       if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
-           dma_addr >= iommu_bus_base + iommu_size)
-               return;
-       iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
-       npages = to_pages(dma_addr, size);
-       for (i = 0; i < npages; i++) {
-               iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
-               CLEAR_LEAK(iommu_page + i);
-       }
-       free_iommu(iommu_page, npages);
-}
-
-/*
- * Wrapper for pci_unmap_single working with scatterlists.
- */
-static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
-{
-       int i;
-
-       for (i = 0; i < nents; i++) {
-               struct scatterlist *s = &sg[i];
-               if (!s->dma_length || !s->length)
-                       break;
-               gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
-       }
-}
-
-/* Fallback for dma_map_sg in case of overflow */
-static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
-                              int nents, int dir)
-{
-       int i;
-
-#ifdef CONFIG_IOMMU_DEBUG
-       printk(KERN_DEBUG "dma_map_sg overflow\n");
-#endif
-
-       for (i = 0; i < nents; i++ ) {
-               struct scatterlist *s = &sg[i];
-               unsigned long addr = page_to_phys(s->page) + s->offset; 
-               if (nonforced_iommu(dev, addr, s->length)) { 
-                       addr = dma_map_area(dev, addr, s->length, dir);
-                       if (addr == bad_dma_address) { 
-                               if (i > 0) 
-                                       gart_unmap_sg(dev, sg, i, dir);
-                               nents = 0; 
-                               sg[0].dma_length = 0;
-                               break;
-                       }
-               }
-               s->dma_address = addr;
-               s->dma_length = s->length;
-       }
-       flush_gart();
-       return nents;
-}
-
-/* Map multiple scatterlist entries continuous into the first. */
-static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
-                     struct scatterlist *sout, unsigned long pages)
-{
-       unsigned long iommu_start = alloc_iommu(pages);
-       unsigned long iommu_page = iommu_start; 
-       int i;
-
-       if (iommu_start == -1)
-               return -1;
-       
-       for (i = start; i < stopat; i++) {
-               struct scatterlist *s = &sg[i];
-               unsigned long pages, addr;
-               unsigned long phys_addr = s->dma_address;
-               
-               BUG_ON(i > start && s->offset);
-               if (i == start) {
-                       *sout = *s; 
-                       sout->dma_address = iommu_bus_base;
-                       sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
-                       sout->dma_length = s->length;
-               } else { 
-                       sout->dma_length += s->length; 
-               }
-
-               addr = phys_addr;
-               pages = to_pages(s->offset, s->length); 
-               while (pages--) { 
-                       iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 
-                       SET_LEAK(iommu_page);
-                       addr += PAGE_SIZE;
-                       iommu_page++;
-               }
-       } 
-       BUG_ON(iommu_page - iommu_start != pages);      
-       return 0;
-}
-
-static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
-                     struct scatterlist *sout,
-                     unsigned long pages, int need)
-{
-       if (!need) { 
-               BUG_ON(stopat - start != 1);
-               *sout = sg[start]; 
-               sout->dma_length = sg[start].length; 
-               return 0;
-       } 
-       return __dma_map_cont(sg, start, stopat, sout, pages);
-}
-               
-/*
- * DMA map all entries in a scatterlist.
- * Merge chunks that have page aligned sizes into a continuous mapping. 
- */
-int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
-{
-       int i;
-       int out;
-       int start;
-       unsigned long pages = 0;
-       int need = 0, nextneed;
-
-       if (nents == 0) 
-               return 0;
-
-       if (!dev)
-               dev = &fallback_dev;
-
-       out = 0;
-       start = 0;
-       for (i = 0; i < nents; i++) {
-               struct scatterlist *s = &sg[i];
-               dma_addr_t addr = page_to_phys(s->page) + s->offset;
-               s->dma_address = addr;
-               BUG_ON(s->length == 0); 
-
-               nextneed = need_iommu(dev, addr, s->length); 
-
-               /* Handle the previous not yet processed entries */
-               if (i > start) {
-                       struct scatterlist *ps = &sg[i-1];
-                       /* Can only merge when the last chunk ends on a page 
-                          boundary and the new one doesn't have an offset. */
-                       if (!iommu_merge || !nextneed || !need || s->offset ||
-                           (ps->offset + ps->length) % PAGE_SIZE) { 
-                               if (dma_map_cont(sg, start, i, sg+out, pages,
-                                                need) < 0)
-                                       goto error;
-                               out++;
-                               pages = 0;
-                               start = i;      
-                       }
-               }
-
-               need = nextneed;
-               pages += to_pages(s->offset, s->length);
-       }
-       if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
-               goto error;
-       out++;
-       flush_gart();
-       if (out < nents) 
-               sg[out].dma_length = 0; 
-       return out;
-
-error:
-       flush_gart();
-       gart_unmap_sg(dev, sg, nents, dir);
-       /* When it was forced or merged try again in a dumb way */
-       if (force_iommu || iommu_merge) {
-               out = dma_map_sg_nonforce(dev, sg, nents, dir);
-               if (out > 0)
-                       return out;
-       }
-       if (panic_on_overflow)
-               panic("dma_map_sg: overflow on %lu pages\n", pages);
-       iommu_full(dev, pages << PAGE_SHIFT, dir);
-       for (i = 0; i < nents; i++)
-               sg[i].dma_address = bad_dma_address;
-       return 0;
-} 
-
-static int no_agp;
-
-static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
-{ 
-       unsigned long a; 
-       if (!iommu_size) { 
-               iommu_size = aper_size; 
-               if (!no_agp) 
-                       iommu_size /= 2; 
-       } 
-
-       a = aper + iommu_size; 
-       iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
-
-       if (iommu_size < 64*1024*1024) 
-               printk(KERN_WARNING
-  "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); 
-       
-       return iommu_size;
-} 
-
-static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) 
-{ 
-       unsigned aper_size = 0, aper_base_32;
-       u64 aper_base;
-       unsigned aper_order;
-
-       pci_read_config_dword(dev, 0x94, &aper_base_32); 
-       pci_read_config_dword(dev, 0x90, &aper_order);
-       aper_order = (aper_order >> 1) & 7;     
-
-       aper_base = aper_base_32 & 0x7fff; 
-       aper_base <<= 25;
-
-       aper_size = (32 * 1024 * 1024) << aper_order; 
-       if (aper_base + aper_size > 0x100000000UL || !aper_size)
-               aper_base = 0;
-
-       *size = aper_size;
-       return aper_base;
-} 
-
-/* 
- * Private Northbridge GATT initialization in case we cannot use the
- * AGP driver for some reason.  
- */
-static __init int init_k8_gatt(struct agp_kern_info *info)
-{ 
-       struct pci_dev *dev;
-       void *gatt;
-       unsigned aper_base, new_aper_base;
-       unsigned aper_size, gatt_size, new_aper_size;
-       int i;
-
-       printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
-       aper_size = aper_base = info->aper_size = 0;
-       dev = NULL;
-       for (i = 0; i < num_k8_northbridges; i++) {
-               dev = k8_northbridges[i];
-               new_aper_base = read_aperture(dev, &new_aper_size); 
-               if (!new_aper_base) 
-                       goto nommu; 
-               
-               if (!aper_base) { 
-                       aper_size = new_aper_size;
-                       aper_base = new_aper_base;
-               } 
-               if (aper_size != new_aper_size || aper_base != new_aper_base) 
-                       goto nommu;
-       }
-       if (!aper_base)
-               goto nommu; 
-       info->aper_base = aper_base;
-       info->aper_size = aper_size>>20; 
-
-       gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 
-       gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 
-       if (!gatt) 
-               panic("Cannot allocate GATT table");
-       if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE))
-               panic("Could not set GART PTEs to uncacheable pages");
-       global_flush_tlb();
-
-       memset(gatt, 0, gatt_size); 
-       agp_gatt_table = gatt;
-
-       for (i = 0; i < num_k8_northbridges; i++) {
-               u32 ctl; 
-               u32 gatt_reg; 
-
-               dev = k8_northbridges[i];
-               gatt_reg = __pa(gatt) >> 12; 
-               gatt_reg <<= 4; 
-               pci_write_config_dword(dev, 0x98, gatt_reg);
-               pci_read_config_dword(dev, 0x90, &ctl); 
-
-               ctl |= 1;
-               ctl &= ~((1<<4) | (1<<5));
-
-               pci_write_config_dword(dev, 0x90, ctl); 
-       }
-       flush_gart();
-       
-       printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 
-       return 0;
-
- nommu:
-       /* Should not happen anymore */
-       printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
-              KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
-       return -1; 
-} 
-
-extern int agp_amd64_init(void);
-
-static const struct dma_mapping_ops gart_dma_ops = {
-       .mapping_error = NULL,
-       .map_single = gart_map_single,
-       .map_simple = gart_map_simple,
-       .unmap_single = gart_unmap_single,
-       .sync_single_for_cpu = NULL,
-       .sync_single_for_device = NULL,
-       .sync_single_range_for_cpu = NULL,
-       .sync_single_range_for_device = NULL,
-       .sync_sg_for_cpu = NULL,
-       .sync_sg_for_device = NULL,
-       .map_sg = gart_map_sg,
-       .unmap_sg = gart_unmap_sg,
-};
-
-void gart_iommu_shutdown(void)
-{
-       struct pci_dev *dev;
-       int i;
-
-       if (no_agp && (dma_ops != &gart_dma_ops))
-               return;
-
-        for (i = 0; i < num_k8_northbridges; i++) {
-                u32 ctl;
-
-                dev = k8_northbridges[i];
-                pci_read_config_dword(dev, 0x90, &ctl);
-
-                ctl &= ~1;
-
-                pci_write_config_dword(dev, 0x90, ctl);
-        }
-}
-
-void __init gart_iommu_init(void)
-{ 
-       struct agp_kern_info info;
-       unsigned long aper_size;
-       unsigned long iommu_start;
-       unsigned long scratch;
-       long i;
-
-       if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
-               printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
-               return;
-       }
-
-#ifndef CONFIG_AGP_AMD64
-       no_agp = 1; 
-#else
-       /* Makefile puts PCI initialization via subsys_initcall first. */
-       /* Add other K8 AGP bridge drivers here */
-       no_agp = no_agp || 
-               (agp_amd64_init() < 0) || 
-               (agp_copy_info(agp_bridge, &info) < 0);
-#endif 
-
-       if (swiotlb)
-               return;
-
-       /* Did we detect a different HW IOMMU? */
-       if (iommu_detected && !iommu_aperture)
-               return;
-
-       if (no_iommu ||
-           (!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
-           !iommu_aperture ||
-           (no_agp && init_k8_gatt(&info) < 0)) {
-               if (end_pfn > MAX_DMA32_PFN) {
-                       printk(KERN_ERR "WARNING more than 4GB of memory "
-                                       "but GART IOMMU not available.\n"
-                              KERN_ERR "WARNING 32bit PCI may malfunction.\n");
-               }
-               return;
-       }
-
-       printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
-       aper_size = info.aper_size * 1024 * 1024;       
-       iommu_size = check_iommu_size(info.aper_base, aper_size); 
-       iommu_pages = iommu_size >> PAGE_SHIFT; 
-
-       iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, 
-                                                   get_order(iommu_pages/8)); 
-       if (!iommu_gart_bitmap) 
-               panic("Cannot allocate iommu bitmap\n"); 
-       memset(iommu_gart_bitmap, 0, iommu_pages/8);
-
-#ifdef CONFIG_IOMMU_LEAK
-       if (leak_trace) { 
-               iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 
-                                 get_order(iommu_pages*sizeof(void *)));
-               if (iommu_leak_tab) 
-                       memset(iommu_leak_tab, 0, iommu_pages * 8); 
-               else
-                       printk("PCI-DMA: Cannot allocate leak trace area\n"); 
-       } 
-#endif
-
-       /* 
-        * Out of IOMMU space handling.
-        * Reserve some invalid pages at the beginning of the GART. 
-        */ 
-       set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 
-
-       agp_memory_reserved = iommu_size;       
-       printk(KERN_INFO
-              "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
-              iommu_size>>20); 
-
-       iommu_start = aper_size - iommu_size;   
-       iommu_bus_base = info.aper_base + iommu_start; 
-       bad_dma_address = iommu_bus_base;
-       iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
-
-       /* 
-        * Unmap the IOMMU part of the GART. The alias of the page is
-        * always mapped with cache enabled and there is no full cache
-        * coherency across the GART remapping. The unmapping avoids
-        * automatic prefetches from the CPU allocating cache lines in
-        * there. All CPU accesses are done via the direct mapping to
-        * the backing memory. The GART address is only used by PCI
-        * devices. 
-        */
-       clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size);
-
-       /* 
-        * Try to workaround a bug (thanks to BenH) 
-        * Set unmapped entries to a scratch page instead of 0. 
-        * Any prefetches that hit unmapped entries won't get an bus abort
-        * then.
-        */
-       scratch = get_zeroed_page(GFP_KERNEL); 
-       if (!scratch) 
-               panic("Cannot allocate iommu scratch page");
-       gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
-       for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 
-               iommu_gatt_base[i] = gart_unmapped_entry;
-
-       flush_gart();
-       dma_ops = &gart_dma_ops;
-} 
-
-void __init gart_parse_options(char *p)
-{
-       int arg;
-
-#ifdef CONFIG_IOMMU_LEAK
-       if (!strncmp(p,"leak",4)) {
-               leak_trace = 1;
-               p += 4;
-               if (*p == '=') ++p;
-               if (isdigit(*p) && get_option(&p, &arg))
-                       iommu_leak_pages = arg;
-       }
-#endif
-       if (isdigit(*p) && get_option(&p, &arg))
-               iommu_size = arg;
-       if (!strncmp(p, "fullflush",8))
-               iommu_fullflush = 1;
-       if (!strncmp(p, "nofullflush",11))
-               iommu_fullflush = 0;
-       if (!strncmp(p,"noagp",5))
-               no_agp = 1;
-       if (!strncmp(p, "noaperture",10))
-               fix_aperture = 0;
-       /* duplicated from pci-dma.c */
-       if (!strncmp(p,"force",5))
-               iommu_aperture_allowed = 1;
-       if (!strncmp(p,"allowed",7))
-               iommu_aperture_allowed = 1;
-       if (!strncmp(p, "memaper", 7)) {
-               fallback_aper_force = 1;
-               p += 7;
-               if (*p == '=') {
-                       ++p;
-                       if (get_option(&p, &arg))
-                               fallback_aper_order = arg;
-               }
-       }
-}
diff --git a/arch/x86_64/kernel/pci-nommu_64.c b/arch/x86_64/kernel/pci-nommu_64.c
deleted file mode 100644 (file)
index 2a34c6c..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Fallback functions when the main IOMMU code is not compiled in. This
-   code is roughly equivalent to i386. */
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/string.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/iommu.h>
-#include <asm/processor.h>
-#include <asm/dma.h>
-
-static int
-check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
-{
-        if (hwdev && bus + size > *hwdev->dma_mask) {
-               if (*hwdev->dma_mask >= DMA_32BIT_MASK)
-                       printk(KERN_ERR
-                           "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
-                               name, (long long)bus, size,
-                               (long long)*hwdev->dma_mask);
-               return 0;
-       }
-       return 1;
-}
-
-static dma_addr_t
-nommu_map_single(struct device *hwdev, void *ptr, size_t size,
-              int direction)
-{
-       dma_addr_t bus = virt_to_bus(ptr);
-       if (!check_addr("map_single", hwdev, bus, size))
-                               return bad_dma_address;
-       return bus;
-}
-
-static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
-                       int direction)
-{
-}
-
-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA.  This is the scatter-gather version of the
- * above pci_map_single interface.  Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length.  They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- *       DMA address/length pairs than there are SG table elements.
- *       (for example via virtual mapping capabilities)
- *       The routine returns the number of addr/length pairs actually
- *       used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
- */
-static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
-              int nents, int direction)
-{
-       int i;
-
-       for (i = 0; i < nents; i++ ) {
-               struct scatterlist *s = &sg[i];
-               BUG_ON(!s->page);
-               s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
-               if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
-                       return 0;
-               s->dma_length = s->length;
-       }
-       return nents;
-}
-
-/* Unmap a set of streaming mode DMA translations.
- * Again, cpu read rules concerning calls here are the same as for
- * pci_unmap_single() above.
- */
-static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
-                 int nents, int dir)
-{
-}
-
-const struct dma_mapping_ops nommu_dma_ops = {
-       .map_single = nommu_map_single,
-       .unmap_single = nommu_unmap_single,
-       .map_sg = nommu_map_sg,
-       .unmap_sg = nommu_unmap_sg,
-       .is_phys = 1,
-};
-
-void __init no_iommu_init(void)
-{
-       if (dma_ops)
-               return;
-
-       force_iommu = 0; /* no HW IOMMU */
-       dma_ops = &nommu_dma_ops;
-}
diff --git a/arch/x86_64/kernel/pci-swiotlb_64.c b/arch/x86_64/kernel/pci-swiotlb_64.c
deleted file mode 100644 (file)
index b2f405e..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Glue code to lib/swiotlb.c */
-
-#include <linux/pci.h>
-#include <linux/cache.h>
-#include <linux/module.h>
-#include <linux/dma-mapping.h>
-
-#include <asm/iommu.h>
-#include <asm/swiotlb.h>
-#include <asm/dma.h>
-
-int swiotlb __read_mostly;
-EXPORT_SYMBOL(swiotlb);
-
-const struct dma_mapping_ops swiotlb_dma_ops = {
-       .mapping_error = swiotlb_dma_mapping_error,
-       .alloc_coherent = swiotlb_alloc_coherent,
-       .free_coherent = swiotlb_free_coherent,
-       .map_single = swiotlb_map_single,
-       .unmap_single = swiotlb_unmap_single,
-       .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
-       .sync_single_for_device = swiotlb_sync_single_for_device,
-       .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
-       .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
-       .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
-       .sync_sg_for_device = swiotlb_sync_sg_for_device,
-       .map_sg = swiotlb_map_sg,
-       .unmap_sg = swiotlb_unmap_sg,
-       .dma_supported = NULL,
-};
-
-void __init pci_swiotlb_init(void)
-{
-       /* don't initialize swiotlb if iommu=off (no_iommu=1) */
-       if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
-              swiotlb = 1;
-       if (swiotlb_force)
-               swiotlb = 1;
-       if (swiotlb) {
-               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
-               swiotlb_init();
-               dma_ops = &swiotlb_dma_ops;
-       }
-}
diff --git a/arch/x86_64/kernel/pmtimer_64.c b/arch/x86_64/kernel/pmtimer_64.c
deleted file mode 100644 (file)
index ae8f912..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Ported over from i386 by AK, original copyright was:
- *
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- *
- * Dropped all the hardware bug workarounds for now. Hopefully they
- * are not needed on 64bit chipsets.
- */
-
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/cpumask.h>
-#include <asm/io.h>
-#include <asm/proto.h>
-#include <asm/msr.h>
-#include <asm/vsyscall.h>
-
-#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
-
-static inline u32 cyc2us(u32 cycles)
-{
-       /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
-        * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
-        *
-        * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
-        * easily be multiplied with 286 (=0x11E) without having to fear
-        * u32 overflows.
-        */
-       cycles *= 286;
-       return (cycles >> 10);
-}
-
-static unsigned pmtimer_wait_tick(void)
-{
-       u32 a, b;
-       for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
-            a == b;
-            b = inl(pmtmr_ioport) & ACPI_PM_MASK)
-               cpu_relax();
-       return b;
-}
-
-/* note: wait time is rounded up to one tick */
-void pmtimer_wait(unsigned us)
-{
-       u32 a, b;
-       a = pmtimer_wait_tick();
-       do {
-               b = inl(pmtmr_ioport);
-               cpu_relax();
-       } while (cyc2us(b - a) < us);
-}
-
-static int __init nopmtimer_setup(char *s)
-{
-       pmtmr_ioport = 0;
-       return 1;
-}
-
-__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86_64/kernel/process_64.c b/arch/x86_64/kernel/process_64.c
deleted file mode 100644 (file)
index 9895655..0000000
+++ /dev/null
@@ -1,903 +0,0 @@
-/*
- *  linux/arch/x86-64/kernel/process.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *
- *  Pentium III FXSR, SSE support
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- * 
- *  X86-64 port
- *     Andi Kleen.
- *
- *     CPU hotplug support - ashok.raj@intel.com
- */
-
-/*
- * This file handles the architecture-dependent parts of process handling..
- */
-
-#include <stdarg.h>
-
-#include <linux/cpu.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/elfcore.h>
-#include <linux/smp.h>
-#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/module.h>
-#include <linux/a.out.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/ptrace.h>
-#include <linux/utsname.h>
-#include <linux/random.h>
-#include <linux/notifier.h>
-#include <linux/kprobes.h>
-#include <linux/kdebug.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/mmu_context.h>
-#include <asm/pda.h>
-#include <asm/prctl.h>
-#include <asm/desc.h>
-#include <asm/proto.h>
-#include <asm/ia32.h>
-#include <asm/idle.h>
-
-asmlinkage extern void ret_from_fork(void);
-
-unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
-
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
-
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
-       atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
-       atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL(idle_notifier_unregister);
-
-void enter_idle(void)
-{
-       write_pda(isidle, 1);
-       atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
-}
-
-static void __exit_idle(void)
-{
-       if (test_and_clear_bit_pda(0, isidle) == 0)
-               return;
-       atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
-}
-
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
-{
-       /* idle loop has pid 0 */
-       if (current->pid)
-               return;
-       __exit_idle();
-}
-
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-static void default_idle(void)
-{
-       current_thread_info()->status &= ~TS_POLLING;
-       /*
-        * TS_POLLING-cleared state must be visible before we
-        * test NEED_RESCHED:
-        */
-       smp_mb();
-       local_irq_disable();
-       if (!need_resched()) {
-               /* Enables interrupts one instruction before HLT.
-                  x86 special cases this so there is no race. */
-               safe_halt();
-       } else
-               local_irq_enable();
-       current_thread_info()->status |= TS_POLLING;
-}
-
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle (void)
-{
-       local_irq_enable();
-       cpu_relax();
-}
-
-void cpu_idle_wait(void)
-{
-       unsigned int cpu, this_cpu = get_cpu();
-       cpumask_t map, tmp = current->cpus_allowed;
-
-       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
-       put_cpu();
-
-       cpus_clear(map);
-       for_each_online_cpu(cpu) {
-               per_cpu(cpu_idle_state, cpu) = 1;
-               cpu_set(cpu, map);
-       }
-
-       __get_cpu_var(cpu_idle_state) = 0;
-
-       wmb();
-       do {
-               ssleep(1);
-               for_each_online_cpu(cpu) {
-                       if (cpu_isset(cpu, map) &&
-                                       !per_cpu(cpu_idle_state, cpu))
-                               cpu_clear(cpu, map);
-               }
-               cpus_and(map, map, cpu_online_map);
-       } while (!cpus_empty(map));
-
-       set_cpus_allowed(current, tmp);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
-#ifdef CONFIG_HOTPLUG_CPU
-DECLARE_PER_CPU(int, cpu_state);
-
-#include <asm/nmi.h>
-/* We halt the CPU with physical CPU hotplug */
-static inline void play_dead(void)
-{
-       idle_task_exit();
-       wbinvd();
-       mb();
-       /* Ack it */
-       __get_cpu_var(cpu_state) = CPU_DEAD;
-
-       local_irq_disable();
-       while (1)
-               halt();
-}
-#else
-static inline void play_dead(void)
-{
-       BUG();
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-/*
- * The idle thread. There's no useful work to be
- * done, so just try to conserve power and have a
- * low exit latency (ie sit in a loop waiting for
- * somebody to say that they'd like to reschedule)
- */
-void cpu_idle (void)
-{
-       current_thread_info()->status |= TS_POLLING;
-       /* endless idle loop with no priority at all */
-       while (1) {
-               while (!need_resched()) {
-                       void (*idle)(void);
-
-                       if (__get_cpu_var(cpu_idle_state))
-                               __get_cpu_var(cpu_idle_state) = 0;
-
-                       rmb();
-                       idle = pm_idle;
-                       if (!idle)
-                               idle = default_idle;
-                       if (cpu_is_offline(smp_processor_id()))
-                               play_dead();
-                       /*
-                        * Idle routines should keep interrupts disabled
-                        * from here on, until they go to idle.
-                        * Otherwise, idle callbacks can misfire.
-                        */
-                       local_irq_disable();
-                       enter_idle();
-                       idle();
-                       /* In many cases the interrupt that ended idle
-                          has already called exit_idle. But some idle
-                          loops can be woken up without interrupt. */
-                       __exit_idle();
-               }
-
-               preempt_enable_no_resched();
-               schedule();
-               preempt_disable();
-       }
-}
-
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
-{
-       if (!need_resched()) {
-               __monitor((void *)&current_thread_info()->flags, 0, 0);
-               smp_mb();
-               if (!need_resched())
-                       __mwait(eax, ecx);
-       }
-}
-
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
-{
-       if (!need_resched()) {
-               __monitor((void *)&current_thread_info()->flags, 0, 0);
-               smp_mb();
-               if (!need_resched())
-                       __sti_mwait(0, 0);
-               else
-                       local_irq_enable();
-       } else {
-               local_irq_enable();
-       }
-}
-
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
-       static int printed;
-       if (cpu_has(c, X86_FEATURE_MWAIT)) {
-               /*
-                * Skip, if setup has overridden idle.
-                * One CPU supports mwait => All CPUs supports mwait
-                */
-               if (!pm_idle) {
-                       if (!printed) {
-                               printk(KERN_INFO "using mwait in idle threads.\n");
-                               printed = 1;
-                       }
-                       pm_idle = mwait_idle;
-               }
-       }
-}
-
-static int __init idle_setup (char *str)
-{
-       if (!strcmp(str, "poll")) {
-               printk("using polling idle threads.\n");
-               pm_idle = poll_idle;
-       } else if (!strcmp(str, "mwait"))
-               force_mwait = 1;
-       else
-               return -1;
-
-       boot_option_idle_override = 1;
-       return 0;
-}
-early_param("idle", idle_setup);
-
-/* Prints also some state that isn't saved in the pt_regs */ 
-void __show_regs(struct pt_regs * regs)
-{
-       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
-       unsigned long d0, d1, d2, d3, d6, d7;
-       unsigned int fsindex,gsindex;
-       unsigned int ds,cs,es; 
-
-       printk("\n");
-       print_modules();
-       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-               current->pid, current->comm, print_tainted(),
-               init_utsname()->release,
-               (int)strcspn(init_utsname()->version, " "),
-               init_utsname()->version);
-       printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
-       printk_address(regs->rip); 
-       printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
-               regs->eflags);
-       printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
-              regs->rax, regs->rbx, regs->rcx);
-       printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
-              regs->rdx, regs->rsi, regs->rdi); 
-       printk("RBP: %016lx R08: %016lx R09: %016lx\n",
-              regs->rbp, regs->r8, regs->r9); 
-       printk("R10: %016lx R11: %016lx R12: %016lx\n",
-              regs->r10, regs->r11, regs->r12); 
-       printk("R13: %016lx R14: %016lx R15: %016lx\n",
-              regs->r13, regs->r14, regs->r15); 
-
-       asm("movl %%ds,%0" : "=r" (ds)); 
-       asm("movl %%cs,%0" : "=r" (cs)); 
-       asm("movl %%es,%0" : "=r" (es)); 
-       asm("movl %%fs,%0" : "=r" (fsindex));
-       asm("movl %%gs,%0" : "=r" (gsindex));
-
-       rdmsrl(MSR_FS_BASE, fs);
-       rdmsrl(MSR_GS_BASE, gs); 
-       rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 
-
-       cr0 = read_cr0();
-       cr2 = read_cr2();
-       cr3 = read_cr3();
-       cr4 = read_cr4();
-
-       printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
-              fs,fsindex,gs,gsindex,shadowgs); 
-       printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 
-       printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
-
-       get_debugreg(d0, 0);
-       get_debugreg(d1, 1);
-       get_debugreg(d2, 2);
-       printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
-       get_debugreg(d3, 3);
-       get_debugreg(d6, 6);
-       get_debugreg(d7, 7);
-       printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
-}
-
-void show_regs(struct pt_regs *regs)
-{
-       printk("CPU %d:", smp_processor_id());
-       __show_regs(regs);
-       show_trace(NULL, regs, (void *)(regs + 1));
-}
-
-/*
- * Free current thread data structures etc..
- */
-void exit_thread(void)
-{
-       struct task_struct *me = current;
-       struct thread_struct *t = &me->thread;
-
-       if (me->thread.io_bitmap_ptr) { 
-               struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
-
-               kfree(t->io_bitmap_ptr);
-               t->io_bitmap_ptr = NULL;
-               clear_thread_flag(TIF_IO_BITMAP);
-               /*
-                * Careful, clear this in the TSS too:
-                */
-               memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
-               t->io_bitmap_max = 0;
-               put_cpu();
-       }
-}
-
-void flush_thread(void)
-{
-       struct task_struct *tsk = current;
-
-       if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
-               clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
-               if (test_tsk_thread_flag(tsk, TIF_IA32)) {
-                       clear_tsk_thread_flag(tsk, TIF_IA32);
-               } else {
-                       set_tsk_thread_flag(tsk, TIF_IA32);
-                       current_thread_info()->status |= TS_COMPAT;
-               }
-       }
-       clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
-       tsk->thread.debugreg0 = 0;
-       tsk->thread.debugreg1 = 0;
-       tsk->thread.debugreg2 = 0;
-       tsk->thread.debugreg3 = 0;
-       tsk->thread.debugreg6 = 0;
-       tsk->thread.debugreg7 = 0;
-       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));        
-       /*
-        * Forget coprocessor state..
-        */
-       clear_fpu(tsk);
-       clear_used_math();
-}
-
-void release_thread(struct task_struct *dead_task)
-{
-       if (dead_task->mm) {
-               if (dead_task->mm->context.size) {
-                       printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
-                                       dead_task->comm,
-                                       dead_task->mm->context.ldt,
-                                       dead_task->mm->context.size);
-                       BUG();
-               }
-       }
-}
-
-static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
-{
-       struct user_desc ud = { 
-               .base_addr = addr,
-               .limit = 0xfffff,
-               .seg_32bit = 1,
-               .limit_in_pages = 1,
-               .useable = 1,
-       };
-       struct n_desc_struct *desc = (void *)t->thread.tls_array;
-       desc += tls;
-       desc->a = LDT_entry_a(&ud); 
-       desc->b = LDT_entry_b(&ud); 
-}
-
-static inline u32 read_32bit_tls(struct task_struct *t, int tls)
-{
-       struct desc_struct *desc = (void *)t->thread.tls_array;
-       desc += tls;
-       return desc->base0 | 
-               (((u32)desc->base1) << 16) | 
-               (((u32)desc->base2) << 24);
-}
-
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
-       unlazy_fpu(tsk);
-}
-
-int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 
-               unsigned long unused,
-       struct task_struct * p, struct pt_regs * regs)
-{
-       int err;
-       struct pt_regs * childregs;
-       struct task_struct *me = current;
-
-       childregs = ((struct pt_regs *)
-                       (THREAD_SIZE + task_stack_page(p))) - 1;
-       *childregs = *regs;
-
-       childregs->rax = 0;
-       childregs->rsp = rsp;
-       if (rsp == ~0UL)
-               childregs->rsp = (unsigned long)childregs;
-
-       p->thread.rsp = (unsigned long) childregs;
-       p->thread.rsp0 = (unsigned long) (childregs+1);
-       p->thread.userrsp = me->thread.userrsp; 
-
-       set_tsk_thread_flag(p, TIF_FORK);
-
-       p->thread.fs = me->thread.fs;
-       p->thread.gs = me->thread.gs;
-
-       asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
-       asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
-       asm("mov %%es,%0" : "=m" (p->thread.es));
-       asm("mov %%ds,%0" : "=m" (p->thread.ds));
-
-       if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
-               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
-               if (!p->thread.io_bitmap_ptr) {
-                       p->thread.io_bitmap_max = 0;
-                       return -ENOMEM;
-               }
-               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
-                               IO_BITMAP_BYTES);
-               set_tsk_thread_flag(p, TIF_IO_BITMAP);
-       } 
-
-       /*
-        * Set a new TLS for the child thread?
-        */
-       if (clone_flags & CLONE_SETTLS) {
-#ifdef CONFIG_IA32_EMULATION
-               if (test_thread_flag(TIF_IA32))
-                       err = ia32_child_tls(p, childregs); 
-               else                    
-#endif  
-                       err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
-               if (err) 
-                       goto out;
-       }
-       err = 0;
-out:
-       if (err && p->thread.io_bitmap_ptr) {
-               kfree(p->thread.io_bitmap_ptr);
-               p->thread.io_bitmap_max = 0;
-       }
-       return err;
-}
-
-/*
- * This special macro can be used to load a debugging register
- */
-#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
-
-static inline void __switch_to_xtra(struct task_struct *prev_p,
-                                   struct task_struct *next_p,
-                                   struct tss_struct *tss)
-{
-       struct thread_struct *prev, *next;
-
-       prev = &prev_p->thread,
-       next = &next_p->thread;
-
-       if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-               loaddebug(next, 0);
-               loaddebug(next, 1);
-               loaddebug(next, 2);
-               loaddebug(next, 3);
-               /* no 4 and 5 */
-               loaddebug(next, 6);
-               loaddebug(next, 7);
-       }
-
-       if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
-               /*
-                * Copy the relevant range of the IO bitmap.
-                * Normally this is 128 bytes or less:
-                */
-               memcpy(tss->io_bitmap, next->io_bitmap_ptr,
-                      max(prev->io_bitmap_max, next->io_bitmap_max));
-       } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
-               /*
-                * Clear any possible leftover bits:
-                */
-               memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
-       }
-}
-
-/*
- *     switch_to(x,y) should switch tasks from x to y.
- *
- * This could still be optimized: 
- * - fold all the options into a flag word and test it with a single test.
- * - could test fs/gs bitsliced
- *
- * Kprobes not supported here. Set the probe on schedule instead.
- */
-__kprobes struct task_struct *
-__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
-{
-       struct thread_struct *prev = &prev_p->thread,
-                                *next = &next_p->thread;
-       int cpu = smp_processor_id();  
-       struct tss_struct *tss = &per_cpu(init_tss, cpu);
-
-       /* we're going to use this soon, after a few expensive things */
-       if (next_p->fpu_counter>5)
-               prefetch(&next->i387.fxsave);
-
-       /*
-        * Reload esp0, LDT and the page table pointer:
-        */
-       tss->rsp0 = next->rsp0;
-
-       /* 
-        * Switch DS and ES.
-        * This won't pick up thread selector changes, but I guess that is ok.
-        */
-       asm volatile("mov %%es,%0" : "=m" (prev->es));
-       if (unlikely(next->es | prev->es))
-               loadsegment(es, next->es); 
-       
-       asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
-       if (unlikely(next->ds | prev->ds))
-               loadsegment(ds, next->ds);
-
-       load_TLS(next, cpu);
-
-       /* 
-        * Switch FS and GS.
-        */
-       { 
-               unsigned fsindex;
-               asm volatile("movl %%fs,%0" : "=r" (fsindex)); 
-               /* segment register != 0 always requires a reload. 
-                  also reload when it has changed. 
-                  when prev process used 64bit base always reload
-                  to avoid an information leak. */
-               if (unlikely(fsindex | next->fsindex | prev->fs)) {
-                       loadsegment(fs, next->fsindex);
-                       /* check if the user used a selector != 0
-                        * if yes clear 64bit base, since overloaded base
-                         * is always mapped to the Null selector
-                         */
-                       if (fsindex)
-                       prev->fs = 0;                           
-               }
-               /* when next process has a 64bit base use it */
-               if (next->fs) 
-                       wrmsrl(MSR_FS_BASE, next->fs); 
-               prev->fsindex = fsindex;
-       }
-       { 
-               unsigned gsindex;
-               asm volatile("movl %%gs,%0" : "=r" (gsindex)); 
-               if (unlikely(gsindex | next->gsindex | prev->gs)) {
-                       load_gs_index(next->gsindex);
-                       if (gsindex)
-                       prev->gs = 0;                           
-               }
-               if (next->gs)
-                       wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
-               prev->gsindex = gsindex;
-       }
-
-       /* Must be after DS reload */
-       unlazy_fpu(prev_p);
-
-       /* 
-        * Switch the PDA and FPU contexts.
-        */
-       prev->userrsp = read_pda(oldrsp); 
-       write_pda(oldrsp, next->userrsp); 
-       write_pda(pcurrent, next_p); 
-
-       write_pda(kernelstack,
-       (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
-#ifdef CONFIG_CC_STACKPROTECTOR
-       write_pda(stack_canary, next_p->stack_canary);
-       /*
-        * Build time only check to make sure the stack_canary is at
-        * offset 40 in the pda; this is a gcc ABI requirement
-        */
-       BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
-#endif
-
-       /*
-        * Now maybe reload the debug registers and handle I/O bitmaps
-        */
-       if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
-           || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
-               __switch_to_xtra(prev_p, next_p, tss);
-
-       /* If the task has used fpu the last 5 timeslices, just do a full
-        * restore of the math state immediately to avoid the trap; the
-        * chances of needing FPU soon are obviously high now
-        */
-       if (next_p->fpu_counter>5)
-               math_state_restore();
-       return prev_p;
-}
-
-/*
- * sys_execve() executes a new program.
- */
-asmlinkage 
-long sys_execve(char __user *name, char __user * __user *argv,
-               char __user * __user *envp, struct pt_regs regs)
-{
-       long error;
-       char * filename;
-
-       filename = getname(name);
-       error = PTR_ERR(filename);
-       if (IS_ERR(filename)) 
-               return error;
-       error = do_execve(filename, argv, envp, &regs); 
-       if (error == 0) {
-               task_lock(current);
-               current->ptrace &= ~PT_DTRACE;
-               task_unlock(current);
-       }
-       putname(filename);
-       return error;
-}
-
-void set_personality_64bit(void)
-{
-       /* inherit personality from parent */
-
-       /* Make sure to be in 64bit mode */
-       clear_thread_flag(TIF_IA32); 
-
-       /* TBD: overwrites user setup. Should have two bits.
-          But 64bit processes have always behaved this way,
-          so it's not too bad. The main problem is just that
-          32bit childs are affected again. */
-       current->personality &= ~READ_IMPLIES_EXEC;
-}
-
-asmlinkage long sys_fork(struct pt_regs *regs)
-{
-       return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
-}
-
-asmlinkage long
-sys_clone(unsigned long clone_flags, unsigned long newsp,
-         void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
-{
-       if (!newsp)
-               newsp = regs->rsp;
-       return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
-}
-
-/*
- * This is trivial, and on the face of it looks like it
- * could equally well be done in user mode.
- *
- * Not so, for quite unobvious reasons - register pressure.
- * In user mode vfork() cannot have a stack frame, and if
- * done by calling the "clone()" system call directly, you
- * do not have enough call-clobbered registers to hold all
- * the information you need.
- */
-asmlinkage long sys_vfork(struct pt_regs *regs)
-{
-       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
-                   NULL, NULL);
-}
-
-unsigned long get_wchan(struct task_struct *p)
-{
-       unsigned long stack;
-       u64 fp,rip;
-       int count = 0;
-
-       if (!p || p == current || p->state==TASK_RUNNING)
-               return 0; 
-       stack = (unsigned long)task_stack_page(p);
-       if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
-               return 0;
-       fp = *(u64 *)(p->thread.rsp);
-       do { 
-               if (fp < (unsigned long)stack ||
-                   fp > (unsigned long)stack+THREAD_SIZE)
-                       return 0; 
-               rip = *(u64 *)(fp+8); 
-               if (!in_sched_functions(rip))
-                       return rip; 
-               fp = *(u64 *)fp; 
-       } while (count++ < 16); 
-       return 0;
-}
-
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
-{ 
-       int ret = 0; 
-       int doit = task == current;
-       int cpu;
-
-       switch (code) { 
-       case ARCH_SET_GS:
-               if (addr >= TASK_SIZE_OF(task))
-                       return -EPERM; 
-               cpu = get_cpu();
-               /* handle small bases via the GDT because that's faster to 
-                  switch. */
-               if (addr <= 0xffffffff) {  
-                       set_32bit_tls(task, GS_TLS, addr); 
-                       if (doit) { 
-                               load_TLS(&task->thread, cpu);
-                               load_gs_index(GS_TLS_SEL); 
-                       }
-                       task->thread.gsindex = GS_TLS_SEL; 
-                       task->thread.gs = 0;
-               } else { 
-                       task->thread.gsindex = 0;
-                       task->thread.gs = addr;
-                       if (doit) {
-                               load_gs_index(0);
-                               ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
-                       } 
-               }
-               put_cpu();
-               break;
-       case ARCH_SET_FS:
-               /* Not strictly needed for fs, but do it for symmetry
-                  with gs */
-               if (addr >= TASK_SIZE_OF(task))
-                       return -EPERM; 
-               cpu = get_cpu();
-               /* handle small bases via the GDT because that's faster to 
-                  switch. */
-               if (addr <= 0xffffffff) { 
-                       set_32bit_tls(task, FS_TLS, addr);
-                       if (doit) { 
-                               load_TLS(&task->thread, cpu); 
-                               asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
-                       }
-                       task->thread.fsindex = FS_TLS_SEL;
-                       task->thread.fs = 0;
-               } else { 
-                       task->thread.fsindex = 0;
-                       task->thread.fs = addr;
-                       if (doit) {
-                               /* set the selector to 0 to not confuse
-                                  __switch_to */
-                               asm volatile("movl %0,%%fs" :: "r" (0));
-                               ret = checking_wrmsrl(MSR_FS_BASE, addr);
-                       }
-               }
-               put_cpu();
-               break;
-       case ARCH_GET_FS: { 
-               unsigned long base; 
-               if (task->thread.fsindex == FS_TLS_SEL)
-                       base = read_32bit_tls(task, FS_TLS);
-               else if (doit)
-                       rdmsrl(MSR_FS_BASE, base);
-               else
-                       base = task->thread.fs;
-               ret = put_user(base, (unsigned long __user *)addr); 
-               break; 
-       }
-       case ARCH_GET_GS: { 
-               unsigned long base;
-               unsigned gsindex;
-               if (task->thread.gsindex == GS_TLS_SEL)
-                       base = read_32bit_tls(task, GS_TLS);
-               else if (doit) {
-                       asm("movl %%gs,%0" : "=r" (gsindex));
-                       if (gsindex)
-                               rdmsrl(MSR_KERNEL_GS_BASE, base);
-                       else
-                               base = task->thread.gs;
-               }
-               else
-                       base = task->thread.gs;
-               ret = put_user(base, (unsigned long __user *)addr); 
-               break;
-       }
-
-       default:
-               ret = -EINVAL;
-               break;
-       } 
-
-       return ret;     
-} 
-
-long sys_arch_prctl(int code, unsigned long addr)
-{
-       return do_arch_prctl(current, code, addr);
-} 
-
-/* 
- * Capture the user space registers if the task is not running (in user space)
- */
-int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
-{
-       struct pt_regs *pp, ptregs;
-
-       pp = task_pt_regs(tsk);
-
-       ptregs = *pp; 
-       ptregs.cs &= 0xffff;
-       ptregs.ss &= 0xffff;
-
-       elf_core_copy_regs(regs, &ptregs);
-       return 1;
-}
-
-unsigned long arch_align_stack(unsigned long sp)
-{
-       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-               sp -= get_random_int() % 8192;
-       return sp & ~0xf;
-}
diff --git a/arch/x86_64/kernel/ptrace_64.c b/arch/x86_64/kernel/ptrace_64.c
deleted file mode 100644 (file)
index eea3702..0000000
+++ /dev/null
@@ -1,627 +0,0 @@
-/* ptrace.c */
-/* By Ross Biro 1/23/92 */
-/*
- * Pentium III FXSR, SSE support
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- * 
- * x86-64 port 2000-2002 Andi Kleen
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/security.h>
-#include <linux/audit.h>
-#include <linux/seccomp.h>
-#include <linux/signal.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/system.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/debugreg.h>
-#include <asm/ldt.h>
-#include <asm/desc.h>
-#include <asm/proto.h>
-#include <asm/ia32.h>
-
-/*
- * does not yet catch signals sent when the child dies.
- * in exit.c or in signal.c.
- */
-
-/*
- * Determines which flags the user has access to [1 = access, 0 = no access].
- * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
- * Also masks reserved bits (63-22, 15, 5, 3, 1).
- */
-#define FLAG_MASK 0x54dd5UL
-
-/* set's the trap flag. */
-#define TRAP_FLAG 0x100UL
-
-/*
- * eflags and offset of eflags on child stack..
- */
-#define EFLAGS offsetof(struct pt_regs, eflags)
-#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
-
-/*
- * this routine will get a word off of the processes privileged stack. 
- * the offset is how far from the base addr as stored in the TSS.  
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */   
-static inline unsigned long get_stack_long(struct task_struct *task, int offset)
-{
-       unsigned char *stack;
-
-       stack = (unsigned char *)task->thread.rsp0;
-       stack += offset;
-       return (*((unsigned long *)stack));
-}
-
-/*
- * this routine will put a word on the processes privileged stack. 
- * the offset is how far from the base addr as stored in the TSS.  
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */
-static inline long put_stack_long(struct task_struct *task, int offset,
-       unsigned long data)
-{
-       unsigned char * stack;
-
-       stack = (unsigned char *) task->thread.rsp0;
-       stack += offset;
-       *(unsigned long *) stack = data;
-       return 0;
-}
-
-#define LDT_SEGMENT 4
-
-unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
-{
-       unsigned long addr, seg;
-
-       addr = regs->rip;
-       seg = regs->cs & 0xffff;
-
-       /*
-        * We'll assume that the code segments in the GDT
-        * are all zero-based. That is largely true: the
-        * TLS segments are used for data, and the PNPBIOS
-        * and APM bios ones we just ignore here.
-        */
-       if (seg & LDT_SEGMENT) {
-               u32 *desc;
-               unsigned long base;
-
-               seg &= ~7UL;
-
-               down(&child->mm->context.sem);
-               if (unlikely((seg >> 3) >= child->mm->context.size))
-                       addr = -1L; /* bogus selector, access would fault */
-               else {
-                       desc = child->mm->context.ldt + seg;
-                       base = ((desc[0] >> 16) |
-                               ((desc[1] & 0xff) << 16) |
-                               (desc[1] & 0xff000000));
-
-                       /* 16-bit code segment? */
-                       if (!((desc[1] >> 22) & 1))
-                               addr &= 0xffff;
-                       addr += base;
-               }
-               up(&child->mm->context.sem);
-       }
-
-       return addr;
-}
-
-static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
-{
-       int i, copied;
-       unsigned char opcode[15];
-       unsigned long addr = convert_rip_to_linear(child, regs);
-
-       copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
-       for (i = 0; i < copied; i++) {
-               switch (opcode[i]) {
-               /* popf and iret */
-               case 0x9d: case 0xcf:
-                       return 1;
-
-                       /* CHECKME: 64 65 */
-
-               /* opcode and address size prefixes */
-               case 0x66: case 0x67:
-                       continue;
-               /* irrelevant prefixes (segment overrides and repeats) */
-               case 0x26: case 0x2e:
-               case 0x36: case 0x3e:
-               case 0x64: case 0x65:
-               case 0xf2: case 0xf3:
-                       continue;
-
-               case 0x40 ... 0x4f:
-                       if (regs->cs != __USER_CS)
-                               /* 32-bit mode: register increment */
-                               return 0;
-                       /* 64-bit mode: REX prefix */
-                       continue;
-
-                       /* CHECKME: f2, f3 */
-
-               /*
-                * pushf: NOTE! We should probably not let
-                * the user see the TF bit being set. But
-                * it's more pain than it's worth to avoid
-                * it, and a debugger could emulate this
-                * all in user space if it _really_ cares.
-                */
-               case 0x9c:
-               default:
-                       return 0;
-               }
-       }
-       return 0;
-}
-
-static void set_singlestep(struct task_struct *child)
-{
-       struct pt_regs *regs = task_pt_regs(child);
-
-       /*
-        * Always set TIF_SINGLESTEP - this guarantees that
-        * we single-step system calls etc..  This will also
-        * cause us to set TF when returning to user mode.
-        */
-       set_tsk_thread_flag(child, TIF_SINGLESTEP);
-
-       /*
-        * If TF was already set, don't do anything else
-        */
-       if (regs->eflags & TRAP_FLAG)
-               return;
-
-       /* Set TF on the kernel stack.. */
-       regs->eflags |= TRAP_FLAG;
-
-       /*
-        * ..but if TF is changed by the instruction we will trace,
-        * don't mark it as being "us" that set it, so that we
-        * won't clear it by hand later.
-        */
-       if (is_setting_trap_flag(child, regs))
-               return;
-
-       child->ptrace |= PT_DTRACE;
-}
-
-static void clear_singlestep(struct task_struct *child)
-{
-       /* Always clear TIF_SINGLESTEP... */
-       clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-
-       /* But touch TF only if it was set by us.. */
-       if (child->ptrace & PT_DTRACE) {
-               struct pt_regs *regs = task_pt_regs(child);
-               regs->eflags &= ~TRAP_FLAG;
-               child->ptrace &= ~PT_DTRACE;
-       }
-}
-
-/*
- * Called by kernel/ptrace.c when detaching..
- *
- * Make sure the single step bit is not set.
- */
-void ptrace_disable(struct task_struct *child)
-{ 
-       clear_singlestep(child);
-}
-
-static int putreg(struct task_struct *child,
-       unsigned long regno, unsigned long value)
-{
-       unsigned long tmp; 
-       
-       switch (regno) {
-               case offsetof(struct user_regs_struct,fs):
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.fsindex = value & 0xffff; 
-                       return 0;
-               case offsetof(struct user_regs_struct,gs):
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.gsindex = value & 0xffff;
-                       return 0;
-               case offsetof(struct user_regs_struct,ds):
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.ds = value & 0xffff;
-                       return 0;
-               case offsetof(struct user_regs_struct,es): 
-                       if (value && (value & 3) != 3)
-                               return -EIO;
-                       child->thread.es = value & 0xffff;
-                       return 0;
-               case offsetof(struct user_regs_struct,ss):
-                       if ((value & 3) != 3)
-                               return -EIO;
-                       value &= 0xffff;
-                       return 0;
-               case offsetof(struct user_regs_struct,fs_base):
-                       if (value >= TASK_SIZE_OF(child))
-                               return -EIO;
-                       child->thread.fs = value;
-                       return 0;
-               case offsetof(struct user_regs_struct,gs_base):
-                       if (value >= TASK_SIZE_OF(child))
-                               return -EIO;
-                       child->thread.gs = value;
-                       return 0;
-               case offsetof(struct user_regs_struct, eflags):
-                       value &= FLAG_MASK;
-                       tmp = get_stack_long(child, EFL_OFFSET); 
-                       tmp &= ~FLAG_MASK; 
-                       value |= tmp;
-                       break;
-               case offsetof(struct user_regs_struct,cs): 
-                       if ((value & 3) != 3)
-                               return -EIO;
-                       value &= 0xffff;
-                       break;
-       }
-       put_stack_long(child, regno - sizeof(struct pt_regs), value);
-       return 0;
-}
-
-static unsigned long getreg(struct task_struct *child, unsigned long regno)
-{
-       unsigned long val;
-       switch (regno) {
-               case offsetof(struct user_regs_struct, fs):
-                       return child->thread.fsindex;
-               case offsetof(struct user_regs_struct, gs):
-                       return child->thread.gsindex;
-               case offsetof(struct user_regs_struct, ds):
-                       return child->thread.ds;
-               case offsetof(struct user_regs_struct, es):
-                       return child->thread.es; 
-               case offsetof(struct user_regs_struct, fs_base):
-                       return child->thread.fs;
-               case offsetof(struct user_regs_struct, gs_base):
-                       return child->thread.gs;
-               default:
-                       regno = regno - sizeof(struct pt_regs);
-                       val = get_stack_long(child, regno);
-                       if (test_tsk_thread_flag(child, TIF_IA32))
-                               val &= 0xffffffff;
-                       return val;
-       }
-
-}
-
-long arch_ptrace(struct task_struct *child, long request, long addr, long data)
-{
-       long i, ret;
-       unsigned ui;
-
-       switch (request) {
-       /* when I and D space are separate, these will need to be fixed. */
-       case PTRACE_PEEKTEXT: /* read word at location addr. */ 
-       case PTRACE_PEEKDATA:
-               ret = generic_ptrace_peekdata(child, addr, data);
-               break;
-
-       /* read the word at location addr in the USER area. */
-       case PTRACE_PEEKUSR: {
-               unsigned long tmp;
-
-               ret = -EIO;
-               if ((addr & 7) ||
-                   addr > sizeof(struct user) - 7)
-                       break;
-
-               switch (addr) { 
-               case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
-                       tmp = getreg(child, addr);
-                       break;
-               case offsetof(struct user, u_debugreg[0]):
-                       tmp = child->thread.debugreg0;
-                       break;
-               case offsetof(struct user, u_debugreg[1]):
-                       tmp = child->thread.debugreg1;
-                       break;
-               case offsetof(struct user, u_debugreg[2]):
-                       tmp = child->thread.debugreg2;
-                       break;
-               case offsetof(struct user, u_debugreg[3]):
-                       tmp = child->thread.debugreg3;
-                       break;
-               case offsetof(struct user, u_debugreg[6]):
-                       tmp = child->thread.debugreg6;
-                       break;
-               case offsetof(struct user, u_debugreg[7]):
-                       tmp = child->thread.debugreg7;
-                       break;
-               default:
-                       tmp = 0;
-                       break;
-               }
-               ret = put_user(tmp,(unsigned long __user *) data);
-               break;
-       }
-
-       /* when I and D space are separate, this will have to be fixed. */
-       case PTRACE_POKETEXT: /* write the word at location addr. */
-       case PTRACE_POKEDATA:
-               ret = generic_ptrace_pokedata(child, addr, data);
-               break;
-
-       case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
-       {
-               int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
-               ret = -EIO;
-               if ((addr & 7) ||
-                   addr > sizeof(struct user) - 7)
-                       break;
-
-               switch (addr) { 
-               case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
-                       ret = putreg(child, addr, data);
-                       break;
-               /* Disallows to set a breakpoint into the vsyscall */
-               case offsetof(struct user, u_debugreg[0]):
-                       if (data >= TASK_SIZE_OF(child) - dsize) break;
-                       child->thread.debugreg0 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[1]):
-                       if (data >= TASK_SIZE_OF(child) - dsize) break;
-                       child->thread.debugreg1 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[2]):
-                       if (data >= TASK_SIZE_OF(child) - dsize) break;
-                       child->thread.debugreg2 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[3]):
-                       if (data >= TASK_SIZE_OF(child) - dsize) break;
-                       child->thread.debugreg3 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[6]):
-                                 if (data >> 32)
-                               break; 
-                       child->thread.debugreg6 = data;
-                       ret = 0;
-                       break;
-               case offsetof(struct user, u_debugreg[7]):
-                       /* See arch/i386/kernel/ptrace.c for an explanation of
-                        * this awkward check.*/
-                       data &= ~DR_CONTROL_RESERVED;
-                       for(i=0; i<4; i++)
-                               if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-                                       break;
-                       if (i == 4) {
-                         child->thread.debugreg7 = data;
-                         if (data)
-                               set_tsk_thread_flag(child, TIF_DEBUG);
-                         else
-                               clear_tsk_thread_flag(child, TIF_DEBUG);
-                         ret = 0;
-                       }
-                 break;
-               }
-               break;
-       }
-       case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
-       case PTRACE_CONT:    /* restart after signal. */
-
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               if (request == PTRACE_SYSCALL)
-                       set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
-               else
-                       clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
-               clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-               child->exit_code = data;
-               /* make sure the single step bit is not set. */
-               clear_singlestep(child);
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-#ifdef CONFIG_IA32_EMULATION
-               /* This makes only sense with 32bit programs. Allow a
-                  64bit debugger to fully examine them too. Better
-                  don't use it against 64bit processes, use
-                  PTRACE_ARCH_PRCTL instead. */
-       case PTRACE_SET_THREAD_AREA: {
-               struct user_desc __user *p;
-               int old; 
-               p = (struct user_desc __user *)data;
-               get_user(old,  &p->entry_number); 
-               put_user(addr, &p->entry_number);
-               ret = do_set_thread_area(&child->thread, p);
-               put_user(old,  &p->entry_number); 
-               break;
-       case PTRACE_GET_THREAD_AREA:
-               p = (struct user_desc __user *)data;
-               get_user(old,  &p->entry_number); 
-               put_user(addr, &p->entry_number);
-               ret = do_get_thread_area(&child->thread, p);
-               put_user(old,  &p->entry_number); 
-               break;
-       } 
-#endif
-               /* normal 64bit interface to access TLS data. 
-                  Works just like arch_prctl, except that the arguments
-                  are reversed. */
-       case PTRACE_ARCH_PRCTL: 
-               ret = do_arch_prctl(child, data, addr);
-               break;
-
-/*
- * make the child exit.  Best I can do is send it a sigkill. 
- * perhaps it should be put in the status that it wants to 
- * exit.
- */
-       case PTRACE_KILL:
-               ret = 0;
-               if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
-                       break;
-               clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-               child->exit_code = SIGKILL;
-               /* make sure the single step bit is not set. */
-               clear_singlestep(child);
-               wake_up_process(child);
-               break;
-
-       case PTRACE_SINGLESTEP:    /* set the trap flag. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
-               set_singlestep(child);
-               child->exit_code = data;
-               /* give it a chance to run. */
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-       case PTRACE_DETACH:
-               /* detach a process that was attached. */
-               ret = ptrace_detach(child, data);
-               break;
-
-       case PTRACE_GETREGS: { /* Get all gp regs from the child. */
-               if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
-                              sizeof(struct user_regs_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = 0;
-               for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
-                       ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
-                       data += sizeof(long);
-               }
-               break;
-       }
-
-       case PTRACE_SETREGS: { /* Set all gp regs in the child. */
-               unsigned long tmp;
-               if (!access_ok(VERIFY_READ, (unsigned __user *)data,
-                              sizeof(struct user_regs_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = 0;
-               for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
-                       ret = __get_user(tmp, (unsigned long __user *) data);
-                       if (ret)
-                               break;
-                       ret = putreg(child, ui, tmp);
-                       if (ret)
-                               break;
-                       data += sizeof(long);
-               }
-               break;
-       }
-
-       case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
-               if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
-                              sizeof(struct user_i387_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               ret = get_fpregs((struct user_i387_struct __user *)data, child);
-               break;
-       }
-
-       case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
-               if (!access_ok(VERIFY_READ, (unsigned __user *)data,
-                              sizeof(struct user_i387_struct))) {
-                       ret = -EIO;
-                       break;
-               }
-               set_stopped_child_used_math(child);
-               ret = set_fpregs(child, (struct user_i387_struct __user *)data);
-               break;
-       }
-
-       default:
-               ret = ptrace_request(child, request, addr, data);
-               break;
-       }
-       return ret;
-}
-
-static void syscall_trace(struct pt_regs *regs)
-{
-
-#if 0
-       printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
-              current->comm,
-              regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
-              current_thread_info()->flags, current->ptrace); 
-#endif
-
-       ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
-                               ? 0x80 : 0));
-       /*
-        * this isn't the same as continuing with a signal, but it will do
-        * for normal use.  strace only continues with a signal if the
-        * stopping signal is not SIGTRAP.  -brl
-        */
-       if (current->exit_code) {
-               send_sig(current->exit_code, current, 1);
-               current->exit_code = 0;
-       }
-}
-
-asmlinkage void syscall_trace_enter(struct pt_regs *regs)
-{
-       /* do the secure computing check first */
-       secure_computing(regs->orig_rax);
-
-       if (test_thread_flag(TIF_SYSCALL_TRACE)
-           && (current->ptrace & PT_PTRACED))
-               syscall_trace(regs);
-
-       if (unlikely(current->audit_context)) {
-               if (test_thread_flag(TIF_IA32)) {
-                       audit_syscall_entry(AUDIT_ARCH_I386,
-                                           regs->orig_rax,
-                                           regs->rbx, regs->rcx,
-                                           regs->rdx, regs->rsi);
-               } else {
-                       audit_syscall_entry(AUDIT_ARCH_X86_64,
-                                           regs->orig_rax,
-                                           regs->rdi, regs->rsi,
-                                           regs->rdx, regs->r10);
-               }
-       }
-}
-
-asmlinkage void syscall_trace_leave(struct pt_regs *regs)
-{
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
-
-       if ((test_thread_flag(TIF_SYSCALL_TRACE)
-            || test_thread_flag(TIF_SINGLESTEP))
-           && (current->ptrace & PT_PTRACED))
-               syscall_trace(regs);
-}
diff --git a/arch/x86_64/kernel/reboot_64.c b/arch/x86_64/kernel/reboot_64.c
deleted file mode 100644 (file)
index 368db2b..0000000
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Various gunk just to reboot the machine. */ 
-#include <linux/module.h>
-#include <linux/reboot.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/kernel.h>
-#include <linux/ctype.h>
-#include <linux/string.h>
-#include <linux/pm.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <asm/io.h>
-#include <asm/delay.h>
-#include <asm/hw_irq.h>
-#include <asm/system.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/apic.h>
-#include <asm/iommu.h>
-
-/*
- * Power off function, if any
- */
-void (*pm_power_off)(void);
-EXPORT_SYMBOL(pm_power_off);
-
-static long no_idt[3];
-static enum { 
-       BOOT_TRIPLE = 't',
-       BOOT_KBD = 'k'
-} reboot_type = BOOT_KBD;
-static int reboot_mode = 0;
-int reboot_force;
-
-/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
-   warm   Don't set the cold reboot flag
-   cold   Set the cold reboot flag
-   triple Force a triple fault (init)
-   kbd    Use the keyboard controller. cold reset (default)
-   force  Avoid anything that could hang.
- */ 
-static int __init reboot_setup(char *str)
-{
-       for (;;) {
-               switch (*str) {
-               case 'w': 
-                       reboot_mode = 0x1234;
-                       break;
-
-               case 'c':
-                       reboot_mode = 0;
-                       break;
-
-               case 't':
-               case 'b':
-               case 'k':
-                       reboot_type = *str;
-                       break;
-               case 'f':
-                       reboot_force = 1;
-                       break;
-               }
-               if((str = strchr(str,',')) != NULL)
-                       str++;
-               else
-                       break;
-       }
-       return 1;
-}
-
-__setup("reboot=", reboot_setup);
-
-static inline void kb_wait(void)
-{
-       int i;
-
-       for (i=0; i<0x10000; i++)
-               if ((inb_p(0x64) & 0x02) == 0)
-                       break;
-}
-
-void machine_shutdown(void)
-{
-       unsigned long flags;
-
-       /* Stop the cpus and apics */
-#ifdef CONFIG_SMP
-       int reboot_cpu_id;
-
-       /* The boot cpu is always logical cpu 0 */
-       reboot_cpu_id = 0;
-
-       /* Make certain the cpu I'm about to reboot on is online */
-       if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
-               reboot_cpu_id = smp_processor_id();
-       }
-
-       /* Make certain I only run on the appropriate processor */
-       set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
-
-       /* O.K Now that I'm on the appropriate processor,
-        * stop all of the others.
-        */
-       smp_send_stop();
-#endif
-
-       local_irq_save(flags);
-
-#ifndef CONFIG_SMP
-       disable_local_APIC();
-#endif
-
-       disable_IO_APIC();
-
-       local_irq_restore(flags);
-
-       pci_iommu_shutdown();
-}
-
-void machine_emergency_restart(void)
-{
-       int i;
-
-       /* Tell the BIOS if we want cold or warm reboot */
-       *((unsigned short *)__va(0x472)) = reboot_mode;
-       
-       for (;;) {
-               /* Could also try the reset bit in the Hammer NB */
-               switch (reboot_type) { 
-               case BOOT_KBD:
-               for (i=0; i<10; i++) {
-                       kb_wait();
-                       udelay(50);
-                       outb(0xfe,0x64);         /* pulse reset low */
-                       udelay(50);
-               }
-
-               case BOOT_TRIPLE: 
-                       __asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
-                       __asm__ __volatile__("int3");
-
-                       reboot_type = BOOT_KBD;
-                       break;
-               }      
-       }      
-}
-
-void machine_restart(char * __unused)
-{
-       printk("machine restart\n");
-
-       if (!reboot_force) {
-               machine_shutdown();
-       }
-       machine_emergency_restart();
-}
-
-void machine_halt(void)
-{
-}
-
-void machine_power_off(void)
-{
-       if (pm_power_off) {
-               if (!reboot_force) {
-                       machine_shutdown();
-               }
-               pm_power_off();
-       }
-}
-
diff --git a/arch/x86_64/kernel/relocate_kernel_64.S b/arch/x86_64/kernel/relocate_kernel_64.S
deleted file mode 100644 (file)
index 14e9587..0000000
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * relocate_kernel.S - put the kernel image in place to boot
- * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2.  See the file COPYING for more details.
- */
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-#include <asm/kexec.h>
-
-/*
- * Must be relocatable PIC code callable as a C function
- */
-
-#define PTR(x) (x << 3)
-#define PAGE_ALIGNED (1 << PAGE_SHIFT)
-#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
-
-       .text
-       .align PAGE_ALIGNED
-       .code64
-       .globl relocate_kernel
-relocate_kernel:
-       /* %rdi indirection_page
-        * %rsi page_list
-        * %rdx start address
-        */
-
-       /*
-        * Two four-level page-table walks follow.  In each walk %r10
-        * holds a 9-bit index mask and %cl the matching shift; the
-        * shift is 3 less than the bit position of the index so that
-        * the result is already a byte offset into a table of 8-byte
-        * entries.  After every level the mask and the shift both move
-        * down by 9 bits.  All entries are written with PAGE_ATTR.
-        * Execution falls through into relocate_new_kernel at the end.
-        */
-
-       /* map the control page at its virtual address */
-
-       movq    $0x0000ff8000000000, %r10        /* mask */
-       mov     $(39 - 3), %cl                   /* bits to shift */
-       movq    PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
-
-       /* level 1: PGD entry -> PUD_0 */
-       movq    %r11, %r9
-       andq    %r10, %r9
-       shrq    %cl, %r9
-
-       movq    PTR(VA_PGD)(%rsi), %r8
-       addq    %r8, %r9
-       movq    PTR(PA_PUD_0)(%rsi), %r8
-       orq     $PAGE_ATTR, %r8
-       movq    %r8, (%r9)
-
-       shrq    $9, %r10
-       sub     $9, %cl
-
-       /* level 2: PUD_0 entry -> PMD_0 */
-       movq    %r11, %r9
-       andq    %r10, %r9
-       shrq    %cl, %r9
-
-       movq    PTR(VA_PUD_0)(%rsi), %r8
-       addq    %r8, %r9
-       movq    PTR(PA_PMD_0)(%rsi), %r8
-       orq     $PAGE_ATTR, %r8
-       movq    %r8, (%r9)
-
-       shrq    $9, %r10
-       sub     $9, %cl
-
-       /* level 3: PMD_0 entry -> PTE_0 */
-       movq    %r11, %r9
-       andq    %r10, %r9
-       shrq    %cl, %r9
-
-       movq    PTR(VA_PMD_0)(%rsi), %r8
-       addq    %r8, %r9
-       movq    PTR(PA_PTE_0)(%rsi), %r8
-       orq     $PAGE_ATTR, %r8
-       movq    %r8, (%r9)
-
-       shrq    $9, %r10
-       sub     $9, %cl
-
-       /* level 4: PTE_0 entry -> physical control page */
-       movq    %r11, %r9
-       andq    %r10, %r9
-       shrq    %cl, %r9
-
-       movq    PTR(VA_PTE_0)(%rsi), %r8
-       addq    %r8, %r9
-       movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
-       orq     $PAGE_ATTR, %r8
-       movq    %r8, (%r9)
-
-       /* identity map the control page at its physical address */
-
-       movq    $0x0000ff8000000000, %r10        /* mask */
-       mov     $(39 - 3), %cl                   /* bits to shift */
-       movq    PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
-
-       /* level 1: PGD entry -> PUD_1 */
-       movq    %r11, %r9
-       andq    %r10, %r9
-       shrq    %cl, %r9
-
-       movq    PTR(VA_PGD)(%rsi), %r8
-       addq    %r8, %r9
-       movq    PTR(PA_PUD_1)(%rsi), %r8
-       orq     $PAGE_ATTR, %r8
-       movq    %r8, (%r9)
-
-       shrq    $9, %r10
-       sub     $9, %cl
-
-       /* level 2: PUD_1 entry -> PMD_1 */
-       movq    %r11, %r9
-       andq    %r10, %r9
-       shrq    %cl, %r9
-
-       movq    PTR(VA_PUD_1)(%rsi), %r8
-       addq    %r8, %r9
-       movq    PTR(PA_PMD_1)(%rsi), %r8
-       orq     $PAGE_ATTR, %r8
-       movq    %r8, (%r9)
-
-       shrq    $9, %r10
-       sub     $9, %cl
-
-       /* level 3: PMD_1 entry -> PTE_1 */
-       movq    %r11, %r9
-       andq    %r10, %r9
-       shrq    %cl, %r9
-
-       movq    PTR(VA_PMD_1)(%rsi), %r8
-       addq    %r8, %r9
-       movq    PTR(PA_PTE_1)(%rsi), %r8
-       orq     $PAGE_ATTR, %r8
-       movq    %r8, (%r9)
-
-       shrq    $9, %r10
-       sub     $9, %cl
-
-       /* level 4: PTE_1 entry -> physical control page */
-       movq    %r11, %r9
-       andq    %r10, %r9
-       shrq    %cl, %r9
-
-       movq    PTR(VA_PTE_1)(%rsi), %r8
-       addq    %r8, %r9
-       movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
-       orq     $PAGE_ATTR, %r8
-       movq    %r8, (%r9)
-
-relocate_new_kernel:
-       /* %rdi indirection_page
-        * %rsi page_list
-        * %rdx start address
-        */
-
-       /*
-        * Switch to the page tables at PA_PGD, move the stack to the
-        * top of the physical control page and jump to identity_mapped
-        * through that page's physical address.
-        */
-
-       /* zero out flags, and disable interrupts */
-       pushq $0
-       popfq
-
-       /* get physical address of control page now */
-       /* this is impossible after page table switch */
-       movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
-
-       /* get physical address of page table now too */
-       movq    PTR(PA_TABLE_PAGE)(%rsi), %rcx
-
-       /* switch to new set of page tables */
-       movq    PTR(PA_PGD)(%rsi), %r9
-       movq    %r9, %cr3
-
-       /* setup a new stack at the end of the physical control page */
-       lea     4096(%r8), %rsp
-
-       /* jump to identity mapped page */
-       /*
-        * %r8 = control page + offset of identity_mapped from
-        * relocate_kernel; push/ret performs the absolute jump.
-        * NOTE(review): assumes the control page holds a copy of this
-        * code starting at relocate_kernel -- confirm in the caller.
-        */
-       addq    $(identity_mapped - relocate_kernel), %r8
-       pushq   %r8
-       ret
-
-identity_mapped:
-       /*
-        * Entered with:
-        *   %rdi  indirection_page (first entry of the copy list)
-        *   %rdx  start address of the new kernel
-        *   %rcx  physical address of the identity page table
-        *   %rsp  top of the physical control page
-        *
-        * Puts cr0/cr4 into a known state, switches to the page table
-        * in %rcx, processes the indirection list (copying one page of
-        * 512 quadwords per source entry), clears every general
-        * purpose register except %rsp and finally "returns" to the
-        * start address pushed below.
-        */
-
-       /* store the start address on the stack */
-       pushq   %rdx
-
-       /* Set cr0 to a known state:
-        * 31 1 == Paging enabled
-        * 18 0 == Alignment check disabled
-        * 16 0 == Write protect disabled
-        * 3  0 == No task switch
-        * 2  0 == Don't do FP software emulation.
-        * 0  1 == Protected mode enabled
-        */
-       movq    %cr0, %rax
-       andq    $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
-       orl     $((1<<31)|(1<<0)), %eax
-       movq    %rax, %cr0
-
-       /* Set cr4 to a known state:
-        * 10 0 == xmm exceptions disabled
-        * 9  0 == xmm registers instructions disabled
-        * 8  0 == performance monitoring counter disabled
-        * 7  0 == page global disabled
-        * 6  0 == machine check exceptions disabled
-        * 5  1 == physical address extension enabled
-        * 4  0 == page size extensions disabled
-        * 3  0 == Debug extensions disabled
-        * 2  0 == Time stamp disable (disabled)
-        * 1  0 == Protected mode virtual interrupts disabled
-        * 0  0 == VME disabled
-        */
-
-       movq    $((1<<5)), %rax
-       movq    %rax, %cr4
-
-       jmp 1f
-1:
-
-       /* Switch to the identity mapped page tables,
-        * and flush the TLB.
-       */
-       movq    %rcx, %cr3
-
-       /* Do the copies */
-       movq    %rdi, %rcx      /* Put the indirection_page in %rcx */
-       xorq    %rdi, %rdi
-       xorq    %rsi, %rsi
-       jmp     1f
-
-0:     /* top, read another word for the indirection page */
-
-       movq    (%rbx), %rcx
-       addq    $8,     %rbx
-1:
-       testq   $0x1,   %rcx  /* is it a destination page? */
-       jz      2f
-       movq    %rcx,   %rdi
-       andq    $0xfffffffffffff000, %rdi
-       jmp     0b
-2:
-       testq   $0x2,   %rcx  /* is it an indirection page? */
-       jz      2f
-       movq    %rcx,   %rbx
-       andq    $0xfffffffffffff000, %rbx
-       jmp     0b
-2:
-       testq   $0x4,   %rcx  /* is it the done indicator? */
-       jz      2f
-       jmp     3f
-2:
-       testq   $0x8,   %rcx  /* is it the source indicator? */
-       jz      0b            /* Ignore it otherwise */
-       movq    %rcx,   %rsi  /* For every source page do a copy */
-       andq    $0xfffffffffffff000, %rsi
-
-       movq    $512,   %rcx  /* 512 quadwords == one 4096 byte page */
-       rep ; movsq
-       jmp     0b
-3:
-
-       /* To be certain of avoiding problems with self-modifying code
-        * I need to execute a serializing instruction here.
-        * So I flush the TLB by reloading %cr3 here, it's handy,
-        * and not processor dependent.
-        */
-       movq    %cr3, %rax
-       movq    %rax, %cr3
-
-       /* set all of the registers to known values */
-       /* leave %rsp alone */
-
-       xorq    %rax, %rax
-       xorq    %rbx, %rbx
-       xorq    %rcx, %rcx
-       xorq    %rdx, %rdx
-       xorq    %rsi, %rsi
-       xorq    %rdi, %rdi
-       xorq    %rbp, %rbp
-       xorq    %r8,  %r8
-       xorq    %r9,  %r9
-       /*
-        * Clear %r10 itself.  The previous "xorq %r10, %r9" copied the
-        * stale %r10 into the freshly cleared %r9 and left %r10 set.
-        */
-       xorq    %r10, %r10
-       xorq    %r11, %r11
-       xorq    %r12, %r12
-       xorq    %r13, %r13
-       xorq    %r14, %r14
-       xorq    %r15, %r15
-
-       /* pops the start address pushed on entry and jumps to it */
-       ret
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
deleted file mode 100644 (file)
index 1200aaa..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-/* 
- * X86-64 specific CPU setup.
- * Copyright (C) 1995  Linus Torvalds
- * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
- * See setup.c for older changelog.
- */ 
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/bootmem.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <asm/bootsetup.h>
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/mmu_context.h>
-#include <asm/smp.h>
-#include <asm/i387.h>
-#include <asm/percpu.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-
-char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
-
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
-struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
-
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-
-char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
-
-unsigned long __supported_pte_mask __read_mostly = ~0UL;
-static int do_not_nx __cpuinitdata = 0;
-
-/* noexec=on|off
-Control non executable mappings for 64bit processes.
-
-on     Enable(default)
-off    Disable
-*/ 
-/*
- * Early handler for "noexec": a value starting with "on" sets _PAGE_NX
- * in __supported_pte_mask and clears do_not_nx; a value starting with
- * "off" does the opposite.  Any other value is accepted and ignored.
- * Returns -EINVAL only when no value string is supplied.
- */
-static int __init nonx_setup(char *str)
-{
-       if (str == NULL)
-               return -EINVAL;
-
-       if (strncmp(str, "on", 2) == 0) {
-               do_not_nx = 0;
-               __supported_pte_mask |= _PAGE_NX;
-       } else if (strncmp(str, "off", 3) == 0) {
-               __supported_pte_mask &= ~_PAGE_NX;
-               do_not_nx = 1;
-       }
-
-       return 0;
-}
-early_param("noexec", nonx_setup);
-
-int force_personality32 = 0; 
-
-/* noexec32=on|off
-Control non executable heap for 32bit processes.
-To control the stack too use noexec=off
-
-on     PROT_READ does not imply PROT_EXEC for 32bit processes
-off    PROT_READ implies PROT_EXEC (default)
-*/
-/*
- * Handler for "noexec32=": "on" clears READ_IMPLIES_EXEC in
- * force_personality32, "off" sets it, anything else changes nothing.
- * Always returns 1.
- */
-static int __init nonx32_setup(char *str)
-{
-       if (strcmp(str, "on") == 0) {
-               force_personality32 &= ~READ_IMPLIES_EXEC;
-       } else if (strcmp(str, "off") == 0) {
-               force_personality32 |= READ_IMPLIES_EXEC;
-       }
-
-       return 1;
-}
-__setup("noexec32=", nonx32_setup);
-
-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
-/*
- * Allocate one PERCPU_ENOUGH_ROOM sized area per possible CPU, from
- * the CPU's own node when that node has data and from generic bootmem
- * otherwise.  Each area receives a copy of the original per-cpu
- * section and its offset from __per_cpu_start is stored in the PDA.
- */
-void __init setup_per_cpu_areas(void)
-{
-       unsigned long size;
-       int cpu;
-
-#ifdef CONFIG_HOTPLUG_CPU
-       prefill_possible_map();
-#endif
-
-       /* Copy section for each CPU (we discard the original) */
-       size = PERCPU_ENOUGH_ROOM;
-       printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
-
-       for_each_cpu_mask (cpu, cpu_possible_map) {
-               char *area;
-
-               if (NODE_DATA(cpu_to_node(cpu))) {
-                       area = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(cpu)), size);
-               } else {
-                       printk("cpu with no node %d, num_online_nodes %d\n",
-                              cpu, num_online_nodes());
-                       area = alloc_bootmem_pages(size);
-               }
-               if (area == NULL)
-                       panic("Cannot allocate cpu data for CPU %d\n", cpu);
-
-               memcpy(area, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-               cpu_pda(cpu)->data_offset = area - __per_cpu_start;
-       }
-}
-
-/*
- * Initialise the PDA of @cpu: load null selectors into %fs and %gs,
- * write the PDA address to MSR_GS_BASE, fill in the basic PDA fields
- * and set up the interrupt stack pointer.
- */
-void pda_init(int cpu)
-{ 
-       struct x8664_pda *pda = cpu_pda(cpu);
-
-       /* Setup up data that may be needed in __get_free_pages early */
-       asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
-       /* Memory clobbers used to order PDA accessed */
-       mb();
-       wrmsrl(MSR_GS_BASE, pda);
-       mb();
-
-       pda->cpunumber = cpu; 
-       pda->irqcount = -1;
-       pda->kernelstack = 
-               (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
-       pda->active_mm = &init_mm;
-       pda->mmu_state = 0;
-
-       if (cpu == 0) {
-               /* others are initialized in smpboot.c */
-               pda->pcurrent = &init_task;
-               /* CPU 0 uses the statically reserved boot_cpu_stack. */
-               pda->irqstackptr = boot_cpu_stack; 
-       } else {
-               /* Every other CPU gets freshly allocated pages. */
-               pda->irqstackptr = (char *)
-                       __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
-               if (!pda->irqstackptr)
-                       panic("cannot allocate irqstack for cpu %d", cpu); 
-       }
-
-
-       /* Point 64 bytes below the end of the interrupt stack. */
-       pda->irqstackptr += IRQSTACKSIZE-64;
-} 
-
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
-
-extern asmlinkage void ignore_sysret(void);
-
-/* May not be marked __init: used by software suspend */
-/*
- * Program the system call MSRs of the current CPU: segment selectors
- * (STAR), the 64-bit entry point (LSTAR), the CSTAR entry point and
- * the flag mask applied on entry (SYSCALL_MASK).
- */
-void syscall_init(void)
-{
-       /* 
-        * LSTAR and STAR live in a bit strange symbiosis.
-        * They both write to the same internal register. STAR allows to set CS/DS
-        * but only a 32bit target. LSTAR sets the 64bit rip.
-        */ 
-       wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
-       wrmsrl(MSR_LSTAR, system_call); 
-       wrmsrl(MSR_CSTAR, ignore_sysret);
-
-#ifdef CONFIG_IA32_EMULATION                   
-       /* With IA32 emulation the 32-bit entry setup is done here. */
-       syscall32_cpu_init ();
-#endif
-
-       /* Flags to clear on syscall */
-       wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
-}
-
-/*
- * Remove _PAGE_NX from __supported_pte_mask unless EFER has the NX
- * bit set and do_not_nx is clear.
- */
-void __cpuinit check_efer(void)
-{
-       unsigned long efer_val;
-
-       rdmsrl(MSR_EFER, efer_val);
-
-       if ((efer_val & EFER_NX) && !do_not_nx)
-               return;
-
-       __supported_pte_mask &= ~_PAGE_NX;
-}
-
-unsigned long kernel_eflags;
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init.
- */
-void __cpuinit cpu_init (void)
-{
-       int cpu = stack_smp_processor_id();
-       struct tss_struct *t = &per_cpu(init_tss, cpu);
-       struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
-       unsigned long v; 
-       char *estacks = NULL; 
-       struct task_struct *me;
-       int i;
-
-       /* CPU 0 is initialised in head64.c */
-       if (cpu != 0) {
-               pda_init(cpu);
-       } else 
-               estacks = boot_exception_stacks; 
-
-       me = current;
-
-       /* Each CPU may pass through here only once. */
-       if (cpu_test_and_set(cpu, cpu_initialized))
-               panic("CPU#%d already initialized!\n", cpu);
-
-       printk("Initializing CPU#%d\n", cpu);
-
-       clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
-       /*
-        * Initialize the per-CPU GDT with the boot GDT,
-        * and set up the GDT descriptor:
-        */
-       if (cpu)
-               memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);
-
-       cpu_gdt_descr[cpu].size = GDT_SIZE;
-       asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
-       asm volatile("lidt %0" :: "m" (idt_descr));
-
-       /* Clear the TLS descriptors of the current task (8 bytes each). */
-       memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
-       syscall_init();
-
-       wrmsrl(MSR_FS_BASE, 0);
-       wrmsrl(MSR_KERNEL_GS_BASE, 0);
-       barrier(); 
-
-       check_efer();
-
-       /*
-        * set up and load the per-CPU TSS
-        */
-       for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-               /* Allocation order per stack; the debug stack has its own. */
-               static const unsigned int order[N_EXCEPTION_STACKS] = {
-                       [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-                       [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
-               };
-               /*
-                * CPU 0 carves its stacks out of boot_exception_stacks
-                * (estacks keeps advancing); other CPUs allocate each
-                * stack separately.
-                */
-               if (cpu) {
-                       estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
-                       if (!estacks)
-                               panic("Cannot allocate exception stack %ld %d\n",
-                                     v, cpu); 
-               }
-               /* The IST entry holds the address just past the stack's end. */
-               estacks += PAGE_SIZE << order[v];
-               orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
-       }
-
-       t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
-       /*
-        * <= is required because the CPU will access up to
-        * 8 bits beyond the end of the IO permission bitmap.
-        */
-       for (i = 0; i <= IO_BITMAP_LONGS; i++)
-               t->io_bitmap[i] = ~0UL;
-
-       /* Run on init_mm as a lazy-TLB user; no user mm may be attached. */
-       atomic_inc(&init_mm.mm_count);
-       me->active_mm = &init_mm;
-       if (me->mm)
-               BUG();
-       enter_lazy_tlb(&init_mm, me);
-
-       set_tss_desc(cpu, t);
-       load_TR_desc();
-       load_LDT(&init_mm.context);
-
-       /*
-        * Clear all 6 debug registers:
-        */
-
-       set_debugreg(0UL, 0);
-       set_debugreg(0UL, 1);
-       set_debugreg(0UL, 2);
-       set_debugreg(0UL, 3);
-       set_debugreg(0UL, 6);
-       set_debugreg(0UL, 7);
-
-       fpu_init(); 
-
-       /* Record the flags value at this point in kernel_eflags. */
-       raw_local_save_flags(kernel_eflags);
-}
diff --git a/arch/x86_64/kernel/setup_64.c b/arch/x86_64/kernel/setup_64.c
deleted file mode 100644 (file)
index af838f6..0000000
+++ /dev/null
@@ -1,1117 +0,0 @@
-/*
- *  linux/arch/x86-64/kernel/setup.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *
- *  Nov 2001 Dave Jones <davej@suse.de>
- *  Forked from i386 setup code.
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/a.out.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/initrd.h>
-#include <linux/highmem.h>
-#include <linux/bootmem.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <linux/console.h>
-#include <linux/seq_file.h>
-#include <linux/crash_dump.h>
-#include <linux/root_dev.h>
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/mmzone.h>
-#include <linux/kexec.h>
-#include <linux/cpufreq.h>
-#include <linux/dmi.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-
-#include <asm/mtrr.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/msr.h>
-#include <asm/desc.h>
-#include <video/edid.h>
-#include <asm/e820.h>
-#include <asm/dma.h>
-#include <asm/mpspec.h>
-#include <asm/mmu_context.h>
-#include <asm/bootsetup.h>
-#include <asm/proto.h>
-#include <asm/setup.h>
-#include <asm/mach_apic.h>
-#include <asm/numa.h>
-#include <asm/sections.h>
-#include <asm/dmi.h>
-
-/*
- * Machine setup..
- */
-
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
-
-unsigned long mmu_cr4_features;
-
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
-
-unsigned long saved_video_mode;
-
-int force_mwait __cpuinitdata;
-
-/* 
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
-/*
- * Setup options
- */
-struct screen_info screen_info;
-EXPORT_SYMBOL(screen_info);
-struct sys_desc_table_struct {
-       unsigned short length;
-       unsigned char table[0];
-};
-
-struct edid_info edid_info;
-EXPORT_SYMBOL_GPL(edid_info);
-
-extern int root_mountflags;
-
-char __initdata command_line[COMMAND_LINE_SIZE];
-
-struct resource standard_io_resources[] = {
-       { .name = "dma1", .start = 0x00, .end = 0x1f,
-               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-       { .name = "pic1", .start = 0x20, .end = 0x21,
-               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-       { .name = "timer0", .start = 0x40, .end = 0x43,
-               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-       { .name = "timer1", .start = 0x50, .end = 0x53,
-               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-       { .name = "keyboard", .start = 0x60, .end = 0x6f,
-               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-       { .name = "dma page reg", .start = 0x80, .end = 0x8f,
-               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-       { .name = "pic2", .start = 0xa0, .end = 0xa1,
-               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-       { .name = "dma2", .start = 0xc0, .end = 0xdf,
-               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
-       { .name = "fpu", .start = 0xf0, .end = 0xff,
-               .flags = IORESOURCE_BUSY | IORESOURCE_IO }
-};
-
-#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
-
-struct resource data_resource = {
-       .name = "Kernel data",
-       .start = 0,
-       .end = 0,
-       .flags = IORESOURCE_RAM,
-};
-struct resource code_resource = {
-       .name = "Kernel code",
-       .start = 0,
-       .end = 0,
-       .flags = IORESOURCE_RAM,
-};
-
-#ifdef CONFIG_PROC_VMCORE
-/* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
-/*
- * Parse the "elfcorehdr" value with memparse() into elfcorehdr_addr.
- * Returns 0 when at least one character was consumed, -EINVAL when the
- * value is missing or nothing could be parsed.
- */
-static int __init setup_elfcorehdr(char *arg)
-{
-       char *rest;
-
-       if (arg == NULL)
-               return -EINVAL;
-
-       elfcorehdr_addr = memparse(arg, &rest);
-       if (rest > arg)
-               return 0;
-
-       return -EINVAL;
-}
-early_param("elfcorehdr", setup_elfcorehdr);
-#endif
-
-#ifndef CONFIG_NUMA
-/*
- * Non-NUMA bootmem setup for the page frames up to @end_pfn: find room
- * for the bootmem bitmap in the e820 map, initialise bootmem with it,
- * register and free the active regions, then reserve the bitmap.
- */
-static void __init
-contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{
-       unsigned long bootmap_size, bootmap;
-
-       bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
-       bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
-       /* find_e820_area() reports failure as -1L */
-       if (bootmap == -1L)
-               /* bootmap_size is unsigned long, so print it with %lu */
-               panic("Cannot find bootmem map of size %lu\n",bootmap_size);
-       bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
-       e820_register_active_regions(0, start_pfn, end_pfn);
-       free_bootmem_with_active_regions(0, end_pfn);
-       reserve_bootmem(bootmap, bootmap_size);
-} 
-#endif
-
-#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
-struct edd edd;
-#ifdef CONFIG_EDD_MODULE
-EXPORT_SYMBOL(edd);
-#endif
-/**
- * copy_edd() - Copy the BIOS EDD information
- *              from boot_params into a safe place.
- *
- */
-static inline void copy_edd(void)
-{
-       /* entry counts first, then the two tables themselves */
-       edd.edd_info_nr = EDD_NR;
-       edd.mbr_signature_nr = EDD_MBR_SIG_NR;
-       memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
-       memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
-}
-#else
-/* Built when neither CONFIG_EDD nor CONFIG_EDD_MODULE is set: no-op. */
-static inline void copy_edd(void)
-{
-}
-#endif
-
-#define EBDA_ADDR_POINTER 0x40E
-
-unsigned __initdata ebda_addr;
-unsigned __initdata ebda_size;
-
-/*
- * Locate the EBDA and record its base and page-rounded size in
- * ebda_addr / ebda_size.
- *
- * Marked __init: it writes the __initdata variables ebda_addr and
- * ebda_size and is called from setup_arch(), which is __init itself.
- */
-static void __init discover_ebda(void)
-{
-       /*
-        * there is a real-mode segmented pointer pointing to the 
-        * 4K EBDA area at 0x40E
-        */
-       ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
-       ebda_addr <<= 4;        /* real-mode segment -> linear address */
-
-       /* the first 16-bit word of the EBDA is read as its size in KiB */
-       ebda_size = *(unsigned short *)__va(ebda_addr);
-
-       /* Round EBDA up to pages */
-       if (ebda_size == 0)
-               ebda_size = 1;
-       ebda_size <<= 10;       /* KiB -> bytes */
-       ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
-       /* never treat more than 64 KiB as EBDA */
-       if (ebda_size > 64*1024)
-               ebda_size = 64*1024;
-}
-
-/*
- * Architecture specific boot-time setup.
- *
- * Copies the boot parameters into kernel variables, builds the memory
- * map, parses early parameters, maps physical memory, initialises
- * bootmem and reserves the regions that must not be handed out, then
- * sets up paging, ACPI/SMP configuration, resources and the console.
- *
- * @cmdline_p: receives a pointer to the saved copy of the command line.
- */
-void __init setup_arch(char **cmdline_p)
-{
-       printk(KERN_INFO "Command line: %s\n", boot_command_line);
-
-       /* Pick up the values passed in the boot parameters. */
-       ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
-       screen_info = SCREEN_INFO;
-       edid_info = EDID_INFO;
-       saved_video_mode = SAVED_VIDEO_MODE;
-       bootloader_type = LOADER_TYPE;
-
-#ifdef CONFIG_BLK_DEV_RAM
-       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
-       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
-       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
-#endif
-       setup_memory_region();
-       copy_edd();
-
-       if (!MOUNT_ROOT_RDONLY)
-               root_mountflags &= ~MS_RDONLY;
-       /* Describe the kernel image in init_mm and in the resource tree. */
-       init_mm.start_code = (unsigned long) &_text;
-       init_mm.end_code = (unsigned long) &_etext;
-       init_mm.end_data = (unsigned long) &_edata;
-       init_mm.brk = (unsigned long) &_end;
-
-       code_resource.start = virt_to_phys(&_text);
-       code_resource.end = virt_to_phys(&_etext)-1;
-       data_resource.start = virt_to_phys(&_etext);
-       data_resource.end = virt_to_phys(&_edata)-1;
-
-       early_identify_cpu(&boot_cpu_data);
-
-       /* Hand the caller a private copy of the command line. */
-       strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
-       *cmdline_p = command_line;
-
-       parse_early_param();
-
-       finish_e820_parsing();
-
-       e820_register_active_regions(0, 0, -1UL);
-       /*
-        * partially used pages are not usable - thus
-        * we are rounding upwards:
-        */
-       end_pfn = e820_end_of_ram();
-       num_physpages = end_pfn;
-
-       check_efer();
-
-       discover_ebda();
-
-       init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
-
-       dmi_scan_machine();
-
-#ifdef CONFIG_ACPI
-       /*
-        * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
-        * Call this early for SRAT node setup.
-        */
-       acpi_boot_table_init();
-#endif
-
-       /* How many end-of-memory variables you have, grandma! */
-       max_low_pfn = end_pfn;
-       max_pfn = end_pfn;
-       high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
-
-       /* Remove active ranges so rediscovery with NUMA-awareness happens */
-       remove_all_active_ranges();
-
-#ifdef CONFIG_ACPI_NUMA
-       /*
-        * Parse SRAT to discover nodes.
-        */
-       acpi_numa_init();
-#endif
-
-#ifdef CONFIG_NUMA
-       numa_initmem_init(0, end_pfn); 
-#else
-       contig_initmem_init(0, end_pfn);
-#endif
-
-       /* Reserve direct mapping */
-       reserve_bootmem_generic(table_start << PAGE_SHIFT, 
-                               (table_end - table_start) << PAGE_SHIFT);
-
-       /* reserve kernel */
-       reserve_bootmem_generic(__pa_symbol(&_text),
-                               __pa_symbol(&_end) - __pa_symbol(&_text));
-
-       /*
-        * reserve physical page 0 - it's a special BIOS page on many boxes,
-        * enabling clean reboots, SMP operation, laptop functions.
-        */
-       reserve_bootmem_generic(0, PAGE_SIZE);
-
-       /* reserve ebda region */
-       if (ebda_addr)
-               reserve_bootmem_generic(ebda_addr, ebda_size);
-#ifdef CONFIG_NUMA
-       /* reserve nodemap region */
-       if (nodemap_addr)
-               reserve_bootmem_generic(nodemap_addr, nodemap_size);
-#endif
-
-#ifdef CONFIG_SMP
-       /* Reserve SMP trampoline */
-       reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
-#endif
-
-#ifdef CONFIG_ACPI_SLEEP
-       /*
-        * Reserve low memory region for sleep support.
-        */
-       acpi_reserve_bootmem();
-#endif
-       /*
-        * Find and reserve possible boot-time SMP configuration:
-        */
-       find_smp_config();
-#ifdef CONFIG_BLK_DEV_INITRD
-       /*
-        * Keep the initrd only if it lies completely below the end of
-        * RAM; otherwise report it and disable it.
-        */
-       if (LOADER_TYPE && INITRD_START) {
-               if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
-                       reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
-                       initrd_start = INITRD_START + PAGE_OFFSET;
-                       initrd_end = initrd_start+INITRD_SIZE;
-               }
-               else {
-                       printk(KERN_ERR "initrd extends beyond end of memory "
-                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-                           (unsigned long)(INITRD_START + INITRD_SIZE),
-                           (unsigned long)(end_pfn << PAGE_SHIFT));
-                       initrd_start = 0;
-               }
-       }
-#endif
-#ifdef CONFIG_KEXEC
-       /* Reserve the crash kernel region when one has been set up. */
-       if (crashk_res.start != crashk_res.end) {
-               reserve_bootmem_generic(crashk_res.start,
-                       crashk_res.end - crashk_res.start + 1);
-       }
-#endif
-
-       paging_init();
-
-#ifdef CONFIG_PCI
-       early_quirks();
-#endif
-
-       /*
-        * set this early, so we dont allocate cpu0
-        * if MADT list doesnt list BSP first
-        * mpparse.c/MP_processor_info() allocates logical cpu numbers.
-        */
-       cpu_set(0, cpu_present_map);
-#ifdef CONFIG_ACPI
-       /*
-        * Read APIC and some other early information from ACPI tables.
-        */
-       acpi_boot_init();
-#endif
-
-       init_cpu_to_node();
-
-       /*
-        * get boot-time SMP configuration:
-        */
-       if (smp_found_config)
-               get_smp_config();
-       init_apic_mappings();
-
-       /*
-        * We trust e820 completely. No explicit ROM probing in memory.
-        */
-       e820_reserve_resources(); 
-       e820_mark_nosave_regions();
-
-       {
-       unsigned i;
-       /* request I/O space for devices used on all i[345]86 PCs */
-       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-               request_resource(&ioport_resource, &standard_io_resources[i]);
-       }
-
-       e820_setup_gap();
-
-#ifdef CONFIG_VT
-       /* Prefer the VGA console, fall back to the dummy console. */
-#if defined(CONFIG_VGA_CONSOLE)
-       conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
-       conswitchp = &dummy_con;
-#endif
-#endif
-}
-
-/*
- * Read the processor name string from CPUID leaves 0x80000002 to
- * 0x80000004 into c->x86_model_id (16 bytes per leaf) and terminate
- * it.  Returns 0 when the CPU does not provide those leaves, 1
- * otherwise.
- */
-static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
-{
-       unsigned int *regs = (unsigned int *) c->x86_model_id;
-       unsigned int leaf;
-
-       if (c->extended_cpuid_level < 0x80000004)
-               return 0;
-
-       for (leaf = 0; leaf < 3; leaf++)
-               cpuid(0x80000002 + leaf,
-                     &regs[4 * leaf], &regs[4 * leaf + 1],
-                     &regs[4 * leaf + 2], &regs[4 * leaf + 3]);
-
-       c->x86_model_id[48] = 0;
-       return 1;
-}
-
-
-/*
- * Report cache sizes and record cache, TLB, power management and
- * address size information in @c, using whichever of the extended
- * CPUID leaves 0x80000005 to 0x80000008 the CPU provides.
- */
-static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
-{
-       unsigned int n, dummy, eax, ebx, ecx, edx;
-
-       n = c->extended_cpuid_level;
-
-       if (n >= 0x80000005) {
-               /* bits 31..24 are printed as KB, bits 7..0 as line size */
-               cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
-               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
-                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
-               c->x86_cache_size=(ecx>>24)+(edx>>24);
-               /* On K8 L1 TLB is inclusive, so don't count it */
-               c->x86_tlbsize = 0;
-       }
-
-       if (n >= 0x80000006) {
-               /*
-                * One CPUID call supplies both ebx and ecx; the extra
-                * cpuid_ecx() that used to follow only repeated it.
-                */
-               cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
-               c->x86_cache_size = ecx >> 16;
-               c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
-
-               printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
-               c->x86_cache_size, ecx & 0xFF);
-       }
-
-       if (n >= 0x80000007)
-               cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
-       if (n >= 0x80000008) {
-               /* eax: bits 15..8 virtual, bits 7..0 physical address bits */
-               cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
-               c->x86_virt_bits = (eax >> 8) & 0xff;
-               c->x86_phys_bits = eax & 0xff;
-       }
-}
-
-#ifdef CONFIG_NUMA
-/*
- * Find an online node belonging to some other APIC ID: scan downwards
- * from apicid - 1 first, then upwards from apicid + 1, and fall back
- * to the first online node if neither scan finds one.
- */
-static int nearby_node(int apicid)
-{
-       int id, node;
-
-       for (id = apicid - 1; id >= 0; id--) {
-               node = apicid_to_node[id];
-               if (node == NUMA_NO_NODE || !node_online(node))
-                       continue;
-               return node;
-       }
-
-       for (id = apicid + 1; id < MAX_LOCAL_APIC; id++) {
-               node = apicid_to_node[id];
-               if (node == NUMA_NO_NODE || !node_online(node))
-                       continue;
-               return node;
-       }
-
-       return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On an AMD dual core setup the lower bits of the APIC id distinguish the
- * cores.  Assumes number of cores is a power of two.
- *
- * Fills in c->x86_max_cores, c->cpu_core_id and c->phys_proc_id and, with
- * CONFIG_NUMA, binds the current CPU to a node.  Does nothing without
- * CONFIG_SMP.
- *
- * NOTE(review): this is marked __init; confirm that no caller can run
- * after init memory has been freed (e.g. a __cpuinit path).
- */
-static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-       unsigned bits;
-#ifdef CONFIG_NUMA
-       int cpu = smp_processor_id();
-       int node = 0;
-       unsigned apicid = hard_smp_processor_id();
-#endif
-       unsigned ecx = cpuid_ecx(0x80000008);
-
-       /* ecx bits 7..0 hold the core count minus one */
-       c->x86_max_cores = (ecx & 0xff) + 1;
-
-       /* CPU telling us the core id bits shift? */
-       bits = (ecx >> 12) & 0xF;
-
-       /* Otherwise recompute */
-       if (bits == 0) {
-               /* smallest bits with (1 << bits) >= x86_max_cores */
-               while ((1 << bits) < c->x86_max_cores)
-                       bits++;
-       }
-
-       /* Low order bits define the core id (index of core in socket) */
-       c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
-       /* Convert the APIC ID into the socket ID */
-       c->phys_proc_id = phys_pkg_id(bits);
-
-#ifdef CONFIG_NUMA
-       /* Start from the socket id; an explicit APIC id mapping wins. */
-       node = c->phys_proc_id;
-       if (apicid_to_node[apicid] != NUMA_NO_NODE)
-               node = apicid_to_node[apicid];
-       if (!node_online(node)) {
-               /* Two possibilities here:
-                  - The CPU is missing memory and no node was created.
-                  In that case try picking one from a nearby CPU
-                  - The APIC IDs differ from the HyperTransport node IDs
-                  which the K8 northbridge parsing fills in.
-                  Assume they are all increased by a constant offset,
-                  but in the same order as the HT nodeids.
-                  If that doesn't result in a usable node fall back to the
-                  path for the previous case.  */
-               int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
-               if (ht_nodeid >= 0 &&
-                   apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-                       node = apicid_to_node[ht_nodeid];
-               /* Pick a nearby node */
-               if (!node_online(node))
-                       node = nearby_node(apicid);
-       }
-       numa_set_node(cpu, node);
-
-       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
-       unsigned level;
-
-#ifdef CONFIG_SMP
-       unsigned long value;
-
-       /*
-        * Disable TLB flush filter by setting HWCR.FFDIS on K8
-        * bit 6 of msr C001_0015
-        *
-        * Errata 63 for SH-B3 steppings
-        * Errata 122 for all steppings (F+ have it disabled by default)
-        */
-       if (c->x86 == 15) {
-               rdmsrl(MSR_K8_HWCR, value);
-               value |= 1 << 6;
-               wrmsrl(MSR_K8_HWCR, value);
-       }
-#endif
-
-       /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
-          3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
-       clear_bit(0*32+31, &c->x86_capability);
-       
-       /* On C+ stepping K8 rep microcode works well for copy/memset */
-       level = cpuid_eax(1);
-       if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
-               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
-       if (c->x86 == 0x10)
-               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
-
-       /* Enable workaround for FXSAVE leak */
-       if (c->x86 >= 6)
-               set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
-
-       level = get_model_name(c);
-       if (!level) {
-               switch (c->x86) { 
-               case 15:
-                       /* Should distinguish Models here, but this is only
-                          a fallback anyways. */
-                       strcpy(c->x86_model_id, "Hammer");
-                       break; 
-               } 
-       } 
-       display_cacheinfo(c);
-
-       /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
-       if (c->x86_power & (1<<8))
-               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
-
-       /* Multi core CPU? */
-       if (c->extended_cpuid_level >= 0x80000008)
-               amd_detect_cmp(c);
-
-       if (c->extended_cpuid_level >= 0x80000006 &&
-               (cpuid_edx(0x80000006) & 0xf000))
-               num_cache_leaves = 4;
-       else
-               num_cache_leaves = 3;
-
-       if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
-               set_bit(X86_FEATURE_K8, &c->x86_capability);
-
-       /* RDTSC can be speculated around */
-       clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
-
-       /* Family 10 doesn't support C states in MWAIT so don't use it */
-       if (c->x86 == 0x10 && !force_mwait)
-               clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
-}
-
-static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-       u32     eax, ebx, ecx, edx;
-       int     index_msb, core_bits;
-
-       cpuid(1, &eax, &ebx, &ecx, &edx);
-
-
-       if (!cpu_has(c, X86_FEATURE_HT))
-               return;
-       if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
-               goto out;
-
-       smp_num_siblings = (ebx & 0xff0000) >> 16;
-
-       if (smp_num_siblings == 1) {
-               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-       } else if (smp_num_siblings > 1 ) {
-
-               if (smp_num_siblings > NR_CPUS) {
-                       printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
-                       smp_num_siblings = 1;
-                       return;
-               }
-
-               index_msb = get_count_order(smp_num_siblings);
-               c->phys_proc_id = phys_pkg_id(index_msb);
-
-               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
-               index_msb = get_count_order(smp_num_siblings) ;
-
-               core_bits = get_count_order(c->x86_max_cores);
-
-               c->cpu_core_id = phys_pkg_id(index_msb) &
-                                              ((1 << core_bits) - 1);
-       }
-out:
-       if ((c->x86_max_cores * smp_num_siblings) > 1) {
-               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
-               printk(KERN_INFO  "CPU: Processor Core ID: %d\n", c->cpu_core_id);
-       }
-
-#endif
-}
-
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
-       unsigned int eax, t;
-
-       if (c->cpuid_level < 4)
-               return 1;
-
-       cpuid_count(4, 0, &eax, &t, &t, &t);
-
-       if (eax & 0x1f)
-               return ((eax >> 26) + 1);
-       else
-               return 1;
-}
-
-static void srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
-       unsigned node;
-       int cpu = smp_processor_id();
-       int apicid = hard_smp_processor_id();
-
-       /* Don't do the funky fallback heuristics the AMD version employs
-          for now. */
-       node = apicid_to_node[apicid];
-       if (node == NUMA_NO_NODE)
-               node = first_node(node_online_map);
-       numa_set_node(cpu, node);
-
-       printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
-       /* Cache sizes */
-       unsigned n;
-
-       init_intel_cacheinfo(c);
-       if (c->cpuid_level > 9 ) {
-               unsigned eax = cpuid_eax(10);
-               /* Check for version and the number of counters */
-               if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
-                       set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
-       }
-
-       if (cpu_has_ds) {
-               unsigned int l1, l2;
-               rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
-               if (!(l1 & (1<<11)))
-                       set_bit(X86_FEATURE_BTS, c->x86_capability);
-               if (!(l1 & (1<<12)))
-                       set_bit(X86_FEATURE_PEBS, c->x86_capability);
-       }
-
-       n = c->extended_cpuid_level;
-       if (n >= 0x80000008) {
-               unsigned eax = cpuid_eax(0x80000008);
-               c->x86_virt_bits = (eax >> 8) & 0xff;
-               c->x86_phys_bits = eax & 0xff;
-               /* CPUID workaround for Intel 0F34 CPU */
-               if (c->x86_vendor == X86_VENDOR_INTEL &&
-                   c->x86 == 0xF && c->x86_model == 0x3 &&
-                   c->x86_mask == 0x4)
-                       c->x86_phys_bits = 36;
-       }
-
-       if (c->x86 == 15)
-               c->x86_cache_alignment = c->x86_clflush_size * 2;
-       if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
-           (c->x86 == 0x6 && c->x86_model >= 0x0e))
-               set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
-       if (c->x86 == 6)
-               set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
-       if (c->x86 == 15)
-               set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
-       else
-               clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
-       c->x86_max_cores = intel_num_cpu_cores(c);
-
-       srat_detect_node();
-}
-
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
-{
-       char *v = c->x86_vendor_id;
-
-       if (!strcmp(v, "AuthenticAMD"))
-               c->x86_vendor = X86_VENDOR_AMD;
-       else if (!strcmp(v, "GenuineIntel"))
-               c->x86_vendor = X86_VENDOR_INTEL;
-       else
-               c->x86_vendor = X86_VENDOR_UNKNOWN;
-}
-
-struct cpu_model_info {
-       int vendor;
-       int family;
-       char *model_names[16];
-};
-
-/* Do some early cpuid on the boot CPU to get some parameter that are
-   needed before check_bugs. Everything advanced is in identify_cpu
-   below. */
-void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
-{
-       u32 tfms;
-
-       c->loops_per_jiffy = loops_per_jiffy;
-       c->x86_cache_size = -1;
-       c->x86_vendor = X86_VENDOR_UNKNOWN;
-       c->x86_model = c->x86_mask = 0; /* So far unknown... */
-       c->x86_vendor_id[0] = '\0'; /* Unset */
-       c->x86_model_id[0] = '\0';  /* Unset */
-       c->x86_clflush_size = 64;
-       c->x86_cache_alignment = c->x86_clflush_size;
-       c->x86_max_cores = 1;
-       c->extended_cpuid_level = 0;
-       memset(&c->x86_capability, 0, sizeof c->x86_capability);
-
-       /* Get vendor name */
-       cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
-             (unsigned int *)&c->x86_vendor_id[0],
-             (unsigned int *)&c->x86_vendor_id[8],
-             (unsigned int *)&c->x86_vendor_id[4]);
-               
-       get_cpu_vendor(c);
-
-       /* Initialize the standard set of capabilities */
-       /* Note that the vendor-specific code below might override */
-
-       /* Intel-defined flags: level 0x00000001 */
-       if (c->cpuid_level >= 0x00000001) {
-               __u32 misc;
-               cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
-                     &c->x86_capability[0]);
-               c->x86 = (tfms >> 8) & 0xf;
-               c->x86_model = (tfms >> 4) & 0xf;
-               c->x86_mask = tfms & 0xf;
-               if (c->x86 == 0xf)
-                       c->x86 += (tfms >> 20) & 0xff;
-               if (c->x86 >= 0x6)
-                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
-               if (c->x86_capability[0] & (1<<19)) 
-                       c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
-       } else {
-               /* Have CPUID level 0 only - unheard of */
-               c->x86 = 4;
-       }
-
-#ifdef CONFIG_SMP
-       c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
-#endif
-}
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
-       int i;
-       u32 xlvl;
-
-       early_identify_cpu(c);
-
-       /* AMD-defined flags: level 0x80000001 */
-       xlvl = cpuid_eax(0x80000000);
-       c->extended_cpuid_level = xlvl;
-       if ((xlvl & 0xffff0000) == 0x80000000) {
-               if (xlvl >= 0x80000001) {
-                       c->x86_capability[1] = cpuid_edx(0x80000001);
-                       c->x86_capability[6] = cpuid_ecx(0x80000001);
-               }
-               if (xlvl >= 0x80000004)
-                       get_model_name(c); /* Default name */
-       }
-
-       /* Transmeta-defined flags: level 0x80860001 */
-       xlvl = cpuid_eax(0x80860000);
-       if ((xlvl & 0xffff0000) == 0x80860000) {
-               /* Don't set x86_cpuid_level here for now to not confuse. */
-               if (xlvl >= 0x80860001)
-                       c->x86_capability[2] = cpuid_edx(0x80860001);
-       }
-
-       init_scattered_cpuid_features(c);
-
-       c->apicid = phys_pkg_id(0);
-
-       /*
-        * Vendor-specific initialization.  In this section we
-        * canonicalize the feature flags, meaning if there are
-        * features a certain CPU supports which CPUID doesn't
-        * tell us, CPUID claiming incorrect flags, or other bugs,
-        * we handle them here.
-        *
-        * At the end of this section, c->x86_capability better
-        * indicate the features this CPU genuinely supports!
-        */
-       switch (c->x86_vendor) {
-       case X86_VENDOR_AMD:
-               init_amd(c);
-               break;
-
-       case X86_VENDOR_INTEL:
-               init_intel(c);
-               break;
-
-       case X86_VENDOR_UNKNOWN:
-       default:
-               display_cacheinfo(c);
-               break;
-       }
-
-       select_idle_routine(c);
-       detect_ht(c); 
-
-       /*
-        * On SMP, boot_cpu_data holds the common feature set between
-        * all CPUs; so make sure that we indicate which features are
-        * common between the CPUs.  The first time this routine gets
-        * executed, c == &boot_cpu_data.
-        */
-       if (c != &boot_cpu_data) {
-               /* AND the already accumulated flags with these */
-               for (i = 0 ; i < NCAPINTS ; i++)
-                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
-       }
-
-#ifdef CONFIG_X86_MCE
-       mcheck_init(c);
-#endif
-       if (c != &boot_cpu_data)
-               mtrr_ap_init();
-#ifdef CONFIG_NUMA
-       numa_add_cpu(smp_processor_id());
-#endif
-}
-
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
-{
-       if (c->x86_model_id[0])
-               printk("%s", c->x86_model_id);
-
-       if (c->x86_mask || c->cpuid_level >= 0) 
-               printk(" stepping %02x\n", c->x86_mask);
-       else
-               printk("\n");
-}
-
-/*
- *     Get CPU information for use by the procfs.
- */
-
-static int show_cpuinfo(struct seq_file *m, void *v)
-{
-       struct cpuinfo_x86 *c = v;
-
-       /* 
-        * These flag bits must match the definitions in <asm/cpufeature.h>.
-        * NULL means this bit is undefined or reserved; either way it doesn't
-        * have meaning as far as Linux is concerned.  Note that it's important
-        * to realize there is a difference between this table and CPUID -- if
-        * applications want to get the raw CPUID data, they should access
-        * /dev/cpu/<cpu_nr>/cpuid instead.
-        */
-       static char *x86_cap_flags[] = {
-               /* Intel-defined */
-               "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
-               "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
-               "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
-               "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
-
-               /* AMD-defined */
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
-               NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
-               "3dnowext", "3dnow",
-
-               /* Transmeta-defined */
-               "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-               /* Other (Linux-defined) */
-               "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
-               NULL, NULL, NULL, NULL,
-               "constant_tsc", "up", NULL, "arch_perfmon",
-               "pebs", "bts", NULL, "sync_rdtsc",
-               "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-               /* Intel-defined (#2) */
-               "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-               "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-               NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-               /* VIA/Cyrix/Centaur-defined */
-               NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
-               "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-               /* AMD-defined (#2) */
-               "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
-               "altmovcr8", "abm", "sse4a",
-               "misalignsse", "3dnowprefetch",
-               "osvw", "ibs", NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-               /* Auxiliary (Linux-defined) */
-               "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-               NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-       };
-       static char *x86_power_flags[] = { 
-               "ts",   /* temperature sensor */
-               "fid",  /* frequency id control */
-               "vid",  /* voltage id control */
-               "ttp",  /* thermal trip */
-               "tm",
-               "stc",
-               "100mhzsteps",
-               "hwpstate",
-               "",     /* tsc invariant mapped to constant_tsc */
-               /* nothing */
-       };
-
-
-#ifdef CONFIG_SMP
-       if (!cpu_online(c-cpu_data))
-               return 0;
-#endif
-
-       seq_printf(m,"processor\t: %u\n"
-                    "vendor_id\t: %s\n"
-                    "cpu family\t: %d\n"
-                    "model\t\t: %d\n"
-                    "model name\t: %s\n",
-                    (unsigned)(c-cpu_data),
-                    c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
-                    c->x86,
-                    (int)c->x86_model,
-                    c->x86_model_id[0] ? c->x86_model_id : "unknown");
-       
-       if (c->x86_mask || c->cpuid_level >= 0)
-               seq_printf(m, "stepping\t: %d\n", c->x86_mask);
-       else
-               seq_printf(m, "stepping\t: unknown\n");
-       
-       if (cpu_has(c,X86_FEATURE_TSC)) {
-               unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
-               if (!freq)
-                       freq = cpu_khz;
-               seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
-                            freq / 1000, (freq % 1000));
-       }
-
-       /* Cache size */
-       if (c->x86_cache_size >= 0) 
-               seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
-       
-#ifdef CONFIG_SMP
-       if (smp_num_siblings * c->x86_max_cores > 1) {
-               int cpu = c - cpu_data;
-               seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
-               seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
-               seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
-               seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
-       }
-#endif 
-
-       seq_printf(m,
-               "fpu\t\t: yes\n"
-               "fpu_exception\t: yes\n"
-               "cpuid level\t: %d\n"
-               "wp\t\t: yes\n"
-               "flags\t\t:",
-                  c->cpuid_level);
-
-       { 
-               int i; 
-               for ( i = 0 ; i < 32*NCAPINTS ; i++ )
-                       if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
-                               seq_printf(m, " %s", x86_cap_flags[i]);
-       }
-               
-       seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
-                  c->loops_per_jiffy/(500000/HZ),
-                  (c->loops_per_jiffy/(5000/HZ)) % 100);
-
-       if (c->x86_tlbsize > 0) 
-               seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
-       seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
-       seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
-
-       seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 
-                  c->x86_phys_bits, c->x86_virt_bits);
-
-       seq_printf(m, "power management:");
-       {
-               unsigned i;
-               for (i = 0; i < 32; i++) 
-                       if (c->x86_power & (1 << i)) {
-                               if (i < ARRAY_SIZE(x86_power_flags) &&
-                                       x86_power_flags[i])
-                                       seq_printf(m, "%s%s",
-                                               x86_power_flags[i][0]?" ":"",
-                                               x86_power_flags[i]);
-                               else
-                                       seq_printf(m, " [%d]", i);
-                       }
-       }
-
-       seq_printf(m, "\n\n");
-
-       return 0;
-}
-
-static void *c_start(struct seq_file *m, loff_t *pos)
-{
-       return *pos < NR_CPUS ? cpu_data + *pos : NULL;
-}
-
-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
-{
-       ++*pos;
-       return c_start(m, pos);
-}
-
-static void c_stop(struct seq_file *m, void *v)
-{
-}
-
-struct seq_operations cpuinfo_op = {
-       .start =c_start,
-       .next = c_next,
-       .stop = c_stop,
-       .show = show_cpuinfo,
-};
diff --git a/arch/x86_64/kernel/signal_64.c b/arch/x86_64/kernel/signal_64.c
deleted file mode 100644 (file)
index 739175b..0000000
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- *  linux/arch/x86_64/kernel/signal.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *  Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
- *
- *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
- *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
- *  2000-2002   x86-64 support by Andi Kleen
- */
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/errno.h>
-#include <linux/wait.h>
-#include <linux/ptrace.h>
-#include <linux/unistd.h>
-#include <linux/stddef.h>
-#include <linux/personality.h>
-#include <linux/compiler.h>
-#include <asm/ucontext.h>
-#include <asm/uaccess.h>
-#include <asm/i387.h>
-#include <asm/proto.h>
-#include <asm/ia32_unistd.h>
-#include <asm/mce.h>
-
-/* #define DEBUG_SIG 1 */
-
-#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-               sigset_t *set, struct pt_regs * regs); 
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
-            sigset_t *set, struct pt_regs * regs); 
-
-asmlinkage long
-sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
-               struct pt_regs *regs)
-{
-       return do_sigaltstack(uss, uoss, regs->rsp);
-}
-
-
-/*
- * Do a signal return; undo the signal stack.
- */
-
-struct rt_sigframe
-{
-       char __user *pretcode;
-       struct ucontext uc;
-       struct siginfo info;
-};
-
-static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
-{
-       unsigned int err = 0;
-
-       /* Always make any pending restarted system calls return -EINTR */
-       current_thread_info()->restart_block.fn = do_no_restart_syscall;
-
-#define COPY(x)                err |= __get_user(regs->x, &sc->x)
-
-       COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
-       COPY(rdx); COPY(rcx); COPY(rip);
-       COPY(r8);
-       COPY(r9);
-       COPY(r10);
-       COPY(r11);
-       COPY(r12);
-       COPY(r13);
-       COPY(r14);
-       COPY(r15);
-
-       /* Kernel saves and restores only the CS segment register on signals,
-        * which is the bare minimum needed to allow mixed 32/64-bit code.
-        * App's signal handler can save/restore other segments if needed. */
-       {
-               unsigned cs;
-               err |= __get_user(cs, &sc->cs);
-               regs->cs = cs | 3;      /* Force into user mode */
-       }
-
-       {
-               unsigned int tmpflags;
-               err |= __get_user(tmpflags, &sc->eflags);
-               regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
-               regs->orig_rax = -1;            /* disable syscall checks */
-       }
-
-       {
-               struct _fpstate __user * buf;
-               err |= __get_user(buf, &sc->fpstate);
-
-               if (buf) {
-                       if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-                               goto badframe;
-                       err |= restore_i387(buf);
-               } else {
-                       struct task_struct *me = current;
-                       if (used_math()) {
-                               clear_fpu(me);
-                               clear_used_math();
-                       }
-               }
-       }
-
-       err |= __get_user(*prax, &sc->rax);
-       return err;
-
-badframe:
-       return 1;
-}
-
-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
-{
-       struct rt_sigframe __user *frame;
-       sigset_t set;
-       unsigned long eax;
-
-       frame = (struct rt_sigframe __user *)(regs->rsp - 8);
-       if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
-               goto badframe;
-       } 
-       if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { 
-               goto badframe;
-       } 
-
-       sigdelsetmask(&set, ~_BLOCKABLE);
-       spin_lock_irq(&current->sighand->siglock);
-       current->blocked = set;
-       recalc_sigpending();
-       spin_unlock_irq(&current->sighand->siglock);
-       
-       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
-               goto badframe;
-
-#ifdef DEBUG_SIG
-       printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax);
-#endif
-
-       if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT)
-               goto badframe;
-
-       return eax;
-
-badframe:
-       signal_fault(regs,frame,"sigreturn");
-       return 0;
-}      
-
-/*
- * Set up a signal frame.
- */
-
-static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
-{
-       int err = 0;
-
-       err |= __put_user(regs->cs, &sc->cs);
-       err |= __put_user(0, &sc->gs);
-       err |= __put_user(0, &sc->fs);
-
-       err |= __put_user(regs->rdi, &sc->rdi);
-       err |= __put_user(regs->rsi, &sc->rsi);
-       err |= __put_user(regs->rbp, &sc->rbp);
-       err |= __put_user(regs->rsp, &sc->rsp);
-       err |= __put_user(regs->rbx, &sc->rbx);
-       err |= __put_user(regs->rdx, &sc->rdx);
-       err |= __put_user(regs->rcx, &sc->rcx);
-       err |= __put_user(regs->rax, &sc->rax);
-       err |= __put_user(regs->r8, &sc->r8);
-       err |= __put_user(regs->r9, &sc->r9);
-       err |= __put_user(regs->r10, &sc->r10);
-       err |= __put_user(regs->r11, &sc->r11);
-       err |= __put_user(regs->r12, &sc->r12);
-       err |= __put_user(regs->r13, &sc->r13);
-       err |= __put_user(regs->r14, &sc->r14);
-       err |= __put_user(regs->r15, &sc->r15);
-       err |= __put_user(me->thread.trap_no, &sc->trapno);
-       err |= __put_user(me->thread.error_code, &sc->err);
-       err |= __put_user(regs->rip, &sc->rip);
-       err |= __put_user(regs->eflags, &sc->eflags);
-       err |= __put_user(mask, &sc->oldmask);
-       err |= __put_user(me->thread.cr2, &sc->cr2);
-
-       return err;
-}
-
-/*
- * Determine which stack to use..
- */
-
-static void __user *
-get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
-{
-       unsigned long rsp;
-
-       /* Default to using normal stack - redzone*/
-       rsp = regs->rsp - 128;
-
-       /* This is the X/Open sanctioned signal stack switching.  */
-       if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(rsp) == 0)
-                       rsp = current->sas_ss_sp + current->sas_ss_size;
-       }
-
-       return (void __user *)round_down(rsp - size, 16); 
-}
-
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-                          sigset_t *set, struct pt_regs * regs)
-{
-       struct rt_sigframe __user *frame;
-       struct _fpstate __user *fp = NULL; 
-       int err = 0;
-       struct task_struct *me = current;
-
-       if (used_math()) {
-               fp = get_stack(ka, regs, sizeof(struct _fpstate)); 
-               frame = (void __user *)round_down(
-                       (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
-
-               if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
-                       goto give_sigsegv;
-
-               if (save_i387(fp) < 0) 
-                       err |= -1; 
-       } else
-               frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
-
-       if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-               goto give_sigsegv;
-
-       if (ka->sa.sa_flags & SA_SIGINFO) { 
-               err |= copy_siginfo_to_user(&frame->info, info);
-               if (err)
-                       goto give_sigsegv;
-       }
-               
-       /* Create the ucontext.  */
-       err |= __put_user(0, &frame->uc.uc_flags);
-       err |= __put_user(0, &frame->uc.uc_link);
-       err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
-       err |= __put_user(sas_ss_flags(regs->rsp),
-                         &frame->uc.uc_stack.ss_flags);
-       err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
-       err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
-       err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
-       if (sizeof(*set) == 16) { 
-               __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
-               __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 
-       } else
-               err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
-
-       /* Set up to return from userspace.  If provided, use a stub
-          already in userspace.  */
-       /* x86-64 should always use SA_RESTORER. */
-       if (ka->sa.sa_flags & SA_RESTORER) {
-               err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
-       } else {
-               /* could use a vstub here */
-               goto give_sigsegv; 
-       }
-
-       if (err)
-               goto give_sigsegv;
-
-#ifdef DEBUG_SIG
-       printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
-#endif
-
-       /* Set up registers for signal handler */
-       regs->rdi = sig;
-       /* In case the signal handler was declared without prototypes */ 
-       regs->rax = 0;  
-
-       /* This also works for non SA_SIGINFO handlers because they expect the
-          next argument after the signal number on the stack. */
-       regs->rsi = (unsigned long)&frame->info; 
-       regs->rdx = (unsigned long)&frame->uc; 
-       regs->rip = (unsigned long) ka->sa.sa_handler;
-
-       regs->rsp = (unsigned long)frame;
-
-       /* Set up the CS register to run signal handlers in 64-bit mode,
-          even if the handler happens to be interrupting 32-bit code. */
-       regs->cs = __USER_CS;
-
-       /* This, by contrast, has nothing to do with segment registers -
-          see include/asm-x86_64/uaccess.h for details. */
-       set_fs(USER_DS);
-
-       regs->eflags &= ~TF_MASK;
-       if (test_thread_flag(TIF_SINGLESTEP))
-               ptrace_notify(SIGTRAP);
-#ifdef DEBUG_SIG
-       printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
-               current->comm, current->pid, frame, regs->rip, frame->pretcode);
-#endif
-
-       return 0;
-
-give_sigsegv:
-       force_sigsegv(sig, current);
-       return -EFAULT;
-}
-
-/*
- * OK, we're invoking a handler
- */    
-
-static int
-handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
-               sigset_t *oldset, struct pt_regs *regs)
-{      /* Fix up syscall-restart state, build the handler frame, then update current->blocked; returns the frame-setup result (0 on success). */
-       int ret;
-
-#ifdef DEBUG_SIG
-       printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
-               current->pid, sig,
-               regs->rip, regs->rsp, regs);
-#endif
-
-       /* Are we from a system call? */
-       if ((long)regs->orig_rax >= 0) {
-               /* If so, check system call restarting.. */
-               switch (regs->rax) {
-                       case -ERESTART_RESTARTBLOCK:
-                       case -ERESTARTNOHAND:
-                               regs->rax = -EINTR;     /* a handler runs, so these two are never restarted */
-                               break;
-
-                       case -ERESTARTSYS:
-                               if (!(ka->sa.sa_flags & SA_RESTART)) {  /* restart only if the handler asked for SA_RESTART */
-                                       regs->rax = -EINTR;
-                                       break;
-                               }
-                               /* fallthrough */
-                       case -ERESTARTNOINTR:
-                               regs->rax = regs->orig_rax;     /* put the original syscall number back */
-                               regs->rip -= 2;         /* back up rip by 2 -- presumably the size of the syscall instruction (TODO confirm) */
-                               break;
-               }
-       }
-
-       /*
-        * If TF is set due to a debugger (PT_DTRACE), clear the TF
-        * flag so that register information in the sigcontext is
-        * correct.
-        */
-       if (unlikely(regs->eflags & TF_MASK)) {
-               if (likely(current->ptrace & PT_DTRACE)) {
-                       current->ptrace &= ~PT_DTRACE;
-                       regs->eflags &= ~TF_MASK;
-               }
-       }
-
-#ifdef CONFIG_IA32_EMULATION
-       if (test_thread_flag(TIF_IA32)) {       /* TIF_IA32 task: use the ia32 frame builders */
-               if (ka->sa.sa_flags & SA_SIGINFO)
-                       ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
-               else
-                       ret = ia32_setup_frame(sig, ka, oldset, regs);
-       } else 
-#endif
-       ret = setup_rt_frame(sig, ka, info, oldset, regs);      /* the only path when CONFIG_IA32_EMULATION is off */
-
-       if (ret == 0) { /* frame built: block sa_mask, plus sig itself unless SA_NODEFER */
-               spin_lock_irq(&current->sighand->siglock);
-               sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
-               if (!(ka->sa.sa_flags & SA_NODEFER))
-                       sigaddset(&current->blocked,sig);
-               recalc_sigpending();
-               spin_unlock_irq(&current->sighand->siglock);
-       }
-
-       return ret;
-}
-
-/*
- * Note that 'init' is a special process: it doesn't get signals it doesn't
- * want to handle. Thus you cannot kill init even with a SIGKILL even by
- * mistake.
- */
-static void do_signal(struct pt_regs *regs)
-{      /* Deliver one pending signal if there is one; otherwise restart an interrupted syscall and restore the saved sigmask. */
-       struct k_sigaction ka;
-       siginfo_t info;
-       int signr;
-       sigset_t *oldset;
-
-       /*
-        * We want the common case to go fast, which
-        * is why we may in certain cases get here from
-        * kernel mode. Just return without doing anything
-        * if so.
-        */
-       if (!user_mode(regs))
-               return;
-
-       if (test_thread_flag(TIF_RESTORE_SIGMASK))
-               oldset = &current->saved_sigmask;       /* the mask handed to the frame setup is the saved one */
-       else
-               oldset = &current->blocked;
-
-       signr = get_signal_to_deliver(&info, &ka, regs, NULL);
-       if (signr > 0) {        /* a signal with a handler to run was dequeued */
-               /* Reenable any watchpoints before delivering the
-                * signal to user space. The processor register will
-                * have been cleared if the watchpoint triggered
-                * inside the kernel.
-                */
-               if (current->thread.debugreg7)
-                       set_debugreg(current->thread.debugreg7, 7);
-
-               /* Whee!  Actually deliver the signal.  */
-               if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
-                       /* a signal was successfully delivered; the saved
-                        * sigmask will have been stored in the signal frame,
-                        * and will be restored by sigreturn, so we can simply
-                        * clear the TIF_RESTORE_SIGMASK flag */
-                       clear_thread_flag(TIF_RESTORE_SIGMASK);
-               }
-               return; /* only one signal is handled per call */
-       }
-
-       /* Did we come from a system call? */
-       if ((long)regs->orig_rax >= 0) {
-               /* Restart the system call - no handlers present */
-               long res = regs->rax;
-               switch (res) {
-               case -ERESTARTNOHAND:
-               case -ERESTARTSYS:
-               case -ERESTARTNOINTR:
-                       regs->rax = regs->orig_rax;     /* re-issue the same syscall number */
-                       regs->rip -= 2;
-                       break;
-               case -ERESTART_RESTARTBLOCK:
-                       regs->rax = test_thread_flag(TIF_IA32) ?        /* restart via restart_syscall for the task's ABI */
-                                       __NR_ia32_restart_syscall :
-                                       __NR_restart_syscall;
-                       regs->rip -= 2;
-                       break;
-               }
-       }
-
-       /* if there's no signal to deliver, we just put the saved sigmask
-          back. */
-       if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
-               clear_thread_flag(TIF_RESTORE_SIGMASK);
-               sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
-       }
-}
-
-void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{      /* Act on the work bits in thread_info_flags: single-step, MCE notification, signal delivery. 'unused' is never read. */
-#ifdef DEBUG_SIG
-       printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n",
-              thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); 
-#endif
-              
-       /* Pending single-step? */
-       if (thread_info_flags & _TIF_SINGLESTEP) {
-               regs->eflags |= TF_MASK;        /* turn the pending request into the TF bit of the saved eflags */
-               clear_thread_flag(TIF_SINGLESTEP);
-       }
-
-#ifdef CONFIG_X86_MCE
-       /* notify userspace of pending MCEs */
-       if (thread_info_flags & _TIF_MCE_NOTIFY)
-               mce_notify_user();
-#endif /* CONFIG_X86_MCE */
-
-       /* deal with pending signal delivery */
-       if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) /* do_signal() also restores a saved sigmask */
-               do_signal(regs);
-}
-
-void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
-{      /* Report a bad signal frame ('where' names the caller's context in the message) and force SIGSEGV on current. */
-       struct task_struct *me = current; 
-       if (show_unhandled_signals && printk_ratelimit())       /* log only when enabled, and rate-limited */
-               printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
-              me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); 
-
-       force_sig(SIGSEGV, me);         /* sent whether or not the message was printed */
-} 
diff --git a/arch/x86_64/kernel/smp_64.c b/arch/x86_64/kernel/smp_64.c
deleted file mode 100644 (file)
index df4a828..0000000
+++ /dev/null
@@ -1,523 +0,0 @@
-/*
- *     Intel SMP support routines.
- *
- *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- *     (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
- *      (c) 2002,2003 Andi Kleen, SuSE Labs.
- *
- *     This code is released under the GNU General Public License version 2 or
- *     later.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/interrupt.h>
-
-#include <asm/mtrr.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <asm/mach_apic.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-#include <asm/apicdef.h>
-#include <asm/idle.h>
-
-/*
- *     Smarter SMP flushing macros. 
- *             c/o Linus Torvalds.
- *
- *     These mean you can really definitely utterly forget about
- *     writing to user space from interrupts. (It's not allowed anyway).
- *
- *     Optimizations Manfred Spraul <manfred@colorfullife.com>
- *
- *     More scalable flush, from Andi Kleen
- *
- *     To avoid global state use 8 different call vectors.
- *     Each CPU uses a specific vector to trigger flushes on other
- *     CPUs. Depending on the received vector the target CPUs look into
- *     the right per cpu variable for the flush data.
- *
- *     With more than 8 CPUs they are hashed to the 8 available
- *     vectors. The limited global vector space forces us to this right now.
- *     In future when interrupts are split into per CPU domains this could be
- *     fixed, at the cost of triggering multiple IPIs in some cases.
- */
-
-union smp_flush_state {        /* one TLB-flush request slot; the union with pad[] sizes it to SMP_CACHE_BYTES */
-       struct {
-               cpumask_t flush_cpumask;        /* CPUs that still have to handle the request; each clears its own bit */
-               struct mm_struct *flush_mm;     /* mm the request applies to */
-               unsigned long flush_va;         /* single address to flush, or FLUSH_ALL */
-#define FLUSH_ALL      -1ULL
-               spinlock_t tlbstate_lock;       /* held by the sender for the whole request */
-       };
-       char pad[SMP_CACHE_BYTES];
-} ____cacheline_aligned;
-
-/* State is put into the per CPU data section, but padded
-   to a full cache line because other CPUs can access it and we don't
-   want false sharing in the per cpu data segment. */
-static DEFINE_PER_CPU(union smp_flush_state, flush_state);     /* indexed by the sender slot, not by the receiving CPU */
-
-/*
- * We cannot call mmdrop() because we are in interrupt context, 
- * instead update mm->cpu_vm_mask.
- */
-static inline void leave_mm(int cpu)
-{      /* Drop 'cpu' from the PDA active_mm's cpu_vm_mask and load swapper_pg_dir into cr3. */
-       if (read_pda(mmu_state) == TLBSTATE_OK)
-               BUG();          /* callers must not be in TLBSTATE_OK */
-       cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
-       load_cr3(swapper_pg_dir);
-}
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- *     Stop ipi delivery for the old mm. This is not synchronized with
- *     the other cpus, but smp_invalidate_interrupt ignores flush ipis
- *     for the wrong mm, and in the worst case we perform a superfluous
- *     tlb flush.
- * 1a2) set cpu mmu_state to TLBSTATE_OK
- *     Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- *     was in lazy tlb mode.
- * 1a3) update cpu active_mm
- *     Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- *     Now the other cpus will send tlb flush ipis.
- * 1a5) change cr3.
- * 1b) thread switch without mm change
- *     cpu active_mm is correct, cpu0 already handles
- *     flush ipis.
- * 1b1) set cpu mmu_state to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- *     Atomically set the bit [other cpus will start sending flush ipis],
- *     and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- *   runs in kernel space, the cpu could load tlb entries for user space
- *   pages.
- *
- * The good news is that cpu mmu_state is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- *
- * Interrupts are disabled.
- */
-
-asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
-{      /* Handler for the TLB-flush IPI vectors: services the sender's request slot, then clears this CPU's bit in it. */
-       int cpu;
-       int sender;
-       union smp_flush_state *f;
-
-       cpu = smp_processor_id();
-       /*
-        * orig_rax contains the negated interrupt vector.
-        * Use that to determine where the sender put the data.
-        */
-       sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
-       f = &per_cpu(flush_state, sender);
-
-       if (!cpu_isset(cpu, f->flush_cpumask))  /* this CPU is not a target of the request */
-               goto out;
-               /* 
-                * This was a BUG() but until someone can quote me the
-                * line from the intel manual that guarantees an IPI to
-                * multiple CPUs is retried _only_ on the erroring CPUs
-                * it's staying as a return
-                *
-                * BUG();
-                */
-                
-       if (f->flush_mm == read_pda(active_mm)) {       /* requests for any other mm are ignored */
-               if (read_pda(mmu_state) == TLBSTATE_OK) {
-                       if (f->flush_va == FLUSH_ALL)
-                               local_flush_tlb();
-                       else
-                               __flush_tlb_one(f->flush_va);
-               } else
-                       leave_mm(cpu);  /* not TLBSTATE_OK: leave the mm instead of flushing */
-       }
-out:
-       ack_APIC_irq();
-       cpu_clear(cpu, f->flush_cpumask);       /* flush_tlb_others() spins until this mask is empty */
-}
-
-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
-                                               unsigned long va)
-{      /* Ask the CPUs in 'cpumask' to flush 'va' (or FLUSH_ALL) for 'mm', and spin until all have done so. */
-       int sender;
-       union smp_flush_state *f;
-
-       /* Caller has disabled preemption */
-       sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;       /* hash this CPU onto one request slot/vector */
-       f = &per_cpu(flush_state, sender);
-
-       /* Could avoid this lock when
-          num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
-          probably not worth checking this for a cache-hot lock. */
-       spin_lock(&f->tlbstate_lock);
-
-       f->flush_mm = mm;
-       f->flush_va = va;
-       cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);   /* publish the targets after mm/va are filled in */
-
-       /*
-        * We have to send the IPI only to
-        * CPUs affected.
-        */
-       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
-
-       while (!cpus_empty(f->flush_cpumask))   /* each target clears its own bit in the handler */
-               cpu_relax();
-
-       f->flush_mm = NULL;     /* reset the slot before releasing it */
-       f->flush_va = 0;
-       spin_unlock(&f->tlbstate_lock);
-}
-
-int __cpuinit init_smp_flush(void)
-{      /* Initialise tlbstate_lock in the flush_state slot of every possible CPU; always returns 0. */
-       int i;
-       for_each_cpu_mask(i, cpu_possible_map) {
-               spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
-       }
-       return 0;
-}
-
-core_initcall(init_smp_flush); /* registered as a core initcall */
-       
-void flush_tlb_current_task(void)
-{      /* Flush the local TLB, then request a full flush on every other CPU in current->mm's cpu_vm_mask. */
-       struct mm_struct *mm = current->mm;
-       cpumask_t cpu_mask;
-
-       preempt_disable();      /* stay on this CPU while building the mask and sending */
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);        /* this CPU is flushed directly below */
-
-       local_flush_tlb();
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
-       preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_current_task);
-
-void flush_tlb_mm (struct mm_struct * mm)
-{      /* Flush all TLB entries for 'mm' on this CPU (if it is the active mm) and on the other CPUs in its cpu_vm_mask. */
-       cpumask_t cpu_mask;
-
-       preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);        /* remote targets only; local case handled below */
-
-       if (current->active_mm == mm) {
-               if (current->mm)
-                       local_flush_tlb();
-               else
-                       leave_mm(smp_processor_id());   /* no current->mm: leave the borrowed mm rather than flush */
-       }
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
-
-       preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_mm);
-
-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
-{      /* Flush the single address 'va' of vma->vm_mm on this CPU (if that mm is active) and on the other CPUs using it. */
-       struct mm_struct *mm = vma->vm_mm;
-       cpumask_t cpu_mask;
-
-       preempt_disable();
-       cpu_mask = mm->cpu_vm_mask;
-       cpu_clear(smp_processor_id(), cpu_mask);        /* remote targets only; local case handled below */
-
-       if (current->active_mm == mm) {
-               if(current->mm)
-                       __flush_tlb_one(va);
-                else
-                       leave_mm(smp_processor_id());   /* no current->mm: leave the borrowed mm rather than flush */
-       }
-
-       if (!cpus_empty(cpu_mask))
-               flush_tlb_others(cpu_mask, mm, va);
-
-       preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-static void do_flush_tlb_all(void* info)
-{      /* Per-CPU callback used by flush_tlb_all(); 'info' is not used. */
-       unsigned long cpu = smp_processor_id();
-
-       __flush_tlb_all();
-       if (read_pda(mmu_state) == TLBSTATE_LAZY)       /* a lazy-TLB CPU also leaves its mm */
-               leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{      /* Run do_flush_tlb_all() through on_each_cpu(). */
-       on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
-}
-
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-
-void smp_send_reschedule(int cpu)
-{      /* Send RESCHEDULE_VECTOR to 'cpu' only; nothing is waited for. */
-       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
-}
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);     /* taken by the smp_call_function*() entry points around a request */
-
-struct call_data_struct {
-       void (*func) (void *info);      /* function each target CPU runs */
-       void *info;                     /* argument passed to func */
-       atomic_t started;               /* bumped by each target once it has read func/info/wait */
-       atomic_t finished;              /* bumped by each target after func returns, only if wait is set */
-       int wait;                       /* non-zero: the initiator also waits on 'finished' */
-};
-
-static struct call_data_struct * call_data;    /* current request; points at the initiator's on-stack struct */
-
-void lock_ipi_call_lock(void)
-{      /* Take call_lock with local interrupts disabled (spin_lock_irq). */
-       spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{      /* Release call_lock and re-enable local interrupts (spin_unlock_irq). */
-       spin_unlock_irq(&call_lock);
-}
-
-/*
- * this function sends a 'generic call function' IPI to one other CPU
- * in the system.
- *
- * cpu is a standard Linux logical CPU number.
- */
-static void
-__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-                               int nonatomic, int wait)
-{      /* Publish an on-stack request in call_data and IPI 'cpu'; 'nonatomic' is not used here. */
-       struct call_data_struct data;
-       int cpus = 1;   /* exactly one responder is expected */
-
-       data.func = func;
-       data.info = info;
-       atomic_set(&data.started, 0);
-       data.wait = wait;
-       if (wait)
-               atomic_set(&data.finished, 0);
-
-       call_data = &data;
-       wmb();  /* order the request stores before the IPI is sent */
-       /* Send the IPI to the target CPU and wait for it to respond */
-       send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
-
-       /* Wait for response */
-       while (atomic_read(&data.started) != cpus)      /* 'data' is on this stack, so it must outlive the target's reads */
-               cpu_relax();
-
-       if (!wait)
-               return;
-
-       while (atomic_read(&data.finished) != cpus)     /* wait until func has returned on the target */
-               cpu_relax();
-}
-
-/*
- * smp_call_function_single - Run a function on a specific CPU
- * @cpu: The logical CPU number to run the function on.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Currently unused.
- * @wait: If true, wait until function has completed on the target CPU.
- *
- * Returns 0; this implementation has no failure path.
- * Does not return until the remote CPU is nearly ready to execute <func>
- * or is or has executed.
- */
-
-int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
-       int nonatomic, int wait)
-{
-       /* prevent preemption and reschedule on another processor */
-       int me = get_cpu();
-
-       /* Can deadlock when called with interrupts disabled */
-       WARN_ON(irqs_disabled());
-
-       if (cpu == me) {        /* target is this CPU: call func directly with interrupts off */
-               local_irq_disable();
-               func(info);
-               local_irq_enable();
-               put_cpu();
-               return 0;
-       }
-
-       spin_lock(&call_lock);  /* one request at a time owns call_data */
-       __smp_call_function_single(cpu, func, info, nonatomic, wait);
-       spin_unlock(&call_lock);
-       put_cpu();
-       return 0;
-}
-EXPORT_SYMBOL(smp_call_function_single);
-
-/*
- * this function sends a 'generic call function' IPI to all other CPUs
- * in the system.
- */
-static void __smp_call_function (void (*func) (void *info), void *info,
-                               int nonatomic, int wait)
-{      /* Publish an on-stack request in call_data and IPI all other CPUs; 'nonatomic' is not used here. */
-       struct call_data_struct data;
-       int cpus = num_online_cpus()-1; /* number of responders expected: every online CPU but this one */
-
-       if (!cpus)      /* no other CPU online: nothing to do */
-               return;
-
-       data.func = func;
-       data.info = info;
-       atomic_set(&data.started, 0);
-       data.wait = wait;
-       if (wait)
-               atomic_set(&data.finished, 0);
-
-       call_data = &data;
-       wmb();  /* order the request stores before the IPI is sent */
-       /* Send a message to all other CPUs and wait for them to respond */
-       send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
-       /* Wait for response */
-       while (atomic_read(&data.started) != cpus)      /* 'data' is on this stack, so it must outlive the targets' reads */
-               cpu_relax();
-
-       if (!wait)
-               return;
-
-       while (atomic_read(&data.finished) != cpus)     /* wait until func has returned on every target */
-               cpu_relax();
-}
-
-/*
- * smp_call_function - run a function on all other CPUs.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: currently unused.
- * @wait: If true, wait (atomically) until function has completed on other
- *        CPUs.
- *
- * Returns 0 on success, else a negative status code. Does not return until
- * remote CPUs are nearly ready to execute func or are or have executed.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- * Actually there are a few legal cases, like panic.
- */
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-                       int wait)
-{      /* Locked wrapper around __smp_call_function(); always returns 0 here. */
-       spin_lock(&call_lock);  /* one request at a time owns call_data */
-       __smp_call_function(func,info,nonatomic,wait);
-       spin_unlock(&call_lock);
-       return 0;
-}
-EXPORT_SYMBOL(smp_call_function);
-
-static void stop_this_cpu(void *dummy)
-{      /* Callback used by smp_send_stop(): takes this CPU offline and halts forever; 'dummy' is not used. */
-       local_irq_disable();
-       /*
-        * Remove this CPU:
-        */
-       cpu_clear(smp_processor_id(), cpu_online_map);
-       disable_local_APIC();
-       for (;;)        /* never returns */
-               halt();
-} 
-
-void smp_send_stop(void)
-{      /* Make every other CPU run stop_this_cpu(), then disable this CPU's local APIC. Does nothing if reboot_force is set. */
-       int nolock;
-       unsigned long flags;
-
-       if (reboot_force)
-               return;
-
-       /* Don't deadlock on the call lock in panic */
-       nolock = !spin_trylock(&call_lock);     /* non-zero means the lock was NOT acquired here */
-       local_irq_save(flags);
-       __smp_call_function(stop_this_cpu, NULL, 0, 0); /* wait == 0: only waits for the targets to start */
-       if (!nolock)
-               spin_unlock(&call_lock);        /* unlock only if the trylock above succeeded */
-       disable_local_APIC();
-       local_irq_restore(flags);
-}
-
-/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
- */
-asmlinkage void smp_reschedule_interrupt(void)
-{      /* The handler only acknowledges the interrupt at the local APIC. */
-       ack_APIC_irq();
-}
-
-asmlinkage void smp_call_function_interrupt(void)
-{      /* Target-side handler for a call-function request: runs call_data->func and reports progress to the initiator. */
-       void (*func) (void *info) = call_data->func;    /* snapshot the request before signalling 'started' */
-       void *info = call_data->info;
-       int wait = call_data->wait;
-
-       ack_APIC_irq();
-       /*
-        * Notify initiating CPU that I've grabbed the data and am
-        * about to execute the function
-        */
-       mb();
-       atomic_inc(&call_data->started);
-       /*
-        * At this point the info structure may be out of scope unless wait==1
-        */
-       exit_idle();
-       irq_enter();
-       (*func)(info);  /* uses the local copies taken above */
-       irq_exit();
-       if (wait) {     /* with wait set the initiator is still spinning on 'finished' */
-               mb();
-               atomic_inc(&call_data->finished);
-       }
-}
-
diff --git a/arch/x86_64/kernel/smpboot_64.c b/arch/x86_64/kernel/smpboot_64.c
deleted file mode 100644 (file)
index 32f5078..0000000
+++ /dev/null
@@ -1,1085 +0,0 @@
-/*
- *     x86 SMP booting functions
- *
- *     (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *     Copyright 2001 Andi Kleen, SuSE Labs.
- *
- *     Much of the core SMP work is based on previous work by Thomas Radke, to
- *     whom a great many thanks are extended.
- *
- *     Thanks to Intel for making available several different Pentium,
- *     Pentium Pro and Pentium-II/Xeon MP machines.
- *     Original development of Linux SMP code supported by Caldera.
- *
- *     This code is released under the GNU General Public License version 2
- *
- *     Fixes
- *             Felix Koop      :       NR_CPUS used properly
- *             Jose Renau      :       Handle single CPU case.
- *             Alan Cox        :       By repeated request 8) - Total BogoMIP report.
- *             Greg Wright     :       Fix for kernel stacks panic.
- *             Erich Boleyn    :       MP v1.4 and additional changes.
- *     Matthias Sattler        :       Changes for 2.1 kernel map.
- *     Michel Lespinasse       :       Changes for 2.1 kernel map.
- *     Michael Chastain        :       Change trampoline.S to gnu as.
- *             Alan Cox        :       Dumb bug: 'B' step PPro's are fine
- *             Ingo Molnar     :       Added APIC timers, based on code
- *                                     from Jose Renau
- *             Ingo Molnar     :       various cleanups and rewrites
- *             Tigran Aivazian :       fixed "0.00 in /proc/uptime on SMP" bug.
- *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs
- *     Andi Kleen              :       Changed for SMP boot into long mode.
- *             Rusty Russell   :       Hacked into shape for new "hotplug" boot process.
- *      Andi Kleen              :       Converted to new state machine.
- *                                     Various cleanups.
- *                                     Probably mostly hotplug CPU ready now.
- *     Ashok Raj                       : CPU hotplug support
- */
-
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/kernel_stat.h>
-#include <linux/bootmem.h>
-#include <linux/thread_info.h>
-#include <linux/module.h>
-#include <linux/delay.h>
-#include <linux/mc146818rtc.h>
-#include <linux/smp.h>
-#include <linux/kdebug.h>
-
-#include <asm/mtrr.h>
-#include <asm/pgalloc.h>
-#include <asm/desc.h>
-#include <asm/tlbflush.h>
-#include <asm/proto.h>
-#include <asm/nmi.h>
-#include <asm/irq.h>
-#include <asm/hw_irq.h>
-#include <asm/numa.h>
-
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
-
-/* Last level cache ID of each logical CPU */
-u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};      /* every entry starts as BAD_APICID */
-
-/* Bitmask of currently online CPUs */
-cpumask_t cpu_online_map __read_mostly;
-
-EXPORT_SYMBOL(cpu_online_map);
-
-/*
- * Private maps to synchronize booting between AP and BP.
- * Probably not needed anymore, but it makes for easier debugging. -AK
- */
-cpumask_t cpu_callin_map;      /* an AP sets its own bit at the end of smp_callin() */
-cpumask_t cpu_callout_map;     /* an AP polls its own bit here in smp_callin() */
-EXPORT_SYMBOL(cpu_callout_map);
-
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-
-/* Per CPU bogomips and other parameters */
-struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;      /* indexed by logical CPU number */
-EXPORT_SYMBOL(cpu_data);
-
-/* Set when the idlers are all forked */
-int smp_threads_ready;
-
-/* representing HT siblings of each logical CPU */
-cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(cpu_sibling_map);
-
-/* representing HT and core siblings of each logical CPU */
-cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(cpu_core_map);
-
-/*
- * Trampoline 80x86 program as an array.
- */
-
-extern unsigned char trampoline_data[];        /* start of the trampoline code */
-extern unsigned char trampoline_end[];         /* end of the trampoline code; the difference is the copy length */
-
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
-/*
- * Store all idle threads, this can be reused instead of creating
- * a new thread. Also avoids complicated thread destroy functionality
- * for idle threads.
- */
-struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-
-#define get_idle_for_cpu(x)     (idle_thread_array[(x)])       /* read the stored idle task of CPU x */
-#define set_idle_for_cpu(x,p)   (idle_thread_array[(x)] = (p)) /* store p as the idle task of CPU x */
-
-/*
- * Currently trivial. Write the real->protected mode
- * bootstrap into the page concerned. The caller
- * has made sure it's suitably aligned.
- */
-
-static unsigned long __cpuinit setup_trampoline(void)
-{      /* Copy the trampoline code to SMP_TRAMPOLINE_BASE and return its physical address. */
-       void *tramp = __va(SMP_TRAMPOLINE_BASE); 
-       memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);       /* length is the distance between the two symbols */
-       return virt_to_phys(tramp);
-}
-
-/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
- */
-
-static void __cpuinit smp_store_cpu_info(int id)
-{      /* Fill cpu_data[id]: start from a copy of boot_cpu_data, then identify and print this CPU. */
-       struct cpuinfo_x86 *c = cpu_data + id;
-
-       *c = boot_cpu_data;     /* seed with the boot CPU's data before identify_cpu() runs on it */
-       identify_cpu(c);
-       print_cpu_info(c);
-}
-
-static atomic_t init_deasserted __cpuinitdata; /* smp_callin() spins until this is non-zero; the writer is outside this chunk */
-
-/*
- * Report back to the Boot Processor.
- * Running on AP.
- */
-void __cpuinit smp_callin(void)
-{
-       int cpuid, phys_id;
-       unsigned long timeout;
-
-       /*
-        * If woken up by an INIT in an 82489DX configuration
-        * we may get here before an INIT-deassert IPI reaches
-        * our local APIC.  We have to wait for the IPI or we'll
-        * lock up on an APIC access.
-        */
-       while (!atomic_read(&init_deasserted))
-               cpu_relax();
-
-       /*
-        * (This works even if the APIC is not enabled.)
-        */
-       phys_id = GET_APIC_ID(apic_read(APIC_ID));
-       cpuid = smp_processor_id();
-       if (cpu_isset(cpuid, cpu_callin_map)) { /* this logical CPU has already called in once */
-               panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
-                                       phys_id, cpuid);
-       }
-       Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
-
-       /*
-        * STARTUP IPIs are fragile beasts as they might sometimes
-        * trigger some glue motherboard logic. Complete APIC bus
-        * silence for 1 second, this overestimates the time the
-        * boot CPU is spending to send the up to 2 STARTUP IPIs
-        * by a factor of two. This should be enough.
-        */
-
-       /*
-        * Waiting 2s total for startup (udelay is not yet working)
-        */
-       timeout = jiffies + 2*HZ;
-       while (time_before(jiffies, timeout)) {
-               /*
-                * Has the boot CPU finished its STARTUP sequence?
-                */
-               if (cpu_isset(cpuid, cpu_callout_map))
-                       break;
-               cpu_relax();
-       }
-
-       if (!time_before(jiffies, timeout)) {   /* tests the deadline only, not the callout bit */
-               panic("smp_callin: CPU%d started up but did not get a callout!\n",
-                       cpuid);
-       }
-
-       /*
-        * the boot CPU has finished the init stage and is spinning
-        * on callin_map until we finish. We are free to set up this
-        * CPU, first the APIC. (this is probably redundant on most
-        * boards)
-        */
-
-       Dprintk("CALLIN, before setup_local_APIC().\n");
-       setup_local_APIC();
-
-       /*
-        * Get our bogomips.
-        *
-        * Need to enable IRQs because it can take longer and then
-        * the NMI watchdog might kill us.
-        */
-       local_irq_enable();
-       calibrate_delay();
-       local_irq_disable();
-       Dprintk("Stack at about %p\n",&cpuid);  /* address of a local as a rough stack pointer */
-
-       disable_APIC_timer();
-
-       /*
-        * Save our processor parameters
-        */
-       smp_store_cpu_info(cpuid);
-
-       /*
-        * Allow the master to continue.
-        */
-       cpu_set(cpuid, cpu_callin_map);
-}
-
-/* maps the cpu to the sched domain representing multi-core */
-cpumask_t cpu_coregroup_map(int cpu)
-{      /* Returned by value: cpu_core_map[cpu] or cpu_data[cpu].llc_shared_map, chosen as described below. */
-       struct cpuinfo_x86 *c = cpu_data + cpu;
-       /*
-        * For perf, we return last level cache shared map.
-        * And for power savings, we return cpu_core_map
-        */
-       if (sched_mc_power_savings || sched_smt_power_savings) /* either power-savings setting selects the core map */
-               return cpu_core_map[cpu];
-       else
-               return c->llc_shared_map;
-}
-
-/* representing cpus for which sibling maps can be computed */
-static cpumask_t cpu_sibling_setup_map;
-
-static inline void set_cpu_sibling_map(int cpu)
-{      /* Add 'cpu' to the sibling, core and LLC-shared maps, symmetrically with every CPU already set up. */
-       int i;
-       struct cpuinfo_x86 *c = cpu_data;       /* indexed by logical CPU number below */
-
-       cpu_set(cpu, cpu_sibling_setup_map);    /* set first, so the loops below also visit 'cpu' itself */
-
-       if (smp_num_siblings > 1) {
-               for_each_cpu_mask(i, cpu_sibling_setup_map) {
-                       if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
-                           c[cpu].cpu_core_id == c[i].cpu_core_id) {   /* same package and same core: thread siblings */
-                               cpu_set(i, cpu_sibling_map[cpu]);
-                               cpu_set(cpu, cpu_sibling_map[i]);
-                               cpu_set(i, cpu_core_map[cpu]);
-                               cpu_set(cpu, cpu_core_map[i]);
-                               cpu_set(i, c[cpu].llc_shared_map);
-                               cpu_set(cpu, c[i].llc_shared_map);
-                       }
-               }
-       } else {
-               cpu_set(cpu, cpu_sibling_map[cpu]);     /* no siblings: the map holds only the CPU itself */
-       }
-
-       cpu_set(cpu, c[cpu].llc_shared_map);
-
-       if (current_cpu_data.x86_max_cores == 1) {      /* single-core package: core map equals sibling map */
-               cpu_core_map[cpu] = cpu_sibling_map[cpu];
-               c[cpu].booted_cores = 1;
-               return;
-       }
-
-       for_each_cpu_mask(i, cpu_sibling_setup_map) {
-               if (cpu_llc_id[cpu] != BAD_APICID &&
-                   cpu_llc_id[cpu] == cpu_llc_id[i]) { /* same valid last-level-cache ID */
-                       cpu_set(i, c[cpu].llc_shared_map);
-                       cpu_set(cpu, c[i].llc_shared_map);
-               }
-               if (c[cpu].phys_proc_id == c[i].phys_proc_id) { /* same physical package */
-                       cpu_set(i, cpu_core_map[cpu]);
-                       cpu_set(cpu, cpu_core_map[i]);
-                       /*
-                        *  Does this new cpu bringup a new core?
-                        */
-                       if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
-                               /*
-                                * for each core in package, increment
-                                * the booted_cores for this new cpu
-                                */
-                               if (first_cpu(cpu_sibling_map[i]) == i) /* count each core once, via its first sibling */
-                                       c[cpu].booted_cores++;
-                               /*
-                                * increment the core count for all
-                                * the other cpus in this package
-                                */
-                               if (i != cpu)
-                                       c[i].booted_cores++;
-                       } else if (i != cpu && !c[cpu].booted_cores)
-                               c[cpu].booted_cores = c[i].booted_cores;        /* core already counted: copy the package's count */
-               }
-       }
-}
-
-/*
- * Setup code on secondary processor (after comming out of the trampoline)
- */
-void __cpuinit start_secondary(void)
-{
-       /*
-        * Dont put anything before smp_callin(), SMP
-        * booting is too fragile that we want to limit the
-        * things done here to the most necessary things.
-        */
-       cpu_init();
-       preempt_disable();
-       smp_callin();
-
-       /* otherwise gcc will move up the smp_processor_id before the cpu_init */
-       barrier();
-
-       /*
-        * Check TSC sync first:
-        */
-       check_tsc_sync_target();
-
-       Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());         
-       setup_secondary_APIC_clock();
-
-       Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());
-
-       if (nmi_watchdog == NMI_IO_APIC) {
-               disable_8259A_irq(0);
-               enable_NMI_through_LVT0(NULL);
-               enable_8259A_irq(0);
-       }
-
-       enable_APIC_timer();
-
-       /*
-        * The sibling maps must be set before turing the online map on for
-        * this cpu
-        */
-       set_cpu_sibling_map(smp_processor_id());
-
-       /*
-        * We need to hold call_lock, so there is no inconsistency
-        * between the time smp_call_function() determines number of
-        * IPI receipients, and the time when the determination is made
-        * for which cpus receive the IPI in genapic_flat.c. Holding this
-        * lock helps us to not include this cpu in a currently in progress
-        * smp_call_function().
-        */
-       lock_ipi_call_lock();
-       spin_lock(&vector_lock);
-
-       /* Setup the per cpu irq handling data structures */
-       __setup_vector_irq(smp_processor_id());
-       /*
-        * Allow the master to continue.
-        */
-       cpu_set(smp_processor_id(), cpu_online_map);
-       per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
-       spin_unlock(&vector_lock);
-
-       unlock_ipi_call_lock();
-
-       cpu_idle();
-}
-
-extern volatile unsigned long init_rsp;
-extern void (*initial_code)(void);
-
-#ifdef APIC_DEBUG
-static void inquire_remote_apic(int apicid)
-{
-       unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
-       char *names[] = { "ID", "VERSION", "SPIV" };
-       int timeout;
-       unsigned int status;
-
-       printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
-
-       for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
-               printk("... APIC #%d %s: ", apicid, names[i]);
-
-               /*
-                * Wait for idle.
-                */
-               status = safe_apic_wait_icr_idle();
-               if (status)
-                       printk("a previous APIC delivery may have failed\n");
-
-               apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
-               apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
-
-               timeout = 0;
-               do {
-                       udelay(100);
-                       status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
-               } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
-
-               switch (status) {
-               case APIC_ICR_RR_VALID:
-                       status = apic_read(APIC_RRR);
-                       printk("%08x\n", status);
-                       break;
-               default:
-                       printk("failed\n");
-               }
-       }
-}
-#endif
-
-/*
- * Kick the secondary to wake up.
- */
-static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
-{
-       unsigned long send_status, accept_status = 0;
-       int maxlvt, num_starts, j;
-
-       Dprintk("Asserting INIT.\n");
-
-       /*
-        * Turn INIT on target chip
-        */
-       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
-       /*
-        * Send IPI
-        */
-       apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
-                               | APIC_DM_INIT);
-
-       Dprintk("Waiting for send to finish...\n");
-       send_status = safe_apic_wait_icr_idle();
-
-       mdelay(10);
-
-       Dprintk("Deasserting INIT.\n");
-
-       /* Target chip */
-       apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
-       /* Send IPI */
-       apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
-
-       Dprintk("Waiting for send to finish...\n");
-       send_status = safe_apic_wait_icr_idle();
-
-       mb();
-       atomic_set(&init_deasserted, 1);
-
-       num_starts = 2;
-
-       /*
-        * Run STARTUP IPI loop.
-        */
-       Dprintk("#startup loops: %d.\n", num_starts);
-
-       maxlvt = get_maxlvt();
-
-       for (j = 1; j <= num_starts; j++) {
-               Dprintk("Sending STARTUP #%d.\n",j);
-               apic_write(APIC_ESR, 0);
-               apic_read(APIC_ESR);
-               Dprintk("After apic_write.\n");
-
-               /*
-                * STARTUP IPI
-                */
-
-               /* Target chip */
-               apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
-               /* Boot on the stack */
-               /* Kick the second */
-               apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12));
-
-               /*
-                * Give the other CPU some time to accept the IPI.
-                */
-               udelay(300);
-
-               Dprintk("Startup point 1.\n");
-
-               Dprintk("Waiting for send to finish...\n");
-               send_status = safe_apic_wait_icr_idle();
-
-               /*
-                * Give the other CPU some time to accept the IPI.
-                */
-               udelay(200);
-               /*
-                * Due to the Pentium erratum 3AP.
-                */
-               if (maxlvt > 3) {
-                       apic_write(APIC_ESR, 0);
-               }
-               accept_status = (apic_read(APIC_ESR) & 0xEF);
-               if (send_status || accept_status)
-                       break;
-       }
-       Dprintk("After Startup.\n");
-
-       if (send_status)
-               printk(KERN_ERR "APIC never delivered???\n");
-       if (accept_status)
-               printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
-
-       return (send_status | accept_status);
-}
-
-struct create_idle {
-       struct work_struct work;
-       struct task_struct *idle;
-       struct completion done;
-       int cpu;
-};
-
-void do_fork_idle(struct work_struct *work)
-{
-       struct create_idle *c_idle =
-               container_of(work, struct create_idle, work);
-
-       c_idle->idle = fork_idle(c_idle->cpu);
-       complete(&c_idle->done);
-}
-
-/*
- * Boot one CPU.
- */
-static int __cpuinit do_boot_cpu(int cpu, int apicid)
-{
-       unsigned long boot_error;
-       int timeout;
-       unsigned long start_rip;
-       struct create_idle c_idle = {
-               .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
-               .cpu = cpu,
-               .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
-       };
-
-       /* allocate memory for gdts of secondary cpus. Hotplug is considered */
-       if (!cpu_gdt_descr[cpu].address &&
-               !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
-               printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
-               return -1;
-       }
-
-       /* Allocate node local memory for AP pdas */
-       if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
-               struct x8664_pda *newpda, *pda;
-               int node = cpu_to_node(cpu);
-               pda = cpu_pda(cpu);
-               newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
-                                     node);
-               if (newpda) {
-                       memcpy(newpda, pda, sizeof (struct x8664_pda));
-                       cpu_pda(cpu) = newpda;
-               } else
-                       printk(KERN_ERR
-               "Could not allocate node local PDA for CPU %d on node %d\n",
-                               cpu, node);
-       }
-
-       alternatives_smp_switch(1);
-
-       c_idle.idle = get_idle_for_cpu(cpu);
-
-       if (c_idle.idle) {
-               c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
-                       (THREAD_SIZE +  task_stack_page(c_idle.idle))) - 1);
-               init_idle(c_idle.idle, cpu);
-               goto do_rest;
-       }
-
-       /*
-        * During cold boot process, keventd thread is not spun up yet.
-        * When we do cpu hot-add, we create idle threads on the fly, we should
-        * not acquire any attributes from the calling context. Hence the clean
-        * way to create kernel_threads() is to do that from keventd().
-        * We do the current_is_keventd() due to the fact that ACPI notifier
-        * was also queuing to keventd() and when the caller is already running
-        * in context of keventd(), we would end up with locking up the keventd
-        * thread.
-        */
-       if (!keventd_up() || current_is_keventd())
-               c_idle.work.func(&c_idle.work);
-       else {
-               schedule_work(&c_idle.work);
-               wait_for_completion(&c_idle.done);
-       }
-
-       if (IS_ERR(c_idle.idle)) {
-               printk("failed fork for CPU %d\n", cpu);
-               return PTR_ERR(c_idle.idle);
-       }
-
-       set_idle_for_cpu(cpu, c_idle.idle);
-
-do_rest:
-
-       cpu_pda(cpu)->pcurrent = c_idle.idle;
-
-       start_rip = setup_trampoline();
-
-       init_rsp = c_idle.idle->thread.rsp;
-       per_cpu(init_tss,cpu).rsp0 = init_rsp;
-       initial_code = start_secondary;
-       clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
-
-       printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
-               cpus_weight(cpu_present_map),
-               apicid);
-
-       /*
-        * This grunge runs the startup process for
-        * the targeted processor.
-        */
-
-       atomic_set(&init_deasserted, 0);
-
-       Dprintk("Setting warm reset code and vector.\n");
-
-       CMOS_WRITE(0xa, 0xf);
-       local_flush_tlb();
-       Dprintk("1.\n");
-       *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
-       Dprintk("2.\n");
-       *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
-       Dprintk("3.\n");
-
-       /*
-        * Be paranoid about clearing APIC errors.
-        */
-       apic_write(APIC_ESR, 0);
-       apic_read(APIC_ESR);
-
-       /*
-        * Status is now clean
-        */
-       boot_error = 0;
-
-       /*
-        * Starting actual IPI sequence...
-        */
-       boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
-
-       if (!boot_error) {
-               /*
-                * allow APs to start initializing.
-                */
-               Dprintk("Before Callout %d.\n", cpu);
-               cpu_set(cpu, cpu_callout_map);
-               Dprintk("After Callout %d.\n", cpu);
-
-               /*
-                * Wait 5s total for a response
-                */
-               for (timeout = 0; timeout < 50000; timeout++) {
-                       if (cpu_isset(cpu, cpu_callin_map))
-                               break;  /* It has booted */
-                       udelay(100);
-               }
-
-               if (cpu_isset(cpu, cpu_callin_map)) {
-                       /* number CPUs logically, starting from 1 (BSP is 0) */
-                       Dprintk("CPU has booted.\n");
-               } else {
-                       boot_error = 1;
-                       if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
-                                       == 0xA5)
-                               /* trampoline started but...? */
-                               printk("Stuck ??\n");
-                       else
-                               /* trampoline code not run */
-                               printk("Not responding.\n");
-#ifdef APIC_DEBUG
-                       inquire_remote_apic(apicid);
-#endif
-               }
-       }
-       if (boot_error) {
-               cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
-               clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
-               clear_node_cpumask(cpu); /* was set by numa_add_cpu */
-               cpu_clear(cpu, cpu_present_map);
-               cpu_clear(cpu, cpu_possible_map);
-               x86_cpu_to_apicid[cpu] = BAD_APICID;
-               x86_cpu_to_log_apicid[cpu] = BAD_APICID;
-               return -EIO;
-       }
-
-       return 0;
-}
-
-cycles_t cacheflush_time;
-unsigned long cache_decay_ticks;
-
-/*
- * Cleanup possible dangling ends...
- */
-static __cpuinit void smp_cleanup_boot(void)
-{
-       /*
-        * Paranoid:  Set warm reset code and vector here back
-        * to default values.
-        */
-       CMOS_WRITE(0, 0xf);
-
-       /*
-        * Reset trampoline flag
-        */
-       *((volatile int *) phys_to_virt(0x467)) = 0;
-}
-
-/*
- * Fall back to non SMP mode after errors.
- *
- * RED-PEN audit/test this more. I bet there is more state messed up here.
- */
-static __init void disable_smp(void)
-{
-       cpu_present_map = cpumask_of_cpu(0);
-       cpu_possible_map = cpumask_of_cpu(0);
-       if (smp_found_config)
-               phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
-       else
-               phys_cpu_present_map = physid_mask_of_physid(0);
-       cpu_set(0, cpu_sibling_map[0]);
-       cpu_set(0, cpu_core_map[0]);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-int additional_cpus __initdata = -1;
-
-/*
- * cpu_possible_map should be static, it cannot change as cpu's
- * are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
- * do this dynamically on cpu arrival/departure.
- * cpu_present_map on the other hand can change dynamically.
- * In case when cpu_hotplug is not compiled, then we resort to current
- * behaviour, which is cpu_possible == cpu_present.
- * - Ashok Raj
- *
- * Three ways to find out the number of additional hotplug CPUs:
- * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with additional_cpus=NUM
- * - Otherwise don't reserve additional CPUs.
- * We do this because additional CPUs waste a lot of memory.
- * -AK
- */
-__init void prefill_possible_map(void)
-{
-       int i;
-       int possible;
-
-       if (additional_cpus == -1) {
-               if (disabled_cpus > 0)
-                       additional_cpus = disabled_cpus;
-               else
-                       additional_cpus = 0;
-       }
-       possible = num_processors + additional_cpus;
-       if (possible > NR_CPUS) 
-               possible = NR_CPUS;
-
-       printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
-               possible,
-               max_t(int, possible - num_processors, 0));
-
-       for (i = 0; i < possible; i++)
-               cpu_set(i, cpu_possible_map);
-}
-#endif
-
-/*
- * Various sanity checks.
- */
-static int __init smp_sanity_check(unsigned max_cpus)
-{
-       if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
-               printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
-                      hard_smp_processor_id());
-               physid_set(hard_smp_processor_id(), phys_cpu_present_map);
-       }
-
-       /*
-        * If we couldn't find an SMP configuration at boot time,
-        * get out of here now!
-        */
-       if (!smp_found_config) {
-               printk(KERN_NOTICE "SMP motherboard not detected.\n");
-               disable_smp();
-               if (APIC_init_uniprocessor())
-                       printk(KERN_NOTICE "Local APIC not detected."
-                                          " Using dummy APIC emulation.\n");
-               return -1;
-       }
-
-       /*
-        * Should not be necessary because the MP table should list the boot
-        * CPU too, but we do it for the sake of robustness anyway.
-        */
-       if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
-               printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
-                                                                boot_cpu_id);
-               physid_set(hard_smp_processor_id(), phys_cpu_present_map);
-       }
-
-       /*
-        * If we couldn't find a local APIC, then get out of here now!
-        */
-       if (!cpu_has_apic) {
-               printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
-                       boot_cpu_id);
-               printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
-               nr_ioapics = 0;
-               return -1;
-       }
-
-       /*
-        * If SMP should be disabled, then really disable it!
-        */
-       if (!max_cpus) {
-               printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
-               nr_ioapics = 0;
-               return -1;
-       }
-
-       return 0;
-}
-
-/*
- * Prepare for SMP bootup.  The MP table or ACPI has been read
- * earlier.  Just do some sanity checking here and enable APIC mode.
- */
-void __init smp_prepare_cpus(unsigned int max_cpus)
-{
-       nmi_watchdog_default();
-       current_cpu_data = boot_cpu_data;
-       current_thread_info()->cpu = 0;  /* needed? */
-       set_cpu_sibling_map(0);
-
-       if (smp_sanity_check(max_cpus) < 0) {
-               printk(KERN_INFO "SMP disabled\n");
-               disable_smp();
-               return;
-       }
-
-
-       /*
-        * Switch from PIC to APIC mode.
-        */
-       setup_local_APIC();
-
-       if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
-               panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
-                     GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
-               /* Or can we switch back to PIC here? */
-       }
-
-       /*
-        * Now start the IO-APICs
-        */
-       if (!skip_ioapic_setup && nr_ioapics)
-               setup_IO_APIC();
-       else
-               nr_ioapics = 0;
-
-       /*
-        * Set up local APIC timer on boot CPU.
-        */
-
-       setup_boot_APIC_clock();
-}
-
-/*
- * Early setup to make printk work.
- */
-void __init smp_prepare_boot_cpu(void)
-{
-       int me = smp_processor_id();
-       cpu_set(me, cpu_online_map);
-       cpu_set(me, cpu_callout_map);
-       per_cpu(cpu_state, me) = CPU_ONLINE;
-}
-
-/*
- * Entry point to boot a CPU.
- */
-int __cpuinit __cpu_up(unsigned int cpu)
-{
-       int apicid = cpu_present_to_apicid(cpu);
-       unsigned long flags;
-       int err;
-
-       WARN_ON(irqs_disabled());
-
-       Dprintk("++++++++++++++++++++=_---CPU UP  %u\n", cpu);
-
-       if (apicid == BAD_APICID || apicid == boot_cpu_id ||
-           !physid_isset(apicid, phys_cpu_present_map)) {
-               printk("__cpu_up: bad cpu %d\n", cpu);
-               return -EINVAL;
-       }
-
-       /*
-        * Already booted CPU?
-        */
-       if (cpu_isset(cpu, cpu_callin_map)) {
-               Dprintk("do_boot_cpu %d Already started\n", cpu);
-               return -ENOSYS;
-       }
-
-       /*
-        * Save current MTRR state in case it was changed since early boot
-        * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
-        */
-       mtrr_save_state();
-
-       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-       /* Boot it! */
-       err = do_boot_cpu(cpu, apicid);
-       if (err < 0) {
-               Dprintk("do_boot_cpu failed %d\n", err);
-               return err;
-       }
-
-       /* Unleash the CPU! */
-       Dprintk("waiting for cpu %d\n", cpu);
-
-       /*
-        * Make sure and check TSC sync:
-        */
-       local_irq_save(flags);
-       check_tsc_sync_source(cpu);
-       local_irq_restore(flags);
-
-       while (!cpu_isset(cpu, cpu_online_map))
-               cpu_relax();
-       err = 0;
-
-       return err;
-}
-
-/*
- * Finish the SMP boot.
- */
-void __init smp_cpus_done(unsigned int max_cpus)
-{
-       smp_cleanup_boot();
-       setup_ioapic_dest();
-       check_nmi_watchdog();
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void remove_siblinginfo(int cpu)
-{
-       int sibling;
-       struct cpuinfo_x86 *c = cpu_data;
-
-       for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
-               cpu_clear(cpu, cpu_core_map[sibling]);
-               /*
-                * last thread sibling in this cpu core going down
-                */
-               if (cpus_weight(cpu_sibling_map[cpu]) == 1)
-                       c[sibling].booted_cores--;
-       }
-                       
-       for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
-               cpu_clear(cpu, cpu_sibling_map[sibling]);
-       cpus_clear(cpu_sibling_map[cpu]);
-       cpus_clear(cpu_core_map[cpu]);
-       c[cpu].phys_proc_id = 0;
-       c[cpu].cpu_core_id = 0;
-       cpu_clear(cpu, cpu_sibling_setup_map);
-}
-
-void remove_cpu_from_maps(void)
-{
-       int cpu = smp_processor_id();
-
-       cpu_clear(cpu, cpu_callout_map);
-       cpu_clear(cpu, cpu_callin_map);
-       clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
-       clear_node_cpumask(cpu);
-}
-
-int __cpu_disable(void)
-{
-       int cpu = smp_processor_id();
-
-       /*
-        * Perhaps use cpufreq to drop frequency, but that could go
-        * into generic code.
-        *
-        * We won't take down the boot processor on i386 due to some
-        * interrupts only being able to be serviced by the BSP.
-        * Especially so if we're not using an IOAPIC   -zwane
-        */
-       if (cpu == 0)
-               return -EBUSY;
-
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               stop_apic_nmi_watchdog(NULL);
-       clear_local_APIC();
-
-       /*
-        * HACK:
-        * Allow any queued timer interrupts to get serviced
-        * This is only a temporary solution until we cleanup
-        * fixup_irqs as we do for IA64.
-        */
-       local_irq_enable();
-       mdelay(1);
-
-       local_irq_disable();
-       remove_siblinginfo(cpu);
-
-       spin_lock(&vector_lock);
-       /* It's now safe to remove this processor from the online map */
-       cpu_clear(cpu, cpu_online_map);
-       spin_unlock(&vector_lock);
-       remove_cpu_from_maps();
-       fixup_irqs(cpu_online_map);
-       return 0;
-}
-
-void __cpu_die(unsigned int cpu)
-{
-       /* We don't do anything here: idle task is faking death itself. */
-       unsigned int i;
-
-       for (i = 0; i < 10; i++) {
-               /* They ack this in play_dead by setting CPU_DEAD */
-               if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
-                       printk ("CPU %d is now offline\n", cpu);
-                       if (1 == num_online_cpus())
-                               alternatives_smp_switch(0);
-                       return;
-               }
-               msleep(100);
-       }
-       printk(KERN_ERR "CPU %u didn't die...\n", cpu);
-}
-
-static __init int setup_additional_cpus(char *s)
-{
-       return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
-}
-early_param("additional_cpus", setup_additional_cpus);
-
-#else /* ... !CONFIG_HOTPLUG_CPU */
-
-int __cpu_disable(void)
-{
-       return -ENOSYS;
-}
-
-void __cpu_die(unsigned int cpu)
-{
-       /* We said "no" in __cpu_disable */
-       BUG();
-}
-#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
deleted file mode 100644 (file)
index cb91091..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * arch/x86_64/kernel/stacktrace.c
- *
- * Stack trace management functions
- *
- *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- */
-#include <linux/sched.h>
-#include <linux/stacktrace.h>
-#include <linux/module.h>
-#include <asm/stacktrace.h>
-
-static void save_stack_warning(void *data, char *msg)
-{
-}
-
-static void
-save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-}
-
-static int save_stack_stack(void *data, char *name)
-{
-       return -1;
-}
-
-static void save_stack_address(void *data, unsigned long addr)
-{
-       struct stack_trace *trace = (struct stack_trace *)data;
-       if (trace->skip > 0) {
-               trace->skip--;
-               return;
-       }
-       if (trace->nr_entries < trace->max_entries)
-               trace->entries[trace->nr_entries++] = addr;
-}
-
-static struct stacktrace_ops save_stack_ops = {
-       .warning = save_stack_warning,
-       .warning_symbol = save_stack_warning_symbol,
-       .stack = save_stack_stack,
-       .address = save_stack_address,
-};
-
-/*
- * Save stack-backtrace addresses into a stack_trace buffer.
- */
-void save_stack_trace(struct stack_trace *trace)
-{
-       dump_trace(current, NULL, NULL, &save_stack_ops, trace);
-       if (trace->nr_entries < trace->max_entries)
-               trace->entries[trace->nr_entries++] = ULONG_MAX;
-}
-EXPORT_SYMBOL(save_stack_trace);
diff --git a/arch/x86_64/kernel/suspend_64.c b/arch/x86_64/kernel/suspend_64.c
deleted file mode 100644 (file)
index 573c0a6..0000000
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Suspend support specific for i386.
- *
- * Distribute under GPLv2
- *
- * Copyright (c) 2002 Pavel Machek <pavel@suse.cz>
- * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
- */
-
-#include <linux/smp.h>
-#include <linux/suspend.h>
-#include <asm/proto.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/mtrr.h>
-
-/* References to section boundaries */
-extern const void __nosave_begin, __nosave_end;
-
-struct saved_context saved_context;
-
-unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx;
-unsigned long saved_context_esp, saved_context_ebp, saved_context_esi, saved_context_edi;
-unsigned long saved_context_r08, saved_context_r09, saved_context_r10, saved_context_r11;
-unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15;
-unsigned long saved_context_eflags;
-
-void __save_processor_state(struct saved_context *ctxt)
-{
-       kernel_fpu_begin();
-
-       /*
-        * descriptor tables
-        */
-       asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
-       asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
-       asm volatile ("str %0"  : "=m" (ctxt->tr));
-
-       /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
-       /*
-        * segment registers
-        */
-       asm volatile ("movw %%ds, %0" : "=m" (ctxt->ds));
-       asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
-       asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
-       asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
-       asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
-
-       rdmsrl(MSR_FS_BASE, ctxt->fs_base);
-       rdmsrl(MSR_GS_BASE, ctxt->gs_base);
-       rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
-       mtrr_save_fixed_ranges(NULL);
-
-       /*
-        * control registers 
-        */
-       rdmsrl(MSR_EFER, ctxt->efer);
-       ctxt->cr0 = read_cr0();
-       ctxt->cr2 = read_cr2();
-       ctxt->cr3 = read_cr3();
-       ctxt->cr4 = read_cr4();
-       ctxt->cr8 = read_cr8();
-}
-
-void save_processor_state(void)
-{
-       __save_processor_state(&saved_context);
-}
-
-static void do_fpu_end(void)
-{
-       /*
-        * Restore FPU regs if necessary
-        */
-       kernel_fpu_end();
-}
-
-void __restore_processor_state(struct saved_context *ctxt)
-{
-       /*
-        * control registers
-        */
-       wrmsrl(MSR_EFER, ctxt->efer);
-       write_cr8(ctxt->cr8);
-       write_cr4(ctxt->cr4);
-       write_cr3(ctxt->cr3);
-       write_cr2(ctxt->cr2);
-       write_cr0(ctxt->cr0);
-
-       /*
-        * now restore the descriptor tables to their proper values
-        * ltr is done i fix_processor_context().
-        */
-       asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
-       asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
-
-       /*
-        * segment registers
-        */
-       asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
-       asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
-       asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
-       load_gs_index(ctxt->gs);
-       asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
-
-       wrmsrl(MSR_FS_BASE, ctxt->fs_base);
-       wrmsrl(MSR_GS_BASE, ctxt->gs_base);
-       wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
-
-       fix_processor_context();
-
-       do_fpu_end();
-       mtrr_ap_init();
-}
-
-void restore_processor_state(void)
-{
-       __restore_processor_state(&saved_context);
-}
-
-void fix_processor_context(void)
-{
-       int cpu = smp_processor_id();
-       struct tss_struct *t = &per_cpu(init_tss, cpu);
-
-       set_tss_desc(cpu,t);    /* This just modifies memory; should not be neccessary. But... This is neccessary, because 386 hardware has concept of busy TSS or some similar stupidity. */
-
-       cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9;
-
-       syscall_init();                         /* This sets MSR_*STAR and related */
-       load_TR_desc();                         /* This does ltr */
-       load_LDT(&current->active_mm->context); /* This does lldt */
-
-       /*
-        * Now maybe reload the debug registers
-        */
-       if (current->thread.debugreg7){
-                loaddebug(&current->thread, 0);
-                loaddebug(&current->thread, 1);
-                loaddebug(&current->thread, 2);
-                loaddebug(&current->thread, 3);
-                /* no 4 and 5 */
-                loaddebug(&current->thread, 6);
-                loaddebug(&current->thread, 7);
-       }
-
-}
-
-#ifdef CONFIG_HIBERNATION
-/* Defined in arch/x86_64/kernel/suspend_asm.S */
-extern int restore_image(void);
-
-pgd_t *temp_level4_pgt;
-
-static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
-{
-       long i, j;
-
-       i = pud_index(address);
-       pud = pud + i;
-       for (; i < PTRS_PER_PUD; pud++, i++) {
-               unsigned long paddr;
-               pmd_t *pmd;
-
-               paddr = address + i*PUD_SIZE;
-               if (paddr >= end)
-                       break;
-
-               pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
-               if (!pmd)
-                       return -ENOMEM;
-               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
-               for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-                       unsigned long pe;
-
-                       if (paddr >= end)
-                               break;
-                       pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
-                       pe &= __supported_pte_mask;
-                       set_pmd(pmd, __pmd(pe));
-               }
-       }
-       return 0;
-}
-
-static int set_up_temporary_mappings(void)
-{
-       unsigned long start, end, next;
-       int error;
-
-       temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
-       if (!temp_level4_pgt)
-               return -ENOMEM;
-
-       /* It is safe to reuse the original kernel mapping */
-       set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
-               init_level4_pgt[pgd_index(__START_KERNEL_map)]);
-
-       /* Set up the direct mapping from scratch */
-       start = (unsigned long)pfn_to_kaddr(0);
-       end = (unsigned long)pfn_to_kaddr(end_pfn);
-
-       for (; start < end; start = next) {
-               pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
-               if (!pud)
-                       return -ENOMEM;
-               next = start + PGDIR_SIZE;
-               if (next > end)
-                       next = end;
-               if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
-                       return error;
-               set_pgd(temp_level4_pgt + pgd_index(start),
-                       mk_kernel_pgd(__pa(pud)));
-       }
-       return 0;
-}
-
-int swsusp_arch_resume(void)
-{
-       int error;
-
-       /* We have got enough memory and from now on we cannot recover */
-       if ((error = set_up_temporary_mappings()))
-               return error;
-       restore_image();
-       return 0;
-}
-
-/*
- *     pfn_is_nosave - check if given pfn is in the 'nosave' section
- */
-
-int pfn_is_nosave(unsigned long pfn)
-{
-       unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
-       unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
-       return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
-}
-#endif /* CONFIG_HIBERNATION */
diff --git a/arch/x86_64/kernel/suspend_asm_64.S b/arch/x86_64/kernel/suspend_asm_64.S
deleted file mode 100644 (file)
index 16d183f..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2004,2005 Pavel Machek <pavel@suse.cz>, Andi Kleen <ak@suse.de>, Rafael J. Wysocki <rjw@sisk.pl>
- *
- * Distribute under GPLv2.
- *
- * swsusp_arch_resume may not use any stack, nor any variable that is
- * not "NoSave" during copying pages:
- *
- * Its rewriting one kernel image with another. What is stack in "old"
- * image could very well be data page in "new" image, and overwriting
- * your own stack under you is bad idea.
- */
-       
-       .text
-#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/asm-offsets.h>
-
-ENTRY(swsusp_arch_suspend)
-
-       movq %rsp, saved_context_esp(%rip)
-       movq %rax, saved_context_eax(%rip)
-       movq %rbx, saved_context_ebx(%rip)
-       movq %rcx, saved_context_ecx(%rip)
-       movq %rdx, saved_context_edx(%rip)
-       movq %rbp, saved_context_ebp(%rip)
-       movq %rsi, saved_context_esi(%rip)
-       movq %rdi, saved_context_edi(%rip)
-       movq %r8,  saved_context_r08(%rip)
-       movq %r9,  saved_context_r09(%rip)
-       movq %r10, saved_context_r10(%rip)
-       movq %r11, saved_context_r11(%rip)
-       movq %r12, saved_context_r12(%rip)
-       movq %r13, saved_context_r13(%rip)
-       movq %r14, saved_context_r14(%rip)
-       movq %r15, saved_context_r15(%rip)
-       pushfq ; popq saved_context_eflags(%rip)
-
-       call swsusp_save
-       ret
-
-ENTRY(restore_image)
-       /* switch to temporary page tables */
-       movq    $__PAGE_OFFSET, %rdx
-       movq    temp_level4_pgt(%rip), %rax
-       subq    %rdx, %rax
-       movq    %rax, %cr3
-       /* Flush TLB */
-       movq    mmu_cr4_features(%rip), %rax
-       movq    %rax, %rdx
-       andq    $~(1<<7), %rdx  # PGE
-       movq    %rdx, %cr4;  # turn off PGE
-       movq    %cr3, %rcx;  # flush TLB
-       movq    %rcx, %cr3;
-       movq    %rax, %cr4;  # turn PGE back on
-
-       movq    restore_pblist(%rip), %rdx
-loop:
-       testq   %rdx, %rdx
-       jz      done
-
-       /* get addresses from the pbe and copy the page */
-       movq    pbe_address(%rdx), %rsi
-       movq    pbe_orig_address(%rdx), %rdi
-       movq    $512, %rcx
-       rep
-       movsq
-
-       /* progress to the next pbe */
-       movq    pbe_next(%rdx), %rdx
-       jmp     loop
-done:
-       /* go back to the original page tables */
-       movq    $(init_level4_pgt - __START_KERNEL_map), %rax
-       addq    phys_base(%rip), %rax
-       movq    %rax, %cr3
-
-       /* Flush TLB, including "global" things (vmalloc) */
-       movq    mmu_cr4_features(%rip), %rax
-       movq    %rax, %rdx
-       andq    $~(1<<7), %rdx;  # PGE
-       movq    %rdx, %cr4;  # turn off PGE
-       movq    %cr3, %rcx;  # flush TLB
-       movq    %rcx, %cr3
-       movq    %rax, %cr4;  # turn PGE back on
-
-       movl    $24, %eax
-       movl    %eax, %ds
-
-       movq saved_context_esp(%rip), %rsp
-       movq saved_context_ebp(%rip), %rbp
-       /* Don't restore %rax, it must be 0 anyway */
-       movq saved_context_ebx(%rip), %rbx
-       movq saved_context_ecx(%rip), %rcx
-       movq saved_context_edx(%rip), %rdx
-       movq saved_context_esi(%rip), %rsi
-       movq saved_context_edi(%rip), %rdi
-       movq saved_context_r08(%rip), %r8
-       movq saved_context_r09(%rip), %r9
-       movq saved_context_r10(%rip), %r10
-       movq saved_context_r11(%rip), %r11
-       movq saved_context_r12(%rip), %r12
-       movq saved_context_r13(%rip), %r13
-       movq saved_context_r14(%rip), %r14
-       movq saved_context_r15(%rip), %r15
-       pushq saved_context_eflags(%rip) ; popfq
-
-       xorq    %rax, %rax
-
-       ret
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
deleted file mode 100644 (file)
index 4770b7a..0000000
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * linux/arch/x86_64/kernel/sys_x86_64.c
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/syscalls.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/smp.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/shm.h>
-#include <linux/stat.h>
-#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-#include <linux/personality.h>
-
-#include <asm/uaccess.h>
-#include <asm/ia32.h>
-
-/*
- * sys_pipe() is the normal C calling standard for creating
- * a pipe. It's not the way Unix traditionally does this, though.
- */
-asmlinkage long sys_pipe(int __user *fildes)
-{
-       int fd[2];
-       int error;
-
-       error = do_pipe(fd);
-       if (!error) {
-               if (copy_to_user(fildes, fd, 2*sizeof(int)))
-                       error = -EFAULT;
-       }
-       return error;
-}
-
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
-       unsigned long fd, unsigned long off)
-{
-       long error;
-       struct file * file;
-
-       error = -EINVAL;
-       if (off & ~PAGE_MASK)
-               goto out;
-
-       error = -EBADF;
-       file = NULL;
-       flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
-       if (!(flags & MAP_ANONYMOUS)) {
-               file = fget(fd);
-               if (!file)
-                       goto out;
-       }
-       down_write(&current->mm->mmap_sem);
-       error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
-       up_write(&current->mm->mmap_sem);
-
-       if (file)
-               fput(file);
-out:
-       return error;
-}
-
-static void find_start_end(unsigned long flags, unsigned long *begin,
-                          unsigned long *end)
-{
-       if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
-               /* This is usually used needed to map code in small
-                  model, so it needs to be in the first 31bit. Limit
-                  it to that.  This means we need to move the
-                  unmapped base down for this case. This can give
-                  conflicts with the heap, but we assume that glibc
-                  malloc knows how to fall back to mmap. Give it 1GB
-                  of playground for now. -AK */ 
-               *begin = 0x40000000; 
-               *end = 0x80000000;              
-       } else {
-               *begin = TASK_UNMAPPED_BASE;
-               *end = TASK_SIZE; 
-       }
-} 
-
-unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
-               unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       unsigned long start_addr;
-       unsigned long begin, end;
-       
-       if (flags & MAP_FIXED)
-               return addr;
-
-       find_start_end(flags, &begin, &end); 
-
-       if (len > end)
-               return -ENOMEM;
-
-       if (addr) {
-               addr = PAGE_ALIGN(addr);
-               vma = find_vma(mm, addr);
-               if (end - len >= addr &&
-                   (!vma || addr + len <= vma->vm_start))
-                       return addr;
-       }
-       if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
-           && len <= mm->cached_hole_size) {
-               mm->cached_hole_size = 0;
-               mm->free_area_cache = begin;
-       }
-       addr = mm->free_area_cache;
-       if (addr < begin) 
-               addr = begin; 
-       start_addr = addr;
-
-full_search:
-       for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-               /* At this point:  (!vma || addr < vma->vm_end). */
-               if (end - len < addr) {
-                       /*
-                        * Start a new search - just in case we missed
-                        * some holes.
-                        */
-                       if (start_addr != begin) {
-                               start_addr = addr = begin;
-                               mm->cached_hole_size = 0;
-                               goto full_search;
-                       }
-                       return -ENOMEM;
-               }
-               if (!vma || addr + len <= vma->vm_start) {
-                       /*
-                        * Remember the place where we stopped the search:
-                        */
-                       mm->free_area_cache = addr + len;
-                       return addr;
-               }
-               if (addr + mm->cached_hole_size < vma->vm_start)
-                       mm->cached_hole_size = vma->vm_start - addr;
-
-               addr = vma->vm_end;
-       }
-}
-
-asmlinkage long sys_uname(struct new_utsname __user * name)
-{
-       int err;
-       down_read(&uts_sem);
-       err = copy_to_user(name, utsname(), sizeof (*name));
-       up_read(&uts_sem);
-       if (personality(current->personality) == PER_LINUX32) 
-               err |= copy_to_user(&name->machine, "i686", 5);                 
-       return err ? -EFAULT : 0;
-}
diff --git a/arch/x86_64/kernel/syscall_64.c b/arch/x86_64/kernel/syscall_64.c
deleted file mode 100644 (file)
index 63d592c..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-/* System call table for x86-64. */ 
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#define __NO_STUBS
-
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 
-#undef _ASM_X86_64_UNISTD_H_
-#include <asm-x86_64/unistd.h>
-
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [ nr ] = sym, 
-#undef _ASM_X86_64_UNISTD_H_
-
-typedef void (*sys_call_ptr_t)(void); 
-
-extern void sys_ni_syscall(void);
-
-const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
-       /* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ 
-       [0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm-x86_64/unistd.h>
-};
diff --git a/arch/x86_64/kernel/tce_64.c b/arch/x86_64/kernel/tce_64.c
deleted file mode 100644 (file)
index e3f2569..0000000
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * This file manages the translation entries for the IBM Calgary IOMMU.
- *
- * Derived from arch/powerpc/platforms/pseries/iommu.c
- *
- * Copyright (C) IBM Corporation, 2006
- *
- * Author: Jon Mason <jdmason@us.ibm.com>
- * Author: Muli Ben-Yehuda <muli@il.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/bootmem.h>
-#include <asm/tce.h>
-#include <asm/calgary.h>
-#include <asm/proto.h>
-
-/* flush a tce at 'tceaddr' to main memory */
-static inline void flush_tce(void* tceaddr)
-{
-       /* a single tce can't cross a cache line */
-       if (cpu_has_clflush)
-               asm volatile("clflush (%0)" :: "r" (tceaddr));
-       else
-               asm volatile("wbinvd":::"memory");
-}
-
-void tce_build(struct iommu_table *tbl, unsigned long index,
-       unsigned int npages, unsigned long uaddr, int direction)
-{
-       u64* tp;
-       u64 t;
-       u64 rpn;
-
-       t = (1 << TCE_READ_SHIFT);
-       if (direction != DMA_TO_DEVICE)
-               t |= (1 << TCE_WRITE_SHIFT);
-
-       tp = ((u64*)tbl->it_base) + index;
-
-       while (npages--) {
-               rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
-               t &= ~TCE_RPN_MASK;
-               t |= (rpn << TCE_RPN_SHIFT);
-
-               *tp = cpu_to_be64(t);
-               flush_tce(tp);
-
-               uaddr += PAGE_SIZE;
-               tp++;
-       }
-}
-
-void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
-{
-       u64* tp;
-
-       tp  = ((u64*)tbl->it_base) + index;
-
-       while (npages--) {
-               *tp = cpu_to_be64(0);
-               flush_tce(tp);
-               tp++;
-       }
-}
-
-static inline unsigned int table_size_to_number_of_entries(unsigned char size)
-{
-       /*
-        * size is the order of the table, 0-7
-        * smallest table is 8K entries, so shift result by 13 to
-        * multiply by 8K
-        */
-       return (1 << size) << 13;
-}
-
-static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
-{
-       unsigned int bitmapsz;
-       unsigned long bmppages;
-       int ret;
-
-       tbl->it_busno = dev->bus->number;
-
-       /* set the tce table size - measured in entries */
-       tbl->it_size = table_size_to_number_of_entries(specified_table_size);
-
-       /*
-        * number of bytes needed for the bitmap size in number of
-        * entries; we need one bit per entry
-        */
-       bitmapsz = tbl->it_size / BITS_PER_BYTE;
-       bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
-       if (!bmppages) {
-               printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       tbl->it_map = (unsigned long*)bmppages;
-
-       memset(tbl->it_map, 0, bitmapsz);
-
-       tbl->it_hint = 0;
-
-       spin_lock_init(&tbl->it_lock);
-
-       return 0;
-
-done:
-       return ret;
-}
-
-int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
-{
-       struct iommu_table *tbl;
-       int ret;
-
-       if (pci_iommu(dev->bus)) {
-               printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
-                      dev, pci_iommu(dev->bus));
-               BUG();
-       }
-
-       tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
-       if (!tbl) {
-               printk(KERN_ERR "Calgary: error allocating iommu_table\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       ret = tce_table_setparms(dev, tbl);
-       if (ret)
-               goto free_tbl;
-
-       tbl->bbar = bbar;
-
-       set_pci_iommu(dev->bus, tbl);
-
-       return 0;
-
-free_tbl:
-       kfree(tbl);
-done:
-       return ret;
-}
-
-void * __init alloc_tce_table(void)
-{
-       unsigned int size;
-
-       size = table_size_to_number_of_entries(specified_table_size);
-       size *= TCE_ENTRY_SIZE;
-
-       return __alloc_bootmem_low(size, size, 0);
-}
-
-void __init free_tce_table(void *tbl)
-{
-       unsigned int size;
-
-       if (!tbl)
-               return;
-
-       size = table_size_to_number_of_entries(specified_table_size);
-       size *= TCE_ENTRY_SIZE;
-
-       free_bootmem(__pa(tbl), size);
-}
diff --git a/arch/x86_64/kernel/time_64.c b/arch/x86_64/kernel/time_64.c
deleted file mode 100644 (file)
index 6d48a4e..0000000
+++ /dev/null
@@ -1,447 +0,0 @@
-/*
- *  linux/arch/x86-64/kernel/time.c
- *
- *  "High Precision Event Timer" based timekeeping.
- *
- *  Copyright (c) 1991,1992,1995  Linus Torvalds
- *  Copyright (c) 1994  Alan Modra
- *  Copyright (c) 1995  Markus Kuhn
- *  Copyright (c) 1996  Ingo Molnar
- *  Copyright (c) 1998  Andrea Arcangeli
- *  Copyright (c) 2002,2006  Vojtech Pavlik
- *  Copyright (c) 2003  Andi Kleen
- *  RTC support code taken from arch/i386/kernel/timers/time_hpet.c
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/mc146818rtc.h>
-#include <linux/time.h>
-#include <linux/ioport.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/sysdev.h>
-#include <linux/bcd.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/kallsyms.h>
-#include <linux/acpi.h>
-#ifdef CONFIG_ACPI
-#include <acpi/achware.h>      /* for PM timer frequency */
-#include <acpi/acpi_bus.h>
-#endif
-#include <asm/8253pit.h>
-#include <asm/i8253.h>
-#include <asm/pgtable.h>
-#include <asm/vsyscall.h>
-#include <asm/timex.h>
-#include <asm/proto.h>
-#include <asm/hpet.h>
-#include <asm/sections.h>
-#include <linux/hpet.h>
-#include <asm/apic.h>
-#include <asm/hpet.h>
-#include <asm/mpspec.h>
-#include <asm/nmi.h>
-#include <asm/vgtod.h>
-
-static char *timename = NULL;
-
-DEFINE_SPINLOCK(rtc_lock);
-EXPORT_SYMBOL(rtc_lock);
-DEFINE_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
-
-unsigned long profile_pc(struct pt_regs *regs)
-{
-       unsigned long pc = instruction_pointer(regs);
-
-       /* Assume the lock function has either no stack frame or a copy
-          of eflags from PUSHF
-          Eflags always has bits 22 and up cleared unlike kernel addresses. */
-       if (!user_mode(regs) && in_lock_functions(pc)) {
-               unsigned long *sp = (unsigned long *)regs->rsp;
-               if (sp[0] >> 22)
-                       return sp[0];
-               if (sp[1] >> 22)
-                       return sp[1];
-       }
-       return pc;
-}
-EXPORT_SYMBOL(profile_pc);
-
-/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
- * ms after the second nowtime has started, because when nowtime is written
- * into the registers of the CMOS clock, it will jump to the next second
- * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
- * sheet for details.
- */
-
-static int set_rtc_mmss(unsigned long nowtime)
-{
-       int retval = 0;
-       int real_seconds, real_minutes, cmos_minutes;
-       unsigned char control, freq_select;
-
-/*
- * IRQs are disabled when we're called from the timer interrupt,
- * no need for spin_lock_irqsave()
- */
-
-       spin_lock(&rtc_lock);
-
-/*
- * Tell the clock it's being set and stop it.
- */
-
-       control = CMOS_READ(RTC_CONTROL);
-       CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
-
-       freq_select = CMOS_READ(RTC_FREQ_SELECT);
-       CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
-
-       cmos_minutes = CMOS_READ(RTC_MINUTES);
-               BCD_TO_BIN(cmos_minutes);
-
-/*
- * since we're only adjusting minutes and seconds, don't interfere with hour
- * overflow. This avoids messing with unknown time zones but requires your RTC
- * not to be off by more than 15 minutes. Since we're calling it only when
- * our clock is externally synchronized using NTP, this shouldn't be a problem.
- */
-
-       real_seconds = nowtime % 60;
-       real_minutes = nowtime / 60;
-       if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
-               real_minutes += 30;             /* correct for half hour time zone */
-       real_minutes %= 60;
-
-       if (abs(real_minutes - cmos_minutes) >= 30) {
-               printk(KERN_WARNING "time.c: can't update CMOS clock "
-                      "from %d to %d\n", cmos_minutes, real_minutes);
-               retval = -1;
-       } else {
-               BIN_TO_BCD(real_seconds);
-               BIN_TO_BCD(real_minutes);
-               CMOS_WRITE(real_seconds, RTC_SECONDS);
-               CMOS_WRITE(real_minutes, RTC_MINUTES);
-       }
-
-/*
- * The following flags have to be released exactly in this order, otherwise the
- * DS12887 (popular MC146818A clone with integrated battery and quartz) will
- * not reset the oscillator and will not update precisely 500 ms later. You
- * won't find this mentioned in the Dallas Semiconductor data sheets, but who
- * believes data sheets anyway ... -- Markus Kuhn
- */
-
-       CMOS_WRITE(control, RTC_CONTROL);
-       CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
-
-       spin_unlock(&rtc_lock);
-
-       return retval;
-}
-
-int update_persistent_clock(struct timespec now)
-{
-       return set_rtc_mmss(now.tv_sec);
-}
-
-void main_timer_handler(void)
-{
-/*
- * Here we are in the timer irq handler. We have irqs locally disabled (so we
- * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
- * on the other CPU, so we need a lock. We also need to lock the vsyscall
- * variables, because both do_timer() and us change them -arca+vojtech
- */
-
-       write_seqlock(&xtime_lock);
-
-/*
- * Do the timer stuff.
- */
-
-       do_timer(1);
-#ifndef CONFIG_SMP
-       update_process_times(user_mode(get_irq_regs()));
-#endif
-
-/*
- * In the SMP case we use the local APIC timer interrupt to do the profiling,
- * except when we simulate SMP mode on a uniprocessor system, in that case we
- * have to call the local interrupt handler.
- */
-
-       if (!using_apic_timer)
-               smp_local_timer_interrupt();
-
-       write_sequnlock(&xtime_lock);
-}
-
-static irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
-       if (apic_runs_main_timer > 1)
-               return IRQ_HANDLED;
-       main_timer_handler();
-       if (using_apic_timer)
-               smp_send_timer_broadcast_ipi();
-       return IRQ_HANDLED;
-}
-
-unsigned long read_persistent_clock(void)
-{
-       unsigned int year, mon, day, hour, min, sec;
-       unsigned long flags;
-       unsigned century = 0;
-
-       spin_lock_irqsave(&rtc_lock, flags);
-
-       do {
-               sec = CMOS_READ(RTC_SECONDS);
-               min = CMOS_READ(RTC_MINUTES);
-               hour = CMOS_READ(RTC_HOURS);
-               day = CMOS_READ(RTC_DAY_OF_MONTH);
-               mon = CMOS_READ(RTC_MONTH);
-               year = CMOS_READ(RTC_YEAR);
-#ifdef CONFIG_ACPI
-               if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
-                                       acpi_gbl_FADT.century)
-                       century = CMOS_READ(acpi_gbl_FADT.century);
-#endif
-       } while (sec != CMOS_READ(RTC_SECONDS));
-
-       spin_unlock_irqrestore(&rtc_lock, flags);
-
-       /*
-        * We know that x86-64 always uses BCD format, no need to check the
-        * config register.
-        */
-
-       BCD_TO_BIN(sec);
-       BCD_TO_BIN(min);
-       BCD_TO_BIN(hour);
-       BCD_TO_BIN(day);
-       BCD_TO_BIN(mon);
-       BCD_TO_BIN(year);
-
-       if (century) {
-               BCD_TO_BIN(century);
-               year += century * 100;
-               printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
-       } else {
-               /*
-                * x86-64 systems only exists since 2002.
-                * This will work up to Dec 31, 2100
-                */
-               year += 2000;
-       }
-
-       return mktime(year, mon, day, hour, min, sec);
-}
-
-/* calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency */
-#define TICK_COUNT 100000000
-static unsigned int __init tsc_calibrate_cpu_khz(void)
-{
-       int tsc_start, tsc_now;
-       int i, no_ctr_free;
-       unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
-       unsigned long flags;
-
-       for (i = 0; i < 4; i++)
-               if (avail_to_resrv_perfctr_nmi_bit(i))
-                       break;
-       no_ctr_free = (i == 4);
-       if (no_ctr_free) {
-               i = 3;
-               rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
-               wrmsrl(MSR_K7_EVNTSEL3, 0);
-               rdmsrl(MSR_K7_PERFCTR3, pmc3);
-       } else {
-               reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-               reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-       }
-       local_irq_save(flags);
-       /* start meauring cycles, incrementing from 0 */
-       wrmsrl(MSR_K7_PERFCTR0 + i, 0);
-       wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
-       rdtscl(tsc_start);
-       do {
-               rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
-               tsc_now = get_cycles_sync();
-       } while ((tsc_now - tsc_start) < TICK_COUNT);
-
-       local_irq_restore(flags);
-       if (no_ctr_free) {
-               wrmsrl(MSR_K7_EVNTSEL3, 0);
-               wrmsrl(MSR_K7_PERFCTR3, pmc3);
-               wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
-       } else {
-               release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-               release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-       }
-
-       return pmc_now * tsc_khz / (tsc_now - tsc_start);
-}
-
-/*
- * pit_calibrate_tsc() uses the speaker output (channel 2) of
- * the PIT. This is better than using the timer interrupt output,
- * because we can read the value of the speaker with just one inb(),
- * where we need three i/o operations for the interrupt channel.
- * We count how many ticks the TSC does in 50 ms.
- */
-
-static unsigned int __init pit_calibrate_tsc(void)
-{
-       unsigned long start, end;
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8253_lock, flags);
-
-       outb((inb(0x61) & ~0x02) | 0x01, 0x61);
-
-       outb(0xb0, 0x43);
-       outb((PIT_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
-       outb((PIT_TICK_RATE / (1000 / 50)) >> 8, 0x42);
-       start = get_cycles_sync();
-       while ((inb(0x61) & 0x20) == 0);
-       end = get_cycles_sync();
-
-       spin_unlock_irqrestore(&i8253_lock, flags);
-
-       return (end - start) / 50;
-}
-
-#define PIT_MODE 0x43
-#define PIT_CH0  0x40
-
-static void __pit_init(int val, u8 mode)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&i8253_lock, flags);
-       outb_p(mode, PIT_MODE);
-       outb_p(val & 0xff, PIT_CH0);    /* LSB */
-       outb_p(val >> 8, PIT_CH0);      /* MSB */
-       spin_unlock_irqrestore(&i8253_lock, flags);
-}
-
-void __init pit_init(void)
-{
-       __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */
-}
-
-void pit_stop_interrupt(void)
-{
-       __pit_init(0, 0x30); /* mode 0 */
-}
-
-void stop_timer_interrupt(void)
-{
-       char *name;
-       if (hpet_address) {
-               name = "HPET";
-               hpet_timer_stop_set_go(0);
-       } else {
-               name = "PIT";
-               pit_stop_interrupt();
-       }
-       printk(KERN_INFO "timer: %s interrupt stopped.\n", name);
-}
-
-static struct irqaction irq0 = {
-       .handler        = timer_interrupt,
-       .flags          = IRQF_DISABLED | IRQF_IRQPOLL,
-       .mask           = CPU_MASK_NONE,
-       .name           = "timer"
-};
-
-void __init time_init(void)
-{
-       if (nohpet)
-               hpet_address = 0;
-
-       if (hpet_arch_init())
-               hpet_address = 0;
-
-       if (hpet_use_timer) {
-               /* set tick_nsec to use the proper rate for HPET */
-               tick_nsec = TICK_NSEC_HPET;
-               tsc_khz = hpet_calibrate_tsc();
-               timename = "HPET";
-       } else {
-               pit_init();
-               tsc_khz = pit_calibrate_tsc();
-               timename = "PIT";
-       }
-
-       cpu_khz = tsc_khz;
-       if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
-               boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-               boot_cpu_data.x86 == 16)
-               cpu_khz = tsc_calibrate_cpu_khz();
-
-       if (unsynchronized_tsc())
-               mark_tsc_unstable("TSCs unsynchronized");
-
-       if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
-               vgetcpu_mode = VGETCPU_RDTSCP;
-       else
-               vgetcpu_mode = VGETCPU_LSL;
-
-       set_cyc2ns_scale(tsc_khz);
-       printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
-               cpu_khz / 1000, cpu_khz % 1000);
-       init_tsc_clocksource();
-
-       setup_irq(0, &irq0);
-}
-
-/*
- * sysfs support for the timer.
- */
-
-static int timer_suspend(struct sys_device *dev, pm_message_t state)
-{
-       return 0;
-}
-
-static int timer_resume(struct sys_device *dev)
-{
-       if (hpet_address)
-               hpet_reenable();
-       else
-               i8254_timer_resume();
-       return 0;
-}
-
-static struct sysdev_class timer_sysclass = {
-       .resume = timer_resume,
-       .suspend = timer_suspend,
-       set_kset_name("timer"),
-};
-
-/* XXX this sysfs stuff should probably go elsewhere later -john */
-static struct sys_device device_timer = {
-       .id     = 0,
-       .cls    = &timer_sysclass,
-};
-
-static int time_init_device(void)
-{
-       int error = sysdev_class_register(&timer_sysclass);
-       if (!error)
-               error = sysdev_register(&device_timer);
-       return error;
-}
-
-device_initcall(time_init_device);
diff --git a/arch/x86_64/kernel/trampoline_64.S b/arch/x86_64/kernel/trampoline_64.S
deleted file mode 100644 (file)
index 607983b..0000000
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- *
- *     Trampoline.S    Derived from Setup.S by Linus Torvalds
- *
- *     4 Jan 1997 Michael Chastain: changed to gnu as.
- *     15 Sept 2005 Eric Biederman: 64bit PIC support
- *
- *     Entry: CS:IP point to the start of our code, we are 
- *     in real mode with no stack, but the rest of the 
- *     trampoline page to make our stack and everything else
- *     is a mystery.
- *
- *     In fact we don't actually need a stack so we don't
- *     set one up.
- *
- *     On entry to trampoline_data, the processor is in real mode
- *     with 16-bit addressing and 16-bit data.  CS has some value
- *     and IP is zero.  Thus, data addresses need to be absolute
- *     (no relocation) and are taken with regard to r_base.
- *
- *     With the addition of trampoline_level4_pgt this code can
- *     now enter a 64bit kernel that lives at arbitrary 64bit
- *     physical addresses.
- *
- *     If you work on this file, check the object module with objdump
- *     --full-contents --reloc to make sure there are no relocation
- *     entries.
- */
-
-#include <linux/linkage.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/msr.h>
-#include <asm/segment.h>
-
-.data
-
-.code16
-
-ENTRY(trampoline_data)
-r_base = .
-       cli                     # We should be safe anyway
-       wbinvd  
-       mov     %cs, %ax        # Code and data in the same place
-       mov     %ax, %ds
-       mov     %ax, %es
-       mov     %ax, %ss
-
-
-       movl    $0xA5A5A5A5, trampoline_data - r_base
-                               # write marker for master knows we're running
-
-                                       # Setup stack
-       movw    $(trampoline_stack_end - r_base), %sp
-
-       call    verify_cpu              # Verify the cpu supports long mode
-       testl   %eax, %eax              # Check for return code
-       jnz     no_longmode
-
-       mov     %cs, %ax
-       movzx   %ax, %esi               # Find the 32bit trampoline location
-       shll    $4, %esi
-
-                                       # Fixup the vectors
-       addl    %esi, startup_32_vector - r_base
-       addl    %esi, startup_64_vector - r_base
-       addl    %esi, tgdt + 2 - r_base # Fixup the gdt pointer
-
-       /*
-        * GDT tables in non default location kernel can be beyond 16MB and
-        * lgdt will not be able to load the address as in real mode default
-        * operand size is 16bit. Use lgdtl instead to force operand size
-        * to 32 bit.
-        */
-
-       lidtl   tidt - r_base   # load idt with 0, 0
-       lgdtl   tgdt - r_base   # load gdt with whatever is appropriate
-
-       xor     %ax, %ax
-       inc     %ax             # protected mode (PE) bit
-       lmsw    %ax             # into protected mode
-
-       # flush prefetch and jump to startup_32
-       ljmpl   *(startup_32_vector - r_base)
-
-       .code32
-       .balign 4
-startup_32:
-       movl    $__KERNEL_DS, %eax      # Initialize the %ds segment register
-       movl    %eax, %ds
-
-       xorl    %eax, %eax
-       btsl    $5, %eax                # Enable PAE mode
-       movl    %eax, %cr4
-
-                                       # Setup trampoline 4 level pagetables
-       leal    (trampoline_level4_pgt - r_base)(%esi), %eax
-       movl    %eax, %cr3
-
-       movl    $MSR_EFER, %ecx
-       movl    $(1 << _EFER_LME), %eax # Enable Long Mode
-       xorl    %edx, %edx
-       wrmsr
-
-       xorl    %eax, %eax
-       btsl    $31, %eax               # Enable paging and in turn activate Long Mode
-       btsl    $0, %eax                # Enable protected mode
-       movl    %eax, %cr0
-
-       /*
-        * At this point we're in long mode but in 32bit compatibility mode
-        * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
-        * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
-        * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
-        */
-       ljmp    *(startup_64_vector - r_base)(%esi)
-
-       .code64
-       .balign 4
-startup_64:
-       # Now jump into the kernel using virtual addresses
-       movq    $secondary_startup_64, %rax
-       jmp     *%rax
-
-       .code16
-no_longmode:
-       hlt
-       jmp no_longmode
-#include "verify_cpu_64.S"
-
-       # Careful these need to be in the same 64K segment as the above;
-tidt:
-       .word   0                       # idt limit = 0
-       .word   0, 0                    # idt base = 0L
-
-       # Duplicate the global descriptor table
-       # so the kernel can live anywhere
-       .balign 4
-tgdt:
-       .short  tgdt_end - tgdt         # gdt limit
-       .long   tgdt - r_base
-       .short 0
-       .quad   0x00cf9b000000ffff      # __KERNEL32_CS
-       .quad   0x00af9b000000ffff      # __KERNEL_CS
-       .quad   0x00cf93000000ffff      # __KERNEL_DS
-tgdt_end:
-
-       .balign 4
-startup_32_vector:
-       .long   startup_32 - r_base
-       .word   __KERNEL32_CS, 0
-
-       .balign 4
-startup_64_vector:
-       .long   startup_64 - r_base
-       .word   __KERNEL_CS, 0
-
-trampoline_stack:
-       .org 0x1000
-trampoline_stack_end:
-ENTRY(trampoline_level4_pgt)
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .fill   510,8,0
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
-ENTRY(trampoline_end)
diff --git a/arch/x86_64/kernel/traps_64.c b/arch/x86_64/kernel/traps_64.c
deleted file mode 100644 (file)
index 0388842..0000000
+++ /dev/null
@@ -1,1138 +0,0 @@
-/*
- *  linux/arch/x86-64/traps.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
- *
- *  Pentium III FXSR, SSE support
- *     Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-/*
- * 'Traps.c' handles hardware traps and faults after we have saved some
- * state in 'entry.S'.
- */
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/timer.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/nmi.h>
-#include <linux/kprobes.h>
-#include <linux/kexec.h>
-#include <linux/unwind.h>
-#include <linux/uaccess.h>
-#include <linux/bug.h>
-#include <linux/kdebug.h>
-
-#if defined(CONFIG_EDAC)
-#include <linux/edac.h>
-#endif
-
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
-#include <asm/debugreg.h>
-#include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/processor.h>
-#include <asm/unwind.h>
-#include <asm/smp.h>
-#include <asm/pgalloc.h>
-#include <asm/pda.h>
-#include <asm/proto.h>
-#include <asm/nmi.h>
-#include <asm/stacktrace.h>
-
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void double_fault(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void reserved(void);
-asmlinkage void alignment_check(void);
-asmlinkage void machine_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-
-static inline void conditional_sti(struct pt_regs *regs)
-{
-       if (regs->eflags & X86_EFLAGS_IF)
-               local_irq_enable();
-}
-
-static inline void preempt_conditional_sti(struct pt_regs *regs)
-{
-       preempt_disable();
-       if (regs->eflags & X86_EFLAGS_IF)
-               local_irq_enable();
-}
-
-static inline void preempt_conditional_cli(struct pt_regs *regs)
-{
-       if (regs->eflags & X86_EFLAGS_IF)
-               local_irq_disable();
-       /* Make sure to not schedule here because we could be running
-          on an exception stack. */
-       preempt_enable_no_resched();
-}
-
-int kstack_depth_to_print = 12;
-
-#ifdef CONFIG_KALLSYMS
-void printk_address(unsigned long address)
-{
-       unsigned long offset = 0, symsize;
-       const char *symname;
-       char *modname;
-       char *delim = ":";
-       char namebuf[128];
-
-       symname = kallsyms_lookup(address, &symsize, &offset,
-                                       &modname, namebuf);
-       if (!symname) {
-               printk(" [<%016lx>]\n", address);
-               return;
-       }
-       if (!modname)
-               modname = delim = "";           
-       printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
-               address, delim, modname, delim, symname, offset, symsize);
-}
-#else
-void printk_address(unsigned long address)
-{
-       printk(" [<%016lx>]\n", address);
-}
-#endif
-
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
-                                       unsigned *usedp, char **idp)
-{
-       static char ids[][8] = {
-               [DEBUG_STACK - 1] = "#DB",
-               [NMI_STACK - 1] = "NMI",
-               [DOUBLEFAULT_STACK - 1] = "#DF",
-               [STACKFAULT_STACK - 1] = "#SS",
-               [MCE_STACK - 1] = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
-               [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
-#endif
-       };
-       unsigned k;
-
-       /*
-        * Iterate over all exception stacks, and figure out whether
-        * 'stack' is in one of them:
-        */
-       for (k = 0; k < N_EXCEPTION_STACKS; k++) {
-               unsigned long end = per_cpu(orig_ist, cpu).ist[k];
-               /*
-                * Is 'stack' above this exception frame's end?
-                * If yes then skip to the next frame.
-                */
-               if (stack >= end)
-                       continue;
-               /*
-                * Is 'stack' above this exception frame's start address?
-                * If yes then we found the right frame.
-                */
-               if (stack >= end - EXCEPTION_STKSZ) {
-                       /*
-                        * Make sure we only iterate through an exception
-                        * stack once. If it comes up for the second time
-                        * then there's something wrong going on - just
-                        * break out and return NULL:
-                        */
-                       if (*usedp & (1U << k))
-                               break;
-                       *usedp |= 1U << k;
-                       *idp = ids[k];
-                       return (unsigned long *)end;
-               }
-               /*
-                * If this is a debug stack, and if it has a larger size than
-                * the usual exception stacks, then 'stack' might still
-                * be within the lower portion of the debug stack:
-                */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
-               if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
-                       unsigned j = N_EXCEPTION_STACKS - 1;
-
-                       /*
-                        * Black magic. A large debug stack is composed of
-                        * multiple exception stack entries, which we
-                        * iterate through now. Dont look:
-                        */
-                       do {
-                               ++j;
-                               end -= EXCEPTION_STKSZ;
-                               ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
-                       } while (stack < end - EXCEPTION_STKSZ);
-                       if (*usedp & (1U << j))
-                               break;
-                       *usedp |= 1U << j;
-                       *idp = ids[j];
-                       return (unsigned long *)end;
-               }
-#endif
-       }
-       return NULL;
-}
-
-#define MSG(txt) ops->warning(data, txt)
-
-/*
- * x86-64 can have upto three kernel stacks: 
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
-{
-       void *t = (void *)tinfo;
-        return p > t && p < t + THREAD_SIZE - 3;
-}
-
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
-               unsigned long *stack,
-               struct stacktrace_ops *ops, void *data)
-{
-       const unsigned cpu = get_cpu();
-       unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
-       unsigned used = 0;
-       struct thread_info *tinfo;
-
-       if (!tsk)
-               tsk = current;
-
-       if (!stack) {
-               unsigned long dummy;
-               stack = &dummy;
-               if (tsk && tsk != current)
-                       stack = (unsigned long *)tsk->thread.rsp;
-       }
-
-       /*
-        * Print function call entries within a stack. 'cond' is the
-        * "end of stackframe" condition, that the 'stack++'
-        * iteration will eventually trigger.
-        */
-#define HANDLE_STACK(cond) \
-       do while (cond) { \
-               unsigned long addr = *stack++; \
-               /* Use unlocked access here because except for NMIs     \
-                  we should be already protected against module unloads */ \
-               if (__kernel_text_address(addr)) { \
-                       /* \
-                        * If the address is either in the text segment of the \
-                        * kernel, or in the region which contains vmalloc'ed \
-                        * memory, it *may* be the address of a calling \
-                        * routine; if so, print it so that someone tracing \
-                        * down the cause of the crash will be able to figure \
-                        * out the call path that was taken. \
-                        */ \
-                       ops->address(data, addr);   \
-               } \
-       } while (0)
-
-       /*
-        * Print function call entries in all stacks, starting at the
-        * current stack address. If the stacks consist of nested
-        * exceptions
-        */
-       for (;;) {
-               char *id;
-               unsigned long *estack_end;
-               estack_end = in_exception_stack(cpu, (unsigned long)stack,
-                                               &used, &id);
-
-               if (estack_end) {
-                       if (ops->stack(data, id) < 0)
-                               break;
-                       HANDLE_STACK (stack < estack_end);
-                       ops->stack(data, "<EOE>");
-                       /*
-                        * We link to the next stack via the
-                        * second-to-last pointer (index -2 to end) in the
-                        * exception stack:
-                        */
-                       stack = (unsigned long *) estack_end[-2];
-                       continue;
-               }
-               if (irqstack_end) {
-                       unsigned long *irqstack;
-                       irqstack = irqstack_end -
-                               (IRQSTACKSIZE - 64) / sizeof(*irqstack);
-
-                       if (stack >= irqstack && stack < irqstack_end) {
-                               if (ops->stack(data, "IRQ") < 0)
-                                       break;
-                               HANDLE_STACK (stack < irqstack_end);
-                               /*
-                                * We link to the next stack (which would be
-                                * the process stack normally) the last
-                                * pointer (index -1 to end) in the IRQ stack:
-                                */
-                               stack = (unsigned long *) (irqstack_end[-1]);
-                               irqstack_end = NULL;
-                               ops->stack(data, "EOI");
-                               continue;
-                       }
-               }
-               break;
-       }
-
-       /*
-        * This handles the process stack:
-        */
-       tinfo = task_thread_info(tsk);
-       HANDLE_STACK (valid_stack_ptr(tinfo, stack));
-#undef HANDLE_STACK
-       put_cpu();
-}
-EXPORT_SYMBOL(dump_trace);
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-       print_symbol(msg, symbol);
-       printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-       printk("%s\n", msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-       printk(" <%s> ", name);
-       return 0;
-}
-
-static void print_trace_address(void *data, unsigned long addr)
-{
-       touch_nmi_watchdog();
-       printk_address(addr);
-}
-
-static struct stacktrace_ops print_trace_ops = {
-       .warning = print_trace_warning,
-       .warning_symbol = print_trace_warning_symbol,
-       .stack = print_trace_stack,
-       .address = print_trace_address,
-};
-
-void
-show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
-{
-       printk("\nCall Trace:\n");
-       dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
-       printk("\n");
-}
-
-static void
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
-{
-       unsigned long *stack;
-       int i;
-       const int cpu = smp_processor_id();
-       unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
-       unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
-
-       // debugging aid: "show_stack(NULL, NULL);" prints the
-       // back trace for this cpu.
-
-       if (rsp == NULL) {
-               if (tsk)
-                       rsp = (unsigned long *)tsk->thread.rsp;
-               else
-                       rsp = (unsigned long *)&rsp;
-       }
-
-       stack = rsp;
-       for(i=0; i < kstack_depth_to_print; i++) {
-               if (stack >= irqstack && stack <= irqstack_end) {
-                       if (stack == irqstack_end) {
-                               stack = (unsigned long *) (irqstack_end[-1]);
-                               printk(" <EOI> ");
-                       }
-               } else {
-               if (((long) stack & (THREAD_SIZE-1)) == 0)
-                       break;
-               }
-               if (i && ((i % 4) == 0))
-                       printk("\n");
-               printk(" %016lx", *stack++);
-               touch_nmi_watchdog();
-       }
-       show_trace(tsk, regs, rsp);
-}
-
-void show_stack(struct task_struct *tsk, unsigned long * rsp)
-{
-       _show_stack(tsk, NULL, rsp);
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
-       unsigned long dummy;
-       show_trace(NULL, NULL, &dummy);
-}
-
-EXPORT_SYMBOL(dump_stack);
-
-void show_registers(struct pt_regs *regs)
-{
-       int i;
-       int in_kernel = !user_mode(regs);
-       unsigned long rsp;
-       const int cpu = smp_processor_id();
-       struct task_struct *cur = cpu_pda(cpu)->pcurrent;
-
-       rsp = regs->rsp;
-       printk("CPU %d ", cpu);
-       __show_regs(regs);
-       printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
-               cur->comm, cur->pid, task_thread_info(cur), cur);
-
-       /*
-        * When in-kernel, we also print out the stack and code at the
-        * time of the fault..
-        */
-       if (in_kernel) {
-               printk("Stack: ");
-               _show_stack(NULL, regs, (unsigned long*)rsp);
-
-               printk("\nCode: ");
-               if (regs->rip < PAGE_OFFSET)
-                       goto bad;
-
-               for (i=0; i<20; i++) {
-                       unsigned char c;
-                       if (__get_user(c, &((unsigned char*)regs->rip)[i])) {
-bad:
-                               printk(" Bad RIP value.");
-                               break;
-                       }
-                       printk("%02x ", c);
-               }
-       }
-       printk("\n");
-}      
-
-int is_valid_bugaddr(unsigned long rip)
-{
-       unsigned short ud2;
-
-       if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
-               return 0;
-
-       return ud2 == 0x0b0f;
-}
-
-#ifdef CONFIG_BUG
-void out_of_line_bug(void)
-{ 
-       BUG(); 
-} 
-EXPORT_SYMBOL(out_of_line_bug);
-#endif
-
-static DEFINE_SPINLOCK(die_lock);
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
-       int cpu;
-       unsigned long flags;
-
-       oops_enter();
-
-       /* racy, but better than risking deadlock. */
-       local_irq_save(flags);
-       cpu = smp_processor_id();
-       if (!spin_trylock(&die_lock)) { 
-               if (cpu == die_owner) 
-                       /* nested oops. should stop eventually */;
-               else
-                       spin_lock(&die_lock);
-       }
-       die_nest_count++;
-       die_owner = cpu;
-       console_verbose();
-       bust_spinlocks(1);
-       return flags;
-}
-
-void __kprobes oops_end(unsigned long flags)
-{ 
-       die_owner = -1;
-       bust_spinlocks(0);
-       die_nest_count--;
-       if (die_nest_count)
-               /* We still own the lock */
-               local_irq_restore(flags);
-       else
-               /* Nest count reaches zero, release the lock. */
-               spin_unlock_irqrestore(&die_lock, flags);
-       if (panic_on_oops)
-               panic("Fatal exception");
-       oops_exit();
-}
-
-void __kprobes __die(const char * str, struct pt_regs * regs, long err)
-{
-       static int die_counter;
-       printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
-#ifdef CONFIG_PREEMPT
-       printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-       printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       printk("DEBUG_PAGEALLOC");
-#endif
-       printk("\n");
-       notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV);
-       show_registers(regs);
-       add_taint(TAINT_DIE);
-       /* Executive summary in case the oops scrolled away */
-       printk(KERN_ALERT "RIP ");
-       printk_address(regs->rip); 
-       printk(" RSP <%016lx>\n", regs->rsp); 
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-}
-
-void die(const char * str, struct pt_regs * regs, long err)
-{
-       unsigned long flags = oops_begin();
-
-       if (!user_mode(regs))
-               report_bug(regs->rip, regs);
-
-       __die(str, regs, err);
-       oops_end(flags);
-       do_exit(SIGSEGV); 
-}
-
-void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-       unsigned long flags = oops_begin();
-
-       /*
-        * We are in trouble anyway, lets at least try
-        * to get a message out.
-        */
-       printk(str, smp_processor_id());
-       show_registers(regs);
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-       if (do_panic || panic_on_oops)
-               panic("Non maskable interrupt");
-       oops_end(flags);
-       nmi_exit();
-       local_irq_enable();
-       do_exit(SIGSEGV);
-}
-
-static void __kprobes do_trap(int trapnr, int signr, char *str,
-                             struct pt_regs * regs, long error_code,
-                             siginfo_t *info)
-{
-       struct task_struct *tsk = current;
-
-       if (user_mode(regs)) {
-               /*
-                * We want error_code and trap_no set for userspace
-                * faults and kernelspace faults which result in
-                * die(), but not kernelspace faults which are fixed
-                * up.  die() gives the process no chance to handle
-                * the signal and notice the kernel fault information,
-                * so that won't result in polluting the information
-                * about previously queued, but not yet delivered,
-                * faults.  See also do_general_protection below.
-                */
-               tsk->thread.error_code = error_code;
-               tsk->thread.trap_no = trapnr;
-
-               if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
-                   printk_ratelimit())
-                       printk(KERN_INFO
-                              "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
-                              tsk->comm, tsk->pid, str,
-                              regs->rip, regs->rsp, error_code); 
-
-               if (info)
-                       force_sig_info(signr, info, tsk);
-               else
-                       force_sig(signr, tsk);
-               return;
-       }
-
-
-       /* kernel trap */ 
-       {            
-               const struct exception_table_entry *fixup;
-               fixup = search_exception_tables(regs->rip);
-               if (fixup)
-                       regs->rip = fixup->fixup;
-               else {
-                       tsk->thread.error_code = error_code;
-                       tsk->thread.trap_no = trapnr;
-                       die(str, regs, error_code);
-               }
-               return;
-       }
-}
-
-#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
-       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-                                                       == NOTIFY_STOP) \
-               return; \
-       conditional_sti(regs);                                          \
-       do_trap(trapnr, signr, str, regs, error_code, NULL); \
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
-{ \
-       siginfo_t info; \
-       info.si_signo = signr; \
-       info.si_errno = 0; \
-       info.si_code = sicode; \
-       info.si_addr = (void __user *)siaddr; \
-       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
-                                                       == NOTIFY_STOP) \
-               return; \
-       conditional_sti(regs);                                          \
-       do_trap(trapnr, signr, str, regs, error_code, &info); \
-}
-
-DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->rip)
-DO_ERROR( 4, SIGSEGV, "overflow", overflow)
-DO_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip)
-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
-DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR(18, SIGSEGV, "reserved", reserved)
-
-/* Runs on IST stack */
-asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
-{
-       if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
-                       12, SIGBUS) == NOTIFY_STOP)
-               return;
-       preempt_conditional_sti(regs);
-       do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
-       preempt_conditional_cli(regs);
-}
-
-asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
-{
-       static const char str[] = "double fault";
-       struct task_struct *tsk = current;
-
-       /* Return not checked because double check cannot be ignored */
-       notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
-
-       tsk->thread.error_code = error_code;
-       tsk->thread.trap_no = 8;
-
-       /* This is always a kernel trap and never fixable (and thus must
-          never return). */
-       for (;;)
-               die(str, regs, error_code);
-}
-
-asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
-                                               long error_code)
-{
-       struct task_struct *tsk = current;
-
-       conditional_sti(regs);
-
-       if (user_mode(regs)) {
-               tsk->thread.error_code = error_code;
-               tsk->thread.trap_no = 13;
-
-               if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-                   printk_ratelimit())
-                       printk(KERN_INFO
-                      "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
-                              tsk->comm, tsk->pid,
-                              regs->rip, regs->rsp, error_code); 
-
-               force_sig(SIGSEGV, tsk);
-               return;
-       } 
-
-       /* kernel gp */
-       {
-               const struct exception_table_entry *fixup;
-               fixup = search_exception_tables(regs->rip);
-               if (fixup) {
-                       regs->rip = fixup->fixup;
-                       return;
-               }
-
-               tsk->thread.error_code = error_code;
-               tsk->thread.trap_no = 13;
-               if (notify_die(DIE_GPF, "general protection fault", regs,
-                                       error_code, 13, SIGSEGV) == NOTIFY_STOP)
-                       return;
-               die("general protection fault", regs, error_code);
-       }
-}
-
-static __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs * regs)
-{
-       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
-               reason);
-       printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
-
-#if defined(CONFIG_EDAC)
-       if(edac_handler_set()) {
-               edac_atomic_assert_error();
-               return;
-       }
-#endif
-
-       if (panic_on_unrecovered_nmi)
-               panic("NMI: Not continuing");
-
-       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
-
-       /* Clear and disable the memory parity error line. */
-       reason = (reason & 0xf) | 4;
-       outb(reason, 0x61);
-}
-
-static __kprobes void
-io_check_error(unsigned char reason, struct pt_regs * regs)
-{
-       printk("NMI: IOCK error (debug interrupt?)\n");
-       show_registers(regs);
-
-       /* Re-enable the IOCK line, wait for a few seconds */
-       reason = (reason & 0xf) | 8;
-       outb(reason, 0x61);
-       mdelay(2000);
-       reason &= ~8;
-       outb(reason, 0x61);
-}
-
-static __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{
-       printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
-               reason);
-       printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
-
-       if (panic_on_unrecovered_nmi)
-               panic("NMI: Not continuing");
-
-       printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
-}
-
-/* Runs on IST stack. This code must keep interrupts off all the time.
-   Nested NMIs are prevented by the CPU. */
-asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
-{
-       unsigned char reason = 0;
-       int cpu;
-
-       cpu = smp_processor_id();
-
-       /* Only the BSP gets external NMIs from the system.  */
-       if (!cpu)
-               reason = get_nmi_reason();
-
-       if (!(reason & 0xc0)) {
-               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
-                                                               == NOTIFY_STOP)
-                       return;
-               /*
-                * Ok, so this is none of the documented NMI sources,
-                * so it must be the NMI watchdog.
-                */
-               if (nmi_watchdog_tick(regs,reason))
-                       return;
-               if (!do_nmi_callback(regs,cpu))
-                       unknown_nmi_error(reason, regs);
-
-               return;
-       }
-       if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
-               return; 
-
-       /* AK: following checks seem to be broken on modern chipsets. FIXME */
-
-       if (reason & 0x80)
-               mem_parity_error(reason, regs);
-       if (reason & 0x40)
-               io_check_error(reason, regs);
-}
-
-/* runs on IST stack. */
-asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
-{
-       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
-               return;
-       }
-       preempt_conditional_sti(regs);
-       do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
-       preempt_conditional_cli(regs);
-}
-
-/* Help handler running on IST stack to switch back to user stack
-   for scheduling or signal handling. The actual stack switch is done in
-   entry.S */
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
-{
-       struct pt_regs *regs = eregs;
-       /* Did already sync */
-       if (eregs == (struct pt_regs *)eregs->rsp)
-               ;
-       /* Exception from user space */
-       else if (user_mode(eregs))
-               regs = task_pt_regs(current);
-       /* Exception from kernel and interrupts are enabled. Move to
-          kernel process stack. */
-       else if (eregs->eflags & X86_EFLAGS_IF)
-               regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs));
-       if (eregs != regs)
-               *regs = *eregs;
-       return regs;
-}
-
-/* runs on IST stack. */
-asmlinkage void __kprobes do_debug(struct pt_regs * regs,
-                                  unsigned long error_code)
-{
-       unsigned long condition;
-       struct task_struct *tsk = current;
-       siginfo_t info;
-
-       get_debugreg(condition, 6);
-
-       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
-                                               SIGTRAP) == NOTIFY_STOP)
-               return;
-
-       preempt_conditional_sti(regs);
-
-       /* Mask out spurious debug traps due to lazy DR7 setting */
-       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-               if (!tsk->thread.debugreg7) { 
-                       goto clear_dr7;
-               }
-       }
-
-       tsk->thread.debugreg6 = condition;
-
-       /* Mask out spurious TF errors due to lazy TF clearing */
-       if (condition & DR_STEP) {
-               /*
-                * The TF error should be masked out only if the current
-                * process is not traced and if the TRAP flag has been set
-                * previously by a tracing process (condition detected by
-                * the PT_DTRACE flag); remember that the i386 TRAP flag
-                * can be modified by the process itself in user mode,
-                * allowing programs to debug themselves without the ptrace()
-                * interface.
-                */
-                if (!user_mode(regs))
-                       goto clear_TF_reenable;
-               /*
-                * Was the TF flag set by a debugger? If so, clear it now,
-                * so that register information is correct.
-                */
-               if (tsk->ptrace & PT_DTRACE) {
-                       regs->eflags &= ~TF_MASK;
-                       tsk->ptrace &= ~PT_DTRACE;
-               }
-       }
-
-       /* Ok, finally something we can handle */
-       tsk->thread.trap_no = 1;
-       tsk->thread.error_code = error_code;
-       info.si_signo = SIGTRAP;
-       info.si_errno = 0;
-       info.si_code = TRAP_BRKPT;
-       info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL;
-       force_sig_info(SIGTRAP, &info, tsk);
-
-clear_dr7:
-       set_debugreg(0UL, 7);
-       preempt_conditional_cli(regs);
-       return;
-
-clear_TF_reenable:
-       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-       regs->eflags &= ~TF_MASK;
-       preempt_conditional_cli(regs);
-}
-
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
-{
-       const struct exception_table_entry *fixup;
-       fixup = search_exception_tables(regs->rip);
-       if (fixup) {
-               regs->rip = fixup->fixup;
-               return 1;
-       }
-       notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
-       /* Illegal floating point operation in the kernel */
-       current->thread.trap_no = trapnr;
-       die(str, regs, 0);
-       return 0;
-}
-
-/*
- * Handler for the x87 math error trap (recorded as trap number 16).
- *
- * A fault taken outside user mode is first offered to
- * kernel_math_error(); if that reports a fixup the handler returns.
- * Otherwise the FPU state of the current task is saved, the unmasked
- * pending x87 exception is translated into a si_code and SIGFPE is
- * forced on the task, with si_addr set to the faulting rip.
- *
- * Note that we play around with the 'TS' bit in an attempt to get
- * the correct behaviour even in the presence of the asynchronous
- * IRQ13 behaviour
- */
-asmlinkage void do_coprocessor_error(struct pt_regs *regs)
-{
-       void __user *rip = (void __user *)(regs->rip);
-       struct task_struct * task;
-       siginfo_t info;
-       unsigned short cwd, swd;
-
-       conditional_sti(regs);
-       if (!user_mode(regs) &&
-           kernel_math_error(regs, "kernel x87 math error", 16))
-               return;
-
-       /*
-        * Save the info for the exception handler and clear the error.
-        */
-       task = current;
-       save_init_fpu(task);
-       task->thread.trap_no = 16;
-       task->thread.error_code = 0;
-       info.si_signo = SIGFPE;
-       info.si_errno = 0;
-       /* Fallback code, kept when no single exception bit is decoded below */
-       info.si_code = __SI_FAULT;
-       info.si_addr = rip;
-       /*
-        * (~cwd & swd) will mask out exceptions that are not set to unmasked
-        * status.  0x3f is the exception bits in these regs, 0x200 is the
-        * C1 reg you need in case of a stack fault, 0x040 is the stack
-        * fault bit.  We should only be taking one exception at a time,
-        * so if this combination doesn't produce any single exception,
-        * then we have a bad program that isn't synchronizing its FPU usage
-        * and it will suffer the consequences since we won't be able to
-        * fully reproduce the context of the exception
-        */
-       cwd = get_fpu_cwd(task);
-       swd = get_fpu_swd(task);
-       switch (swd & ~cwd & 0x3f) {
-               case 0x000:
-               default:
-                       /* none, or more than one, unmasked exception pending */
-                       break;
-               case 0x001: /* Invalid Op */
-                       /*
-                        * swd & 0x240 == 0x040: Stack Underflow
-                        * swd & 0x240 == 0x240: Stack Overflow
-                        * User must clear the SF bit (0x40) if set
-                        */
-                       info.si_code = FPE_FLTINV;
-                       break;
-               case 0x002: /* Denormalize */
-               case 0x010: /* Underflow */
-                       /* denormal operand is reported with the underflow code */
-                       info.si_code = FPE_FLTUND;
-                       break;
-               case 0x004: /* Zero Divide */
-                       info.si_code = FPE_FLTDIV;
-                       break;
-               case 0x008: /* Overflow */
-                       info.si_code = FPE_FLTOVF;
-                       break;
-               case 0x020: /* Precision */
-                       info.si_code = FPE_FLTRES;
-                       break;
-       }
-       force_sig_info(SIGFPE, &info, task);
-}
-
-/*
- * Log a "bad interrupt" message.
- *
- * The message carries an explicit log level and a terminating newline
- * so that it is emitted as a line of its own instead of being glued to
- * whatever printk() output happens to follow.
- */
-asmlinkage void bad_intr(void)
-{
-       printk(KERN_ERR "bad interrupt\n");
-}
-
-/*
- * Handler for the SIMD floating point exception (recorded as trap
- * number 19).
- *
- * A fault taken outside user mode is first offered to
- * kernel_math_error(); if that reports a fixup the handler returns.
- * Otherwise the FPU state of the current task is saved, MXCSR is decoded
- * into a si_code and SIGFPE is forced on the task, with si_addr set to
- * the faulting rip.
- */
-asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
-{
-       void __user *fault_ip = (void __user *)(regs->rip);
-       struct task_struct *tsk;
-       siginfo_t si;
-       unsigned short csr;
-
-       conditional_sti(regs);
-       if (!user_mode(regs)) {
-               if (kernel_math_error(regs, "kernel simd math error", 19))
-                       return;
-       }
-
-       /* Save the info for the exception handler and clear the error. */
-       tsk = current;
-       save_init_fpu(tsk);
-       tsk->thread.trap_no = 19;
-       tsk->thread.error_code = 0;
-
-       si.si_signo = SIGFPE;
-       si.si_errno = 0;
-       si.si_code = __SI_FAULT;
-       si.si_addr = fault_ip;
-
-       /*
-        * MXCSR holds both the exception mask bits (0x1f80) and the
-        * exception flag bits (0x3f).  Shifting the mask bits down by 7
-        * lines them up with the flags, so the inverted mask selects the
-        * flags of unmasked exceptions only.
-        */
-       csr = get_fpu_mxcsr(tsk);
-       switch (~((csr & 0x1f80) >> 7) & (csr & 0x3f)) {
-       case 0x001:     /* Invalid Op */
-               si.si_code = FPE_FLTINV;
-               break;
-       case 0x004:     /* Zero Divide */
-               si.si_code = FPE_FLTDIV;
-               break;
-       case 0x008:     /* Overflow */
-               si.si_code = FPE_FLTOVF;
-               break;
-       case 0x002:     /* Denormalize */
-       case 0x010:     /* Underflow */
-               si.si_code = FPE_FLTUND;
-               break;
-       case 0x020:     /* Precision */
-               si.si_code = FPE_FLTRES;
-               break;
-       case 0x000:
-       default:
-               /* keep the generic __SI_FAULT code */
-               break;
-       }
-
-       force_sig_info(SIGFPE, &si, tsk);
-}
-
-/*
- * Intentionally empty handler: the trap is accepted and ignored.
- * NOTE(review): presumably reached through the spurious_interrupt_bug
- * entry that trap_init() installs on vector 15 - the entry stub is not
- * part of this file, confirm there.
- */
-asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
-{
-}
-
-/*
- * Empty default implementation.  Declared weak, so a non-weak
- * smp_thermal_interrupt() linked in from elsewhere replaces it.
- */
-asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
-{
-}
-
-/*
- * Empty default implementation.  Declared weak, so a non-weak
- * mce_threshold_interrupt() linked in from elsewhere replaces it.
- */
-asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
-{
-}
-
-/*
- *  'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * NOTE(review): the body below only restores (and, on first use,
- * initialises) the FPU state of the current task; no save of a previous
- * owner's state is visible here - the sentence above may be historical.
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- */
-asmlinkage void math_state_restore(void)
-{
-       struct task_struct *me = current;
-       clts();                 /* Allow maths ops (or we recurse) */
-
-       /* First FPU use by this task: set up a fresh FPU state */
-       if (!used_math())
-               init_fpu(me);
-       restore_fpu_checking(&me->thread.i387.fxsave);
-       /* Record that the task's FPU state is now live */
-       task_thread_info(me)->status |= TS_USEDFPU;
-       me->fpu_counter++;
-}
-
-/*
- * Install the handlers for exception vectors 0-19 (vector 18 only with
- * CONFIG_X86_MCE) and, with CONFIG_IA32_EMULATION, the ia32 syscall
- * vector, then call cpu_init().
- *
- * The *_ist variants additionally name a dedicated stack
- * (DEBUG_STACK, NMI_STACK, DOUBLEFAULT_STACK, STACKFAULT_STACK,
- * MCE_STACK) for the vector.
- */
-void __init trap_init(void)
-{
-       set_intr_gate(0,&divide_error);
-       set_intr_gate_ist(1,&debug,DEBUG_STACK);
-       set_intr_gate_ist(2,&nmi,NMI_STACK);
-       set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */
-       set_system_gate(4,&overflow);   /* int4 can be called from all */
-       set_intr_gate(5,&bounds);
-       set_intr_gate(6,&invalid_op);
-       set_intr_gate(7,&device_not_available);
-       set_intr_gate_ist(8,&double_fault, DOUBLEFAULT_STACK);
-       set_intr_gate(9,&coprocessor_segment_overrun);
-       set_intr_gate(10,&invalid_TSS);
-       set_intr_gate(11,&segment_not_present);
-       set_intr_gate_ist(12,&stack_segment,STACKFAULT_STACK);
-       set_intr_gate(13,&general_protection);
-       set_intr_gate(14,&page_fault);
-       set_intr_gate(15,&spurious_interrupt_bug);
-       set_intr_gate(16,&coprocessor_error);
-       set_intr_gate(17,&alignment_check);
-#ifdef CONFIG_X86_MCE
-       set_intr_gate_ist(18,&machine_check, MCE_STACK); 
-#endif
-       set_intr_gate(19,&simd_coprocessor_error);
-
-#ifdef CONFIG_IA32_EMULATION
-       /* system gate, like vectors 3 and 4 above */
-       set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
-#endif
-       
-       /*
-        * Should be a barrier for any external CPU state.
-        */
-       cpu_init();
-}
-
-
-/*
- * Early handler for the "oops" boot parameter.  The value "panic" sets
- * panic_on_oops; any other value is accepted and ignored.  Returns
- * -EINVAL when the parameter was given without a value, 0 otherwise.
- */
-static int __init oops_setup(char *s)
-{
-       if (s == NULL)
-               return -EINVAL;
-
-       if (strcmp(s, "panic") == 0)
-               panic_on_oops = 1;
-
-       return 0;
-}
-early_param("oops", oops_setup);
-
-/*
- * Early handler for the "kstack" boot parameter: stores the numeric
- * value in kstack_depth_to_print.  Returns -EINVAL when the parameter
- * was given without a value, 0 otherwise.
- */
-static int __init kstack_setup(char *s)
-{
-       if (s == NULL)
-               return -EINVAL;
-
-       /* base 0: the number's prefix selects the radix */
-       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-
-       return 0;
-}
-early_param("kstack", kstack_setup);
diff --git a/arch/x86_64/kernel/tsc_64.c b/arch/x86_64/kernel/tsc_64.c
deleted file mode 100644 (file)
index 2a59bde..0000000
+++ /dev/null
@@ -1,207 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/clocksource.h>
-#include <linux/time.h>
-#include <linux/acpi.h>
-#include <linux/cpufreq.h>
-
-#include <asm/timex.h>
-
-/* Set by the "notsc" boot option; init_tsc_clocksource() then skips the TSC */
-static int notsc __initdata = 0;
-
-unsigned int cpu_khz;          /* not used here. NOTE(review): the old comment said "TSC clocks / usec", which does not match a kHz value - confirm the unit */
-EXPORT_SYMBOL(cpu_khz);
-/* TSC frequency in kHz, i.e. TSC cycles per millisecond */
-unsigned int tsc_khz;
-EXPORT_SYMBOL(tsc_khz);
-
-/* Fixed-point (<< NS_SCALE) cycles-to-nanoseconds factor, see set_cyc2ns_scale() */
-static unsigned int cyc2ns_scale __read_mostly;
-
-/*
- * Recompute the cycles -> nanoseconds conversion factor for a clock
- * running at @khz kHz.  The factor is nanoseconds per cycle, scaled up
- * by 2^NS_SCALE so that it can be kept as an integer.
- */
-void set_cyc2ns_scale(unsigned long khz)
-{
-       unsigned long scaled_ns_per_msec = NSEC_PER_MSEC << NS_SCALE;
-
-       cyc2ns_scale = scaled_ns_per_msec / khz;
-}
-
-/*
- * Convert a cycle count to nanoseconds using the fixed-point factor
- * maintained by set_cyc2ns_scale().
- */
-static unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-       unsigned long long scaled = cyc * cyc2ns_scale;
-
-       /* drop the 2^NS_SCALE fixed-point scaling again */
-       return scaled >> NS_SCALE;
-}
-
-/*
- * Scheduler clock: the current TSC reading converted to nanoseconds.
- */
-unsigned long long sched_clock(void)
-{
-       unsigned long tsc = 0;
-
-       /*
-        * Could do CPU core sync here. Opteron can execute rdtsc
-        * speculatively, which means it is not completely exact and may
-        * not be monotonic between CPUs. But the errors should be too
-        * small to matter for scheduling purposes.
-        */
-       rdtscll(tsc);
-
-       return cycles_2_ns(tsc);
-}
-
-/* Non-zero once mark_tsc_unstable() has been called */
-static int tsc_unstable;
-
-/* Returns non-zero if the TSC has been marked unstable */
-inline int check_tsc_unstable(void)
-{
-       return tsc_unstable;
-}
-#ifdef CONFIG_CPU_FREQ
-
-/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
- * changes.
- *
- * RED-PEN: On SMP we assume all CPUs run with the same frequency.  It's
- * not that important because current Opteron setups do not support
- * scaling on SMP anyroads.
- *
- * Should fix up last_tsc too. Currently gettimeofday in the
- * first tick after the change will be slightly wrong.
- */
-
-static unsigned int  ref_freq;
-static unsigned long loops_per_jiffy_ref;
-static unsigned long tsc_khz_ref;
-
-/*
- * cpufreq transition notifier: keep loops_per_jiffy, tsc_khz and the
- * cycles -> ns factor in step with the CPU frequency.
- *
- * @nb:   the registered notifier block (unused)
- * @val:  CPUFREQ_PRECHANGE, CPUFREQ_POSTCHANGE or CPUFREQ_RESUMECHANGE
- * @data: struct cpufreq_freqs describing the transition
- *
- * Nothing is done for CPUs with X86_FEATURE_CONSTANT_TSC.  The first
- * call records the old frequency together with the current
- * loops_per_jiffy and tsc_khz as the reference all later scaling is
- * based on.  Always returns 0.
- */
-static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
-                                void *data)
-{
-       struct cpufreq_freqs *freq = data;
-       /*
-        * dummy absorbs the loops_per_jiffy update when the driver sets
-        * CPUFREQ_CONST_LOOPS; it is initialised because it is read
-        * through lpj when the reference values are recorded below.
-        */
-       unsigned long *lpj, dummy = 0;
-
-       if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
-               return 0;
-
-       lpj = &dummy;
-       if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
-#ifdef CONFIG_SMP
-               lpj = &cpu_data[freq->cpu].loops_per_jiffy;
-#else
-               lpj = &boot_cpu_data.loops_per_jiffy;
-#endif
-       }
-
-       if (!ref_freq) {
-               ref_freq = freq->old;
-               loops_per_jiffy_ref = *lpj;
-               tsc_khz_ref = tsc_khz;
-       }
-
-       /*
-        * Scale before a frequency increase and after a decrease (and on
-        * resume).
-        */
-       if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
-               (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
-               (val == CPUFREQ_RESUMECHANGE)) {
-               *lpj =
-               cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
-
-               tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
-               if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-                       mark_tsc_unstable("cpufreq changes");
-       }
-
-       /*
-        * Use the current (possibly just rescaled) TSC frequency; feeding
-        * in the reference value would leave sched_clock() running at the
-        * boot-time rate after a frequency change.
-        */
-       set_cyc2ns_scale(tsc_khz);
-
-       return 0;
-}
-
-/* Notifier block that routes cpufreq transitions to time_cpufreq_notifier() */
-static struct notifier_block time_cpufreq_notifier_block = {
-       .notifier_call  = time_cpufreq_notifier
-};
-
-/*
- * Register the notifier above for cpufreq transition events.  The result
- * of the registration is not checked; the initcall always returns 0.
- */
-static int __init cpufreq_tsc(void)
-{
-       cpufreq_register_notifier(&time_cpufreq_notifier_block,
-                                 CPUFREQ_TRANSITION_NOTIFIER);
-       return 0;
-}
-
-core_initcall(cpufreq_tsc);
-
-#endif
-
-/*
- * Make an educated guess if the TSC is trustworthy and synchronized
- * over all CPUs.
- *
- * Returns non-zero when the TSC should be treated as unsynchronized:
- * it was already marked unstable, the machine is a clustered APIC box
- * (SMP builds), it is an Intel system whose FADT reports a C3 latency
- * below 1000 (ACPI builds), or it is a non-Intel system with more than
- * one present CPU.
- */
-__cpuinit int unsynchronized_tsc(void)
-{
-       if (tsc_unstable)
-               return 1;
-
-#ifdef CONFIG_SMP
-       if (apic_is_clustered_box())
-               return 1;
-#endif
-       /* Most intel systems have synchronized TSCs except for
-          multi node systems */
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
-#ifdef CONFIG_ACPI
-               /* But TSC doesn't tick in C3 so don't use it there */
-               /* NOTE(review): presumably a latency below 1000 is how the FADT advertises usable C3 - confirm against the ACPI spec */
-               if (acpi_gbl_FADT.header.length > 0 &&
-                   acpi_gbl_FADT.C3latency < 1000)
-                       return 1;
-#endif
-               return 0;
-       }
-
-       /* Assume multi socket systems are not synchronized */
-       return num_present_cpus() > 1;
-}
-
-/*
- * Handler for the "notsc" boot option: sets the notsc flag, which makes
- * init_tsc_clocksource() skip registering the TSC clocksource.  The
- * option's argument is ignored.  Always returns 1.
- */
-int __init notsc_setup(char *s)
-{
-       notsc = 1;
-       return 1;
-}
-
-__setup("notsc", notsc_setup);
-
-
-/* clock source code: */
-
-/* .read callback of clocksource_tsc: return the current TSC value */
-static cycle_t read_tsc(void)
-{
-       return (cycle_t)get_cycles_sync();
-}
-
-/*
- * .vread callback of clocksource_tsc: same as read_tsc(), but marked
- * __vsyscall_fn.
- */
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
-       return (cycle_t)get_cycles_sync();
-}
-
-/*
- * The TSC clocksource.  .mult is left zero here and filled in by
- * init_tsc_clocksource(); mark_tsc_unstable() uses a zero .mult to tell
- * that the clocksource has not been registered yet.
- */
-static struct clocksource clocksource_tsc = {
-       .name                   = "tsc",
-       .rating                 = 300,
-       .read                   = read_tsc,
-       .mask                   = CLOCKSOURCE_MASK(64),
-       .shift                  = 22,
-       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
-                                 CLOCK_SOURCE_MUST_VERIFY,
-       .vread                  = vread_tsc,
-};
-
-/*
- * Mark the TSC as unstable and drop the rating of the TSC clocksource
- * to 0.  @reason is printed in the log message.  Only the first call has
- * any effect.
- */
-void mark_tsc_unstable(char *reason)
-{
-       if (tsc_unstable)
-               return;
-
-       tsc_unstable = 1;
-       printk("Marking TSC unstable due to %s\n", reason);
-
-       /*
-        * .mult is still zero while the clocksource is unregistered; in
-        * that case only the rating field itself is changed.
-        */
-       if (!clocksource_tsc.mult)
-               clocksource_tsc.rating = 0;
-       else
-               clocksource_change_rating(&clocksource_tsc, 0);
-}
-EXPORT_SYMBOL_GPL(mark_tsc_unstable);
-
-/*
- * Register the TSC clocksource unless "notsc" was given.  The multiplier
- * is derived from tsc_khz, and the rating is forced to 0 if the TSC has
- * already been marked unstable.
- */
-void __init init_tsc_clocksource(void)
-{
-       if (notsc)
-               return;
-
-       clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
-                                               clocksource_tsc.shift);
-
-       if (check_tsc_unstable())
-               clocksource_tsc.rating = 0;
-
-       clocksource_register(&clocksource_tsc);
-}
diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c
deleted file mode 100644 (file)
index 355f5f5..0000000
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * arch/x86_64/kernel/tsc_sync.c: check TSC synchronization.
- *
- * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
- *
- * We check whether all boot CPUs have their TSC's synchronized,
- * print a warning if not and turn off the TSC clock-source.
- *
- * The warp-check is point-to-point between two CPUs, the CPU
- * initiating the bootup is the 'source CPU', the freshly booting
- * CPU is the 'target CPU'.
- *
- * Only two CPUs may participate - they can enter in any order.
- * ( The serial nature of the boot logic and the CPU hotplug lock
- *   protects against more than 2 CPUs entering this code. )
- */
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/nmi.h>
-#include <asm/tsc.h>
-
-/*
- * Entry/exit counters that make sure that both CPUs
- * run the measurement code at once:
- */
-static __cpuinitdata atomic_t start_count;
-static __cpuinitdata atomic_t stop_count;
-
-/*
- * We use a raw spinlock in this exceptional case, because
- * we want to have the fastest, inlined, non-debug version
- * of a critical section, to be able to prove TSC time-warps:
- */
-static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static __cpuinitdata cycles_t last_tsc;
-static __cpuinitdata cycles_t max_warp;
-static __cpuinitdata int nr_warps;
-
-/*
- * TSC-warp measurement loop running on both CPUs:
- *
- * Each iteration reads the TSC under sync_lock and compares it with the
- * value stored by the previous reader.  A previous value that is larger
- * than the new one counts as a warp: nr_warps is incremented and
- * max_warp is raised to the largest difference seen.
- */
-static __cpuinit void check_tsc_warp(void)
-{
-       cycles_t start, now, prev, end;
-       int i;
-
-       start = get_cycles_sync();
-       /*
-        * The measurement runs for 20 msecs:
-        */
-       /* tsc_khz is cycles per millisecond */
-       end = start + tsc_khz * 20ULL;
-       now = start;
-
-       for (i = 0; ; i++) {
-               /*
-                * We take the global lock, measure TSC, save the
-                * previous TSC that was measured (possibly on
-                * another CPU) and update the previous TSC timestamp.
-                */
-               __raw_spin_lock(&sync_lock);
-               prev = last_tsc;
-               now = get_cycles_sync();
-               last_tsc = now;
-               __raw_spin_unlock(&sync_lock);
-
-               /*
-                * Be nice every now and then (and also check whether
-                * measurement is done [we also insert a 100 million
-                * loops safety exit, so we dont lock up in case the
-                * TSC readout is totally broken]):
-                */
-               /* only every 8th iteration */
-               if (unlikely(!(i & 7))) {
-                       if (now > end || i > 100000000)
-                               break;
-                       cpu_relax();
-                       touch_nmi_watchdog();
-               }
-               /*
-                * Outside the critical section we can now see whether
-                * we saw a time-warp of the TSC going backwards:
-                */
-               if (unlikely(prev > now)) {
-                       /* the statistics are shared by both CPUs, so retake the lock */
-                       __raw_spin_lock(&sync_lock);
-                       max_warp = max(max_warp, prev - now);
-                       nr_warps++;
-                       __raw_spin_unlock(&sync_lock);
-               }
-
-       }
-}
-
-/*
- * Source CPU calls into this - it waits for the freshly booted
- * target CPU to arrive and then starts the measurement:
- *
- * @cpu is the number of the target CPU; it is only used in the log
- * message.  start_count and stop_count implement the two rendezvous
- * points with check_tsc_sync_target().  If any warp was observed the
- * TSC is marked unstable and the warp statistics are reset.
- */
-void __cpuinit check_tsc_sync_source(int cpu)
-{
-       int cpus = 2;
-
-       /*
-        * No need to check if we already know that the TSC is not
-        * synchronized:
-        */
-       if (unsynchronized_tsc())
-               return;
-
-       printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
-                         smp_processor_id(), cpu);
-
-       /*
-        * Reset it - in case this is a second bootup:
-        */
-       atomic_set(&stop_count, 0);
-
-       /*
-        * Wait for the target to arrive:
-        */
-       while (atomic_read(&start_count) != cpus-1)
-               cpu_relax();
-       /*
-        * Trigger the target to continue into the measurement too:
-        */
-       atomic_inc(&start_count);
-
-       check_tsc_warp();
-
-       /* Wait until the target has finished its measurement loop */
-       while (atomic_read(&stop_count) != cpus-1)
-               cpu_relax();
-
-       /*
-        * Reset it - just in case we boot another CPU later:
-        */
-       atomic_set(&start_count, 0);
-
-       if (nr_warps) {
-               /* terminate the "checking TSC synchronization" line first */
-               printk("\n");
-               printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
-                                   " turning off TSC clock.\n", max_warp);
-               mark_tsc_unstable("check_tsc_sync_source failed");
-               nr_warps = 0;
-               max_warp = 0;
-               last_tsc = 0;
-       } else {
-               printk(" passed.\n");
-       }
-
-       /*
-        * Let the target continue with the bootup:
-        */
-       atomic_inc(&stop_count);
-}
-
-/*
- * Freshly booted CPUs call into this:
- *
- * Counterpart of check_tsc_sync_source().  It announces itself through
- * start_count, runs the same warp measurement and then waits on
- * stop_count until the source CPU has reported the result.
- */
-void __cpuinit check_tsc_sync_target(void)
-{
-       int cpus = 2;
-
-       /* Same early exit as on the source side */
-       if (unsynchronized_tsc())
-               return;
-
-       /*
-        * Register this CPU's participation and wait for the
-        * source CPU to start the measurement:
-        */
-       atomic_inc(&start_count);
-       while (atomic_read(&start_count) != cpus)
-               cpu_relax();
-
-       check_tsc_warp();
-
-       /*
-        * Ok, we are done:
-        */
-       atomic_inc(&stop_count);
-
-       /*
-        * Wait for the source CPU to print stuff:
-        */
-       while (atomic_read(&stop_count) != cpus)
-               cpu_relax();
-}
-#undef NR_LOOPS
-
diff --git a/arch/x86_64/kernel/verify_cpu_64.S b/arch/x86_64/kernel/verify_cpu_64.S
deleted file mode 100644 (file)
index 45b6f8a..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- *
- *     verify_cpu.S - Code for cpu long mode and SSE verification. This
- *     code has been borrowed from boot/setup.S and was introduced by
- *     Andi Kleen.
- *
- *     Copyright (c) 2007  Andi Kleen (ak@suse.de)
- *     Copyright (c) 2007  Eric Biederman (ebiederm@xmission.com)
- *     Copyright (c) 2007  Vivek Goyal (vgoyal@in.ibm.com)
- *
- *     This source code is licensed under the GNU General Public License,
- *     Version 2.  See the file COPYING for more details.
- *
- *     This is a common code for verification whether CPU supports
- *     long mode and SSE or not. It is not called directly instead this
- *     file is included at various places and compiled in that context.
- *     Following are the current usage.
- *
- *     This file is included by both 16bit and 32bit code.
- *
- *     arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
- *     arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
- *     arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit)
- *     arch/x86_64/kernel/acpi/wakeup.S:Verification at resume (16bit)
- *
- *     verify_cpu, returns the status of cpu check in register %eax.
- *             0: Success    1: Failure
- *
- *     The caller needs to check for the error code and take the action
- *     appropriately. Either display a message or halt.
- */
-
-#include <asm/cpufeature.h>
-
-/*
- * verify_cpu: check for cpuid, the REQUIRED_MASK0/REQUIRED_MASK1
- * feature bits and SSE_MASK.  Returns 0 in %eax on success and 1 on
- * failure.  The caller's flags are saved on entry and restored on both
- * exits.  %di is used as an "is AMD" flag; %eax, %ebx, %ecx, %edx and
- * %di are overwritten.
- */
-verify_cpu:
-       pushfl                          # Save caller passed flags
-       pushl   $0                      # Kill any dangerous flags
-       popfl
-
-       /* Try to toggle flags bit 0x200000; no change means no cpuid */
-       pushfl                          # standard way to check for cpuid
-       popl    %eax
-       movl    %eax,%ebx
-       xorl    $0x200000,%eax
-       pushl   %eax
-       popfl
-       pushfl
-       popl    %eax
-       cmpl    %eax,%ebx
-       jz      verify_cpu_no_longmode  # cpu has no cpuid
-
-       movl    $0x0,%eax               # See if cpuid 1 is implemented
-       cpuid
-       cmpl    $0x1,%eax
-       jb      verify_cpu_no_longmode  # no cpuid 1
-
-       /* Compare the vendor string from cpuid 0 (%ebx, %edx, %ecx) */
-       xor     %di,%di
-       cmpl    $0x68747541,%ebx        # AuthenticAMD
-       jnz     verify_cpu_noamd
-       cmpl    $0x69746e65,%edx
-       jnz     verify_cpu_noamd
-       cmpl    $0x444d4163,%ecx
-       jnz     verify_cpu_noamd
-       mov     $1,%di                  # cpu is from AMD
-
-verify_cpu_noamd:
-       /* and+xor leaves zero only if every REQUIRED_MASK0 bit is set */
-       movl    $0x1,%eax               # Does the cpu have what it takes
-       cpuid
-       andl    $REQUIRED_MASK0,%edx
-       xorl    $REQUIRED_MASK0,%edx
-       jnz     verify_cpu_no_longmode
-
-       movl    $0x80000000,%eax        # See if extended cpuid is implemented
-       cpuid
-       cmpl    $0x80000001,%eax
-       jb      verify_cpu_no_longmode  # no extended cpuid
-
-       movl    $0x80000001,%eax        # Does the cpu have what it takes
-       cpuid
-       andl    $REQUIRED_MASK1,%edx
-       xorl    $REQUIRED_MASK1,%edx
-       jnz     verify_cpu_no_longmode
-
-verify_cpu_sse_test:
-       movl    $1,%eax
-       cpuid
-       andl    $SSE_MASK,%edx
-       cmpl    $SSE_MASK,%edx
-       je      verify_cpu_sse_ok
-       test    %di,%di
-       jz      verify_cpu_no_longmode  # only try to force SSE on AMD
-       /* Clear bit 15 of the HWCR MSR and test once more */
-       movl    $0xc0010015,%ecx        # HWCR
-       rdmsr
-       btr     $15,%eax                # enable SSE
-       wrmsr
-       xor     %di,%di                 # don't loop
-       jmp     verify_cpu_sse_test     # try again
-
-/* Failure exit: %eax = 1 */
-verify_cpu_no_longmode:
-       popfl                           # Restore caller passed flags
-       movl $1,%eax
-       ret
-/* Success exit: %eax = 0 */
-verify_cpu_sse_ok:
-       popfl                           # Restore caller passed flags
-       xorl %eax, %eax
-       ret
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
deleted file mode 100644 (file)
index 849ee61..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifdef CONFIG_X86_32
-# include "vmlinux_32.lds.S"
-#else
-# include "vmlinux_64.lds.S"
-#endif
diff --git a/arch/x86_64/kernel/vmlinux_64.lds.S b/arch/x86_64/kernel/vmlinux_64.lds.S
deleted file mode 100644 (file)
index ba8ea97..0000000
+++ /dev/null
@@ -1,235 +0,0 @@
-/* ld script to make x86-64 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- */
-
-#define LOAD_OFFSET __START_KERNEL_map
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/page.h>
-
-#undef i386    /* in case the preprocessor is a 32bit one */
-
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
-_proxy_pda = 1;
-PHDRS {
-       text PT_LOAD FLAGS(5);  /* R_E */
-       data PT_LOAD FLAGS(7);  /* RWE */
-       user PT_LOAD FLAGS(7);  /* RWE */
-       data.init PT_LOAD FLAGS(7);     /* RWE */
-       note PT_NOTE FLAGS(4);  /* R__ */
-}
-SECTIONS
-{
-  . = __START_KERNEL;
-  phys_startup_64 = startup_64 - LOAD_OFFSET;
-  _text = .;                   /* Text and read-only data */
-  .text :  AT(ADDR(.text) - LOAD_OFFSET) {
-       /* First the code that has to be first for bootstrapping */
-       *(.text.head)
-       _stext = .;
-       /* Then the rest */
-       TEXT_TEXT
-       SCHED_TEXT
-       LOCK_TEXT
-       KPROBES_TEXT
-       *(.fixup)
-       *(.gnu.warning)
-       } :text = 0x9090
-                               /* out-of-line lock text */
-  .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
-
-  _etext = .;                  /* End of text section */
-
-  . = ALIGN(16);               /* Exception table */
-  __start___ex_table = .;
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
-  __stop___ex_table = .;
-
-  NOTES :text :note
-
-  BUG_TABLE :text
-
-  RODATA
-
-  . = ALIGN(4);
-  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
-       __tracedata_start = .;
-       *(.tracedata)
-       __tracedata_end = .;
-  }
-
-  . = ALIGN(PAGE_SIZE);        /* Align data segment to page size boundary */
-                               /* Data */
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {
-       DATA_DATA
-       CONSTRUCTORS
-       } :data
-
-  _edata = .;                  /* End of data section */
-
-  . = ALIGN(PAGE_SIZE);
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-       *(.data.cacheline_aligned)
-  }
-  . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-       *(.data.read_mostly)
-  }
-
-/*
- * Address helpers for the vsyscall sections:
- *
- *  VSYSCALL_ADDR       link address of the vsyscall area (-10 MB)
- *  VSYSCALL_PHYS_ADDR  load address of the area: the end of
- *                      .data.read_mostly, rounded up to 4096 bytes
- *  VSYSCALL_VIRT_ADDR  the same spot expressed as a regular kernel
- *                      virtual address
- *  VLOAD(x)            load address of vsyscall section x
- *  VVIRT(x)            regular kernel virtual address of vsyscall
- *                      section x
- */
-#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
-  . = VSYSCALL_ADDR;
-  .vsyscall_0 :         AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
-  __vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
-               { *(.vsyscall_gtod_data) }
-  vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
-  .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
-               { *(.vsyscall_clock) }
-  vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
-  .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
-               { *(.vsyscall_1) }
-  .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
-               { *(.vsyscall_2) }
-
-  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
-  vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
-  jiffies = VVIRT(.jiffies);
-
-  .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
-               { *(.vsyscall_3) }
-
-  . = VSYSCALL_VIRT_ADDR + 4096;
-
-#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
-  . = ALIGN(8192);             /* init_task */
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-       *(.data.init_task)
-  }:data.init
-
-  . = ALIGN(4096);
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-       *(.data.page_aligned)
-  }
-
-  /* might get freed after init */
-  . = ALIGN(4096);
-  __smp_alt_begin = .;
-  __smp_locks = .;
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-       *(.smp_locks)
-  }
-  __smp_locks_end = .;
-  . = ALIGN(4096);
-  __smp_alt_end = .;
-
-  . = ALIGN(4096);             /* Init code and data */
-  __init_begin = .;
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-       _sinittext = .;
-       *(.init.text)
-       _einittext = .;
-  }
-  __initdata_begin = .;
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
-  __initdata_end = .;
-  . = ALIGN(16);
-  __setup_start = .;
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
-  __setup_end = .;
-  __initcall_start = .;
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-       INITCALLS
-  }
-  __initcall_end = .;
-  __con_initcall_start = .;
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-       *(.con_initcall.init)
-  }
-  __con_initcall_end = .;
-  SECURITY_INIT
-  . = ALIGN(8);
-  __alt_instructions = .;
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-       *(.altinstructions)
-  }
-  __alt_instructions_end = .; 
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-       *(.altinstr_replacement)
-  }
-  /* .exit.text is discarded at runtime, not link time, to deal with
-     references from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
-
-/* vdso blob that is mapped into user space */
-  vdso_start = . ;
-  .vdso  : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
-  . = ALIGN(4096);
-  vdso_end = .;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-  . = ALIGN(4096);
-  __initramfs_start = .;
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
-  __initramfs_end = .;
-#endif
-
-  PERCPU(4096)
-
-  . = ALIGN(4096);
-  __init_end = .;
-
-  . = ALIGN(4096);
-  __nosave_begin = .;
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
-  . = ALIGN(4096);
-  __nosave_end = .;
-
-  __bss_start = .;             /* BSS */
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-       *(.bss.page_aligned)
-       *(.bss)
-       }
-  __bss_stop = .;
-
-  _end = . ;
-
-  /* Sections to be discarded */
-  /DISCARD/ : {
-       *(.exitcall.exit)
-       *(.eh_frame)
-       }
-
-  STABS_DEBUG
-
-  DWARF_DEBUG
-}
diff --git a/arch/x86_64/kernel/vsmp_64.c b/arch/x86_64/kernel/vsmp_64.c
deleted file mode 100644 (file)
index 414caf0..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * vSMPowered(tm) systems specific initialization
- * Copyright (C) 2005 ScaleMP Inc.
- *
- * Use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
- * Ravikiran Thirumalai <kiran@scalemp.com>,
- * Shai Fultheim <shai@scalemp.com>
- */
-
-#include <linux/init.h>
-#include <linux/pci_ids.h>
-#include <linux/pci_regs.h>
-#include <asm/pci-direct.h>
-#include <asm/io.h>
-
-/*
- * Detect a ScaleMP vSMP box and switch on its IRQ fastpath handling.
- *
- * Does nothing when early PCI access is not allowed or when device
- * 0:1f.0 is not the ScaleMP vSMP control device.  Otherwise the
- * capability and control words behind BAR 0 are read and, if bit 4 is
- * set in both, the bit is cleared in the control word.
- *
- * Always returns 0 so that the initcall never reports a failure; a
- * failed mapping of the control registers is logged and skipped.
- */
-static int __init vsmp_init(void)
-{
-       void __iomem *address;
-       unsigned int cap, ctl;
-
-       if (!early_pci_allowed())
-               return 0;
-
-       /* Check if we are running on a ScaleMP vSMP box */
-       if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
-           (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
-               return 0;
-
-       /* set vSMP magic bits to indicate vSMP capable kernel */
-       address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
-       if (!address) {
-               /* Do not touch the registers through a failed mapping */
-               printk(KERN_ERR "vSMP CTL: unable to map control registers\n");
-               return 0;
-       }
-
-       cap = readl(address);
-       ctl = readl(address + 4);
-       printk("vSMP CTL: capabilities:0x%08x  control:0x%08x\n", cap, ctl);
-       if (cap & ctl & (1 << 4)) {
-               /* Turn on vSMP IRQ fastpath handling (see system.h) */
-               ctl &= ~(1 << 4);
-               writel(ctl, address + 4);
-               /* read back so that the value actually in effect is logged */
-               ctl = readl(address + 4);
-               printk("vSMP CTL: control set to:0x%08x\n", ctl);
-       }
-
-       iounmap(address);
-       return 0;
-}
-
-core_initcall(vsmp_init);
diff --git a/arch/x86_64/kernel/vsyscall_64.c b/arch/x86_64/kernel/vsyscall_64.c
deleted file mode 100644 (file)
index 06c3494..0000000
+++ /dev/null
@@ -1,349 +0,0 @@
-/*
- *  linux/arch/x86_64/kernel/vsyscall.c
- *
- *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
- *  Copyright 2003 Andi Kleen, SuSE Labs.
- *
- *  Thanks to hpa@transmeta.com for some useful hint.
- *  Special thanks to Ingo Molnar for his early experience with
- *  a different vsyscall implementation for Linux/IA32 and for the name.
- *
- *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
- *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
- *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
- *  jumping out of line if necessary. We cannot add more with this
- *  mechanism because older kernels won't return -ENOSYS.
- *  If we want more than four we need a vDSO.
- *
- *  Note: the concept clashes with user mode linux. If you use UML and
- *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
- */
-
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/seqlock.h>
-#include <linux/jiffies.h>
-#include <linux/sysctl.h>
-#include <linux/clocksource.h>
-#include <linux/getcpu.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/notifier.h>
-
-#include <asm/vsyscall.h>
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/unistd.h>
-#include <asm/fixmap.h>
-#include <asm/errno.h>
-#include <asm/io.h>
-#include <asm/segment.h>
-#include <asm/desc.h>
-#include <asm/topology.h>
-#include <asm/vgtod.h>
-
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
-#define __syscall_clobber "r11","rcx","memory"
-#define __pa_vsymbol(x)                        \
-       ({unsigned long v;              \
-       extern char __vsyscall_0;       \
-         asm("" : "=r" (v) : "0" (x)); \
-         ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
-
-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache line ping pongs
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
-
-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
-{
-       .lock = SEQLOCK_UNLOCKED,
-       .sysctl_enabled = 1,
-};
-
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
-{
-       unsigned long flags;
-
-       write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
-       /* copy vsyscall data */
-       vsyscall_gtod_data.clock.vread = clock->vread;
-       vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
-       vsyscall_gtod_data.clock.mask = clock->mask;
-       vsyscall_gtod_data.clock.mult = clock->mult;
-       vsyscall_gtod_data.clock.shift = clock->shift;
-       vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
-       vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-       vsyscall_gtod_data.sys_tz = sys_tz;
-       vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-       vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
-       write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
-}
-
-/* RED-PEN may want to readd seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
-{
-       *tz = __vsyscall_gtod_data.sys_tz;
-}
-
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
-       int ret;
-       asm volatile("vsysc2: syscall"
-               : "=a" (ret)
-               : "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
-               : __syscall_clobber );
-       return ret;
-}
-
-static __always_inline long time_syscall(long *t)
-{
-       long secs;
-       asm volatile("vsysc1: syscall"
-               : "=a" (secs)
-               : "0" (__NR_time),"D" (t) : __syscall_clobber);
-       return secs;
-}
-
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
-       cycle_t now, base, mask, cycle_delta;
-       unsigned seq;
-       unsigned long mult, shift, nsec;
-       cycle_t (*vread)(void);
-       do {
-               seq = read_seqbegin(&__vsyscall_gtod_data.lock);
-
-               vread = __vsyscall_gtod_data.clock.vread;
-               if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
-                       gettimeofday(tv,NULL);
-                       return;
-               }
-               now = vread();
-               base = __vsyscall_gtod_data.clock.cycle_last;
-               mask = __vsyscall_gtod_data.clock.mask;
-               mult = __vsyscall_gtod_data.clock.mult;
-               shift = __vsyscall_gtod_data.clock.shift;
-
-               tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
-               nsec = __vsyscall_gtod_data.wall_time_nsec;
-       } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
-
-       /* calculate interval: */
-       cycle_delta = (now - base) & mask;
-       /* convert to nsecs: */
-       nsec += (cycle_delta * mult) >> shift;
-
-       while (nsec >= NSEC_PER_SEC) {
-               tv->tv_sec += 1;
-               nsec -= NSEC_PER_SEC;
-       }
-       tv->tv_usec = nsec / NSEC_PER_USEC;
-}
-
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
-{
-       if (tv)
-               do_vgettimeofday(tv);
-       if (tz)
-               do_get_tz(tz);
-       return 0;
-}
-
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
-       struct timeval tv;
-       time_t result;
-       if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
-               return time_syscall(t);
-
-       vgettimeofday(&tv, 0);
-       result = tv.tv_sec;
-       if (t)
-               *t = result;
-       return result;
-}
-
-/* Fast way to get current CPU and node.
-   This helps to do per node and per CPU caches in user space.
-   The result is not guaranteed without CPU affinity, but usually
-   works out because the scheduler tries to keep a thread on the same
-   CPU.
-
-   tcache must point to a two element sized long array.
-   All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
-       unsigned int dummy, p;
-       unsigned long j = 0;
-
-       /* Fast cache - only recompute value once per jiffies and avoid
-          relatively costly rdtscp/cpuid otherwise.
-          This works because the scheduler usually keeps the process
-          on the same CPU and this syscall doesn't guarantee its
-          results anyways.
-          We do this here because otherwise user space would do it on
-          its own in a likely inferior way (no access to jiffies).
-          If you don't like it pass NULL. */
-       if (tcache && tcache->blob[0] == (j = __jiffies)) {
-               p = tcache->blob[1];
-       } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
-               /* Load per CPU data from RDTSCP */
-               rdtscp(dummy, dummy, p);
-       } else {
-               /* Load per CPU data from GDT */
-               asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-       }
-       if (tcache) {
-               tcache->blob[0] = j;
-               tcache->blob[1] = p;
-       }
-       if (cpu)
-               *cpu = p & 0xfff;
-       if (node)
-               *node = p >> 12;
-       return 0;
-}
-
-long __vsyscall(3) venosys_1(void)
-{
-       return -ENOSYS;
-}
-
-#ifdef CONFIG_SYSCTL
-
-#define SYSCALL 0x050f
-#define NOP2    0x9090
-
-/*
- * NOP out syscall in vsyscall page when not needed.
- */
-static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
-                        void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-       extern u16 vsysc1, vsysc2;
-       u16 __iomem *map1;
-       u16 __iomem *map2;
-       int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-       if (!write)
-               return ret;
-       /* gcc has some trouble with __va(__pa()), so just do it this
-          way. */
-       map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
-       if (!map1)
-               return -ENOMEM;
-       map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
-       if (!map2) {
-               ret = -ENOMEM;
-               goto out;
-       }
-       if (!vsyscall_gtod_data.sysctl_enabled) {
-               writew(SYSCALL, map1);
-               writew(SYSCALL, map2);
-       } else {
-               writew(NOP2, map1);
-               writew(NOP2, map2);
-       }
-       iounmap(map2);
-out:
-       iounmap(map1);
-       return ret;
-}
-
-static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
-                               void __user *oldval, size_t __user *oldlenp,
-                               void __user *newval, size_t newlen)
-{
-       return -ENOSYS;
-}
-
-static ctl_table kernel_table2[] = {
-       { .ctl_name = 99, .procname = "vsyscall64",
-         .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
-         .mode = 0644,
-         .strategy = vsyscall_sysctl_nostrat,
-         .proc_handler = vsyscall_sysctl_change },
-       {}
-};
-
-static ctl_table kernel_root_table2[] = {
-       { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
-         .child = kernel_table2 },
-       {}
-};
-
-#endif
-
-/* Assume __initcall executes before all user space. Hopefully kmod
-   doesn't violate that. We'll find out if it does. */
-static void __cpuinit vsyscall_set_cpu(int cpu)
-{
-       unsigned long *d;
-       unsigned long node = 0;
-#ifdef CONFIG_NUMA
-       node = cpu_to_node[cpu];
-#endif
-       if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
-               write_rdtscp_aux((node << 12) | cpu);
-
-       /* Store cpu number in limit so that it can be loaded quickly
-          in user space in vgetcpu.
-          12 bits for the CPU and 8 bits for the node. */
-       d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
-       *d = 0x0f40000000000ULL;
-       *d |= cpu;
-       *d |= (node & 0xf) << 12;
-       *d |= (node >> 4) << 48;
-}
-
-static void __cpuinit cpu_vsyscall_init(void *arg)
-{
-       /* preemption should be already off */
-       vsyscall_set_cpu(raw_smp_processor_id());
-}
-
-static int __cpuinit
-cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
-{
-       long cpu = (long)arg;
-       if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
-               smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
-       return NOTIFY_DONE;
-}
-
-static void __init map_vsyscall(void)
-{
-       extern char __vsyscall_0;
-       unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
-
-       /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
-       __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
-}
-
-static int __init vsyscall_init(void)
-{
-       BUG_ON(((unsigned long) &vgettimeofday !=
-                       VSYSCALL_ADDR(__NR_vgettimeofday)));
-       BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
-       BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
-       BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-       map_vsyscall();
-#ifdef CONFIG_SYSCTL
-       register_sysctl_table(kernel_root_table2);
-#endif
-       on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
-       hotcpu_notifier(cpu_vsyscall_notifier, 0);
-       return 0;
-}
-
-__initcall(vsyscall_init);
diff --git a/arch/x86_64/kernel/x8664_ksyms_64.c b/arch/x86_64/kernel/x8664_ksyms_64.c
deleted file mode 100644 (file)
index 77c25b3..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Exports for assembly files.
-   All C exports should go in the respective C files. */
-
-#include <linux/module.h>
-#include <linux/smp.h>
-
-#include <asm/semaphore.h>
-#include <asm/processor.h>
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-
-EXPORT_SYMBOL(kernel_thread);
-
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
-
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-EXPORT_SYMBOL(__get_user_8);
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
-
-EXPORT_SYMBOL(copy_user_generic);
-EXPORT_SYMBOL(__copy_user_nocache);
-EXPORT_SYMBOL(copy_from_user);
-EXPORT_SYMBOL(copy_to_user);
-EXPORT_SYMBOL(__copy_from_user_inatomic);
-
-EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
-
-#ifdef CONFIG_SMP
-extern void  __write_lock_failed(rwlock_t *rw);
-extern void  __read_lock_failed(rwlock_t *rw);
-EXPORT_SYMBOL(__write_lock_failed);
-EXPORT_SYMBOL(__read_lock_failed);
-#endif
-
-/* Export string functions. We normally rely on gcc builtin for most of these,
-   but gcc sometimes decides not to inline them. */    
-#undef memcpy
-#undef memset
-#undef memmove
-
-extern void * memset(void *,int,__kernel_size_t);
-extern void * memcpy(void *,const void *,__kernel_size_t);
-extern void * __memcpy(void *,const void *,__kernel_size_t);
-
-EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__memcpy);
-
-EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(init_level4_pgt);
-EXPORT_SYMBOL(load_gs_index);
-
-EXPORT_SYMBOL(_proxy_pda);