Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 21 Oct 2010 20:47:29 +0000 (13:47 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 21 Oct 2010 20:47:29 +0000 (13:47 -0700)
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86-32, percpu: Correct the ordering of the percpu readmostly section
  x86, mm: Enable ARCH_DMA_ADDR_T_64BIT with X86_64 || HIGHMEM64G
  x86: Spread tlb flush vector between nodes
  percpu: Introduce a read-mostly percpu API
  x86, mm: Fix incorrect data type in vmalloc_sync_all()
  x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
  x86, mm: Fix bogus whitespace in sync_global_pgds()
  x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation
  x86, mm: Add RESERVE_BRK_ARRAY() helper
  mm, x86: Saving vmcore with non-lazy freeing of vmas
  x86, kdump: Change copy_oldmem_page() to use cached addressing
  x86, mm: fix uninitialized addr in kernel_physical_mapping_init()
  x86, kmemcheck: Remove double test
  x86, mm: Make spurious_fault check explicitly check the PRESENT bit
  x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes
  x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions
  x86, mm: Avoid unnecessary TLB flush
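
Note: the RESERVE_BRK_ARRAY() helper from the commit above does not appear in
the combined diff below.  A hedged sketch of how it is expected to look in
arch/x86/include/asm/setup.h, reconstructed from memory (verify against the
tree; "early_table" is a made-up example name):

    /* Declare a pointer plus a matching brk reservation sized for it. */
    #define RESERVE_BRK_ARRAY(type, name, entries)		\
    	type *name;					\
    	RESERVE_BRK(name, sizeof(type) * (entries))

    /* Typical use: reserve brk space for 256 entries. */
    RESERVE_BRK_ARRAY(unsigned long, early_table, 256);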

arch/x86/Kconfig
arch/x86/mm/fault.c
arch/x86/mm/init_64.c
include/asm-generic/vmlinux.lds.h
mm/memory.c

diff --combined arch/x86/Kconfig
index b8676498d8df407fb89f8371204ffb387550c088,2924f4e77791e91fc432d4c9c53c64bcd80b68c1..8e9c4d4772fb7d724189812130c640abc80d50a9
@@@ -25,7 -25,6 +25,7 @@@ config X8
        select HAVE_IDE
        select HAVE_OPROFILE
        select HAVE_PERF_EVENTS if (!M386 && !M486)
 +      select HAVE_IRQ_WORK
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select ARCH_WANT_OPTIONAL_GPIOLIB
@@@ -34,7 -33,6 +34,7 @@@
        select HAVE_KRETPROBES
        select HAVE_OPTPROBES
        select HAVE_FTRACE_MCOUNT_RECORD
 +      select HAVE_C_RECORDMCOUNT
        select HAVE_DYNAMIC_FTRACE
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_GRAPH_TRACER
@@@ -61,8 -59,6 +61,8 @@@
        select ANON_INODES
        select HAVE_ARCH_KMEMCHECK
        select HAVE_USER_RETURN_NOTIFIER
 +      select HAVE_ARCH_JUMP_LABEL
 +      select HAVE_TEXT_POKE_SMP
  
  config INSTRUCTION_DECODER
        def_bool (KPROBES || PERF_EVENTS)
@@@ -674,7 -670,7 +674,7 @@@ config GART_IOMM
        bool "GART IOMMU support" if EMBEDDED
        default y
        select SWIOTLB
 -      depends on X86_64 && PCI && K8_NB
 +      depends on X86_64 && PCI && AMD_NB
        ---help---
          Support for full DMA access of devices with 32bit memory access only
          on systems with more than 3GB. This is usually needed for USB,
@@@ -799,17 -795,6 +799,17 @@@ config SCHED_M
          making when dealing with multi-core CPU chips at a cost of slightly
          increased overhead in some places. If unsure say N here.
  
 +config IRQ_TIME_ACCOUNTING
 +      bool "Fine granularity task level IRQ time accounting"
 +      default n
 +      ---help---
 +        Select this option to enable fine granularity task irq time
 +        accounting. This is done by reading a timestamp on each
 +        transitions between softirq and hardirq state, so there can be a
 +        small performance impact.
 +
 +        If in doubt, say N here.
 +
  source "kernel/Kconfig.preempt"
  
  config X86_UP_APIC
@@@ -1163,6 -1148,9 +1163,9 @@@ config X86_PA
  config ARCH_PHYS_ADDR_T_64BIT
        def_bool X86_64 || X86_PAE
  
+ config ARCH_DMA_ADDR_T_64BIT
+       def_bool X86_64 || HIGHMEM64G
  config DIRECT_GBPAGES
        bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
        default y
@@@ -1341,34 -1329,25 +1344,34 @@@ config X86_BOOTPARAM_MEMORY_CORRUPTION_
          Set whether the default state of memory_corruption_check is
          on or off.
  
 -config X86_RESERVE_LOW_64K
 -      bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
 -      default y
 +config X86_RESERVE_LOW
 +      int "Amount of low memory, in kilobytes, to reserve for the BIOS"
 +      default 64
 +      range 4 640
        ---help---
 -        Reserve the first 64K of physical RAM on BIOSes that are known
 -        to potentially corrupt that memory range. A numbers of BIOSes are
 -        known to utilize this area during suspend/resume, so it must not
 -        be used by the kernel.
 +        Specify the amount of low memory to reserve for the BIOS.
  
 -        Set this to N if you are absolutely sure that you trust the BIOS
 -        to get all its memory reservations and usages right.
 +        The first page contains BIOS data structures that the kernel
 +        must not use, so that page must always be reserved.
  
 -        If you have doubts about the BIOS (e.g. suspend/resume does not
 -        work or there's kernel crashes after certain hardware hotplug
 -        events) and it's not AMI or Phoenix, then you might want to enable
 -        X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
 -        corruption patterns.
 +        By default we reserve the first 64K of physical RAM, as a
 +        number of BIOSes are known to corrupt that memory range
 +        during events such as suspend/resume or monitor cable
 +        insertion, so it must not be used by the kernel.
  
 -        Say Y if unsure.
 +        You can set this to 4 if you are absolutely sure that you
 +        trust the BIOS to get all its memory reservations and usages
 +        right.  If you know your BIOS have problems beyond the
 +        default 64K area, you can set this to 640 to avoid using the
 +        entire low memory range.
 +
 +        If you have doubts about the BIOS (e.g. suspend/resume does
 +        not work or there's kernel crashes after certain hardware
 +        hotplug events) then you might want to enable
 +        X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
 +        typical corruption patterns.
 +
 +        Leave this to the default value of 64 if you are unsure.
  
  config MATH_EMULATION
        bool
@@@ -2100,7 -2079,7 +2103,7 @@@ config OLPC_OPENFIRMWAR
  
  endif # X86_32
  
 -config K8_NB
 +config AMD_NB
        def_bool y
        depends on CPU_SUP_AMD && PCI
  
@@@ -2149,10 -2128,6 +2152,10 @@@ config HAVE_ATOMIC_IOMA
        def_bool y
        depends on X86_32
  
 +config HAVE_TEXT_POKE_SMP
 +      bool
 +      select STOP_MACHINE if SMP
 +
  source "net/Kconfig"
  
  source "drivers/Kconfig"
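
(Not part of this diff: the generic consumer of the ARCH_DMA_ADDR_T_64BIT
option added above.  A minimal sketch, assuming the <linux/types.h> layout of
this era; verify against the tree:)

    #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
    typedef u64 dma_addr_t;	/* 64-bit bus addresses with X86_64 or HIGHMEM64G */
    #else
    typedef u32 dma_addr_t;
    #endif

With HIGHMEM64G (PAE) a device can be handed memory above 4GB, so dma_addr_t
must be wide enough to hold such bus addresses even on a 32-bit kernel.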
diff --combined arch/x86/mm/fault.c
index a24c6cfdccc47da8a12f16b2cd4a09caaf825b9a,0cdb8d493f6117a882e8d1a9f835f2c6c6dffeb8..79b0b372d2d033ca35a4bb83295a332c17bbb6c4
@@@ -229,7 -229,16 +229,16 @@@ void vmalloc_sync_all(void
  
                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
-                       if (!vmalloc_sync_one(page_address(page), address))
+                       spinlock_t *pgt_lock;
+                       pmd_t *ret;
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                       spin_lock(pgt_lock);
+                       ret = vmalloc_sync_one(page_address(page), address);
+                       spin_unlock(pgt_lock);
+                       if (!ret)
                                break;
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
@@@ -251,8 -260,6 +260,8 @@@ static noinline __kprobes int vmalloc_f
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;
  
 +      WARN_ON_ONCE(in_nmi());
 +
        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
@@@ -328,29 -335,7 +337,7 @@@ out
  
  void vmalloc_sync_all(void)
  {
-       unsigned long address;
-       for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-            address += PGDIR_SIZE) {
-               const pgd_t *pgd_ref = pgd_offset_k(address);
-               unsigned long flags;
-               struct page *page;
-               if (pgd_none(*pgd_ref))
-                       continue;
-               spin_lock_irqsave(&pgd_lock, flags);
-               list_for_each_entry(page, &pgd_list, lru) {
-                       pgd_t *pgd;
-                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                       if (pgd_none(*pgd))
-                               set_pgd(pgd, *pgd_ref);
-                       else
-                               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-               }
-               spin_unlock_irqrestore(&pgd_lock, flags);
-       }
+       sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
  }
  
  /*
@@@ -371,8 -356,6 +358,8 @@@ static noinline __kprobes int vmalloc_f
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;
  
 +      WARN_ON_ONCE(in_nmi());
 +
        /*
         * Copy kernel mappings over when needed. This can also
         * happen within a race in page table update. In the later
@@@ -898,8 -881,14 +885,14 @@@ spurious_fault(unsigned long error_code
        if (pmd_large(*pmd))
                return spurious_fault_check(error_code, (pte_t *) pmd);
  
+       /*
+        * Note: don't use pte_present() here, since it returns true
+        * if the _PAGE_PROTNONE bit is set.  However, this aliases the
+        * _PAGE_GLOBAL bit, which for kernel pages give false positives
+        * when CONFIG_DEBUG_PAGEALLOC is used.
+        */
        pte = pte_offset_kernel(pmd, address);
-       if (!pte_present(*pte))
+       if (!(pte_flags(*pte) & _PAGE_PRESENT))
                return 0;
  
        ret = spurious_fault_check(error_code, pte);
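
(Not part of this diff: for reference, the x86 pte_present() of this era that
the comment above contrasts with, shown from memory; verify against
arch/x86/include/asm/pgtable.h:)

    static inline int pte_present(pte_t a)
    {
    	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
    }

Because _PAGE_PROTNONE occupies the same bit as _PAGE_GLOBAL, a global kernel
PTE whose PRESENT bit has been cleared by CONFIG_DEBUG_PAGEALLOC still reads
as "present" through this helper, which is why spurious_fault() now tests
_PAGE_PRESENT directly.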
diff --combined arch/x86/mm/init_64.c
index 7c48ad4faca312c2f3e80b2971bb29b774994522,4d323fb770c28937f48496a392e2129fa297754d..c55f900fbf89253b5a568c50feadd08057c1dde9
@@@ -97,6 -97,43 +97,43 @@@ static int __init nonx32_setup(char *st
  }
  __setup("noexec32=", nonx32_setup);
  
+ /*
+  * When memory was added/removed make sure all the processes MM have
+  * suitable PGD entries in the local PGD level page.
+  */
+ void sync_global_pgds(unsigned long start, unsigned long end)
+ {
+       unsigned long address;
+       for (address = start; address <= end; address += PGDIR_SIZE) {
+               const pgd_t *pgd_ref = pgd_offset_k(address);
+               unsigned long flags;
+               struct page *page;
+               if (pgd_none(*pgd_ref))
+                       continue;
+               spin_lock_irqsave(&pgd_lock, flags);
+               list_for_each_entry(page, &pgd_list, lru) {
+                       pgd_t *pgd;
+                       spinlock_t *pgt_lock;
+                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                       spin_lock(pgt_lock);
+                       if (pgd_none(*pgd))
+                               set_pgd(pgd, *pgd_ref);
+                       else
+                               BUG_ON(pgd_page_vaddr(*pgd)
+                                      != pgd_page_vaddr(*pgd_ref));
+                       spin_unlock(pgt_lock);
+               }
+               spin_unlock_irqrestore(&pgd_lock, flags);
+       }
+ }
  /*
   * NOTE: This function is marked __ref because it calls __init function
   * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@@ -293,7 -330,7 +330,7 @@@ static __ref void *alloc_low_page(unsig
                panic("alloc_low_page: ran out of memory");
  
        adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
 -      memset(adr, 0, PAGE_SIZE);
 +      clear_page(adr);
        *phys  = pfn * PAGE_SIZE;
        return adr;
  }
@@@ -534,11 -571,13 +571,13 @@@ kernel_physical_mapping_init(unsigned l
                             unsigned long end,
                             unsigned long page_size_mask)
  {
+       bool pgd_changed = false;
        unsigned long next, last_map_addr = end;
+       unsigned long addr;
  
        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);
+       addr = start;
  
        for (; start < end; start = next) {
                pgd_t *pgd = pgd_offset_k(start);
                spin_lock(&init_mm.page_table_lock);
                pgd_populate(&init_mm, pgd, __va(pud_phys));
                spin_unlock(&init_mm.page_table_lock);
+               pgd_changed = true;
        }
+       if (pgd_changed)
+               sync_global_pgds(addr, end);
        __flush_tlb_all();
  
        return last_map_addr;
@@@ -1003,6 -1047,7 +1047,7 @@@ vmemmap_populate(struct page *start_pag
                }
  
        }
+       sync_global_pgds((unsigned long)start_page, end);
        return 0;
  }
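
(Not part of this diff: a hedged sketch of the pgd_page_get_mm() helper used
by sync_global_pgds() and vmalloc_sync_all() above.  It lives in
arch/x86/mm/pgtable.c and stashes the owning mm in the pgd page's ->index
field when the pgd is allocated; reconstructed from memory, verify against
the tree:)

    static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
    {
    	virt_to_page(pgd)->index = (pgoff_t)mm;
    }

    struct mm_struct *pgd_page_get_mm(struct page *page)
    {
    	return (struct mm_struct *)page->index;
    }

Looking the owner up this way is what lets the loops above take the correct
mm->page_table_lock for each entry on pgd_list.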
  
diff --combined include/asm-generic/vmlinux.lds.h
index ef2af9948eacc9b9f27a706b2febeeb0272b0910,1457b81357afeaec66501de5e642ec2b741acbce..f4229fb315e1d7d81caddb7557cbcb6c65072fed
                                                                        \
        BUG_TABLE                                                       \
                                                                        \
 +      JUMP_TABLE                                                      \
 +                                                                      \
        /* PCI quirks */                                                \
        .pci_fixup        : AT(ADDR(.pci_fixup) - LOAD_OFFSET) {        \
                VMLINUX_SYMBOL(__start_pci_fixups_early) = .;           \
  #define BUG_TABLE
  #endif
  
 +#define JUMP_TABLE                                                    \
 +      . = ALIGN(8);                                                   \
 +      __jump_table : AT(ADDR(__jump_table) - LOAD_OFFSET) {           \
 +              VMLINUX_SYMBOL(__start___jump_table) = .;               \
 +              *(__jump_table)                                         \
 +              VMLINUX_SYMBOL(__stop___jump_table) = .;                \
 +      }
 +
  #ifdef CONFIG_PM_TRACE
  #define TRACEDATA                                                     \
        . = ALIGN(4);                                                   \
                                - LOAD_OFFSET) {                        \
                VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
                *(.data..percpu..first)                                 \
+               . = ALIGN(PAGE_SIZE);                                   \
                *(.data..percpu..page_aligned)                          \
+               *(.data..percpu..readmostly)                            \
                *(.data..percpu)                                        \
                *(.data..percpu..shared_aligned)                        \
                VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
                VMLINUX_SYMBOL(__per_cpu_load) = .;                     \
                VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
                *(.data..percpu..first)                                 \
+               . = ALIGN(PAGE_SIZE);                                   \
                *(.data..percpu..page_aligned)                          \
+               *(.data..percpu..readmostly)                            \
                *(.data..percpu)                                        \
                *(.data..percpu..shared_aligned)                        \
                VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
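
(Not part of this diff: the declarations that feed the new
.data..percpu..readmostly input section ordered above.  A minimal sketch using
the read-mostly percpu API introduced in this merge; "example_flush_vector" is
a made-up name:)

    #include <linux/percpu.h>

    /* Lands in .data..percpu..readmostly, away from frequently written
     * per-cpu data, so hot readers do not suffer false sharing. */
    static DEFINE_PER_CPU_READ_MOSTLY(int, example_flush_vector);

    static int example_read(int cpu)
    {
    	return per_cpu(example_flush_vector, cpu);
    }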
diff --combined mm/memory.c
index 0e18b4d649ec82abc83c208e5f9dce9cbb2cf905,a40da6983961153ca4cfce25991a94da07740a93..98b58fecedeffc236a9c7285689fe4720409bd30
@@@ -2623,7 -2623,7 +2623,7 @@@ static int do_swap_page(struct mm_struc
                unsigned int flags, pte_t orig_pte)
  {
        spinlock_t *ptl;
 -      struct page *page;
 +      struct page *page, *swapcache = NULL;
        swp_entry_t entry;
        pte_t pte;
        struct mem_cgroup *ptr = NULL;
        lock_page(page);
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  
 -      page = ksm_might_need_to_copy(page, vma, address);
 -      if (!page) {
 -              ret = VM_FAULT_OOM;
 -              goto out;
 +      /*
 +       * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
 +       * release the swapcache from under us.  The page pin, and pte_same
 +       * test below, are not enough to exclude that.  Even if it is still
 +       * swapcache, we need to check that the page's swap has not changed.
 +       */
 +      if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
 +              goto out_page;
 +
 +      if (ksm_might_need_to_copy(page, vma, address)) {
 +              swapcache = page;
 +              page = ksm_does_need_to_copy(page, vma, address);
 +
 +              if (unlikely(!page)) {
 +                      ret = VM_FAULT_OOM;
 +                      page = swapcache;
 +                      swapcache = NULL;
 +                      goto out_page;
 +              }
        }
  
        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
        if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
                try_to_free_swap(page);
        unlock_page(page);
 +      if (swapcache) {
 +              /*
 +               * Hold the lock to avoid the swap entry to be reused
 +               * until we take the PT lock for the pte_same() check
 +               * (to avoid false positives from pte_same). For
 +               * further safety release the lock after the swap_free
 +               * so that the swap count won't change under a
 +               * parallel locked swapcache.
 +               */
 +              unlock_page(swapcache);
 +              page_cache_release(swapcache);
 +      }
  
        if (flags & FAULT_FLAG_WRITE) {
                ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@@ -2783,17 -2756,15 +2783,17 @@@ out_page
        unlock_page(page);
  out_release:
        page_cache_release(page);
 +      if (swapcache) {
 +              unlock_page(swapcache);
 +              page_cache_release(swapcache);
 +      }
        return ret;
  }
  
  /*
 - * This is like a special single-page "expand_downwards()",
 - * except we must first make sure that 'address-PAGE_SIZE'
 + * This is like a special single-page "expand_{down|up}wards()",
 + * except we must first make sure that 'address{-|+}PAGE_SIZE'
   * doesn't hit another vma.
 - *
 - * The "find_vma()" will do the right thing even if we wrap
   */
  static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
  {
  
                expand_stack(vma, address - PAGE_SIZE);
        }
 +      if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
 +              struct vm_area_struct *next = vma->vm_next;
 +
 +              /* As VM_GROWSDOWN but s/below/above/ */
 +              if (next && next->vm_start == address + PAGE_SIZE)
 +                      return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
 +
 +              expand_upwards(vma, address + PAGE_SIZE);
 +      }
        return 0;
  }
  
@@@ -3185,7 -3147,7 +3185,7 @@@ static inline int handle_pte_fault(stru
                 * with threads.
                 */
                if (flags & FAULT_FLAG_WRITE)
-                       flush_tlb_page(vma, address);
+                       flush_tlb_fix_spurious_fault(vma, address);
        }
  unlock:
        pte_unmap_unlock(pte, ptl);
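
(Not part of this diff: the hook behind the last hunk.  A hedged sketch of how
"x86, mm: Avoid unnecessary TLB flush" is expected to wire it up; verify
against include/asm-generic/pgtable.h and arch/x86/include/asm/pgtable.h:)

    /* Generic fallback, keeping the old behaviour for other architectures: */
    #ifndef flush_tlb_fix_spurious_fault
    #define flush_tlb_fix_spurious_fault(vma, address) \
    	flush_tlb_page(vma, address)
    #endif

    /* x86 override: do nothing, since a spurious fault resolves itself
     * as soon as the stale TLB entry is refetched from the page tables. */
    #define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)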