Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 21 Oct 2010 20:47:29 +0000 (13:47 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 21 Oct 2010 20:47:29 +0000 (13:47 -0700)
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86-32, percpu: Correct the ordering of the percpu readmostly section
  x86, mm: Enable ARCH_DMA_ADDR_T_64BIT with X86_64 || HIGHMEM64G
  x86: Spread tlb flush vector between nodes
  percpu: Introduce a read-mostly percpu API
  x86, mm: Fix incorrect data type in vmalloc_sync_all()
  x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
  x86, mm: Fix bogus whitespace in sync_global_pgds()
  x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation
  x86, mm: Add RESERVE_BRK_ARRAY() helper
  mm, x86: Saving vmcore with non-lazy freeing of vmas
  x86, kdump: Change copy_oldmem_page() to use cached addressing
  x86, mm: fix uninitialized addr in kernel_physical_mapping_init()
  x86, kmemcheck: Remove double test
  x86, mm: Make spurious_fault check explicitly check the PRESENT bit
  x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes
  x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions
  x86, mm: Avoid unnecessary TLB flush
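
Note: the RESERVE_BRK_ARRAY() helper from the commit above does not appear in
the combined diff below.  A hedged sketch of how it is expected to look in
arch/x86/include/asm/setup.h, reconstructed from memory (verify against the
tree; "early_table" is a made-up example name):

    /* Declare a pointer plus a matching brk reservation sized for it. */
    #define RESERVE_BRK_ARRAY(type, name, entries)		\
    	type *name;					\
    	RESERVE_BRK(name, sizeof(type) * (entries))

    /* Typical use: reserve brk space for 256 entries. */
    RESERVE_BRK_ARRAY(unsigned long, early_table, 256);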

arch/x86/Kconfig
arch/x86/mm/fault.c
arch/x86/mm/init_64.c
include/asm-generic/vmlinux.lds.h
mm/memory.c

diff --combined arch/x86/Kconfig
index b8676498d8df407fb89f8371204ffb387550c088,2924f4e77791e91fc432d4c9c53c64bcd80b68c1..8e9c4d4772fb7d724189812130c640abc80d50a9
@@@ -25,7 -25,6 +25,7 @@@ config X8
        select HAVE_IDE
        select HAVE_OPROFILE
        select HAVE_PERF_EVENTS if (!M386 && !M486)
 +      select HAVE_IRQ_WORK
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select ARCH_WANT_OPTIONAL_GPIOLIB
@@@ -34,7 -33,6 +34,7 @@@
        select HAVE_KRETPROBES
        select HAVE_OPTPROBES
        select HAVE_FTRACE_MCOUNT_RECORD
 +      select HAVE_C_RECORDMCOUNT
        select HAVE_DYNAMIC_FTRACE
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_GRAPH_TRACER
@@@ -61,8 -59,6 +61,8 @@@
        select ANON_INODES
        select HAVE_ARCH_KMEMCHECK
        select HAVE_USER_RETURN_NOTIFIER
 +      select HAVE_ARCH_JUMP_LABEL
 +      select HAVE_TEXT_POKE_SMP
  
  config INSTRUCTION_DECODER
        def_bool (KPROBES || PERF_EVENTS)
@@@ -674,7 -670,7 +674,7 @@@ config GART_IOMM
        bool "GART IOMMU support" if EMBEDDED
        default y
        select SWIOTLB
 -      depends on X86_64 && PCI && K8_NB
 +      depends on X86_64 && PCI && AMD_NB
        ---help---
          Support for full DMA access of devices with 32bit memory access only
          on systems with more than 3GB. This is usually needed for USB,
@@@ -799,17 -795,6 +799,17 @@@ config SCHED_M
          making when dealing with multi-core CPU chips at a cost of slightly
          increased overhead in some places. If unsure say N here.
  
 +config IRQ_TIME_ACCOUNTING
 +      bool "Fine granularity task level IRQ time accounting"
 +      default n
 +      ---help---
 +        Select this option to enable fine granularity task irq time
 +        accounting. This is done by reading a timestamp on each
 +        transitions between softirq and hardirq state, so there can be a
 +        small performance impact.
 +
 +        If in doubt, say N here.
 +
  source "kernel/Kconfig.preempt"
  
  config X86_UP_APIC
@@@ -1163,6 -1148,9 +1163,9 @@@ config X86_PA
  config ARCH_PHYS_ADDR_T_64BIT
        def_bool X86_64 || X86_PAE
  
+ config ARCH_DMA_ADDR_T_64BIT
+       def_bool X86_64 || HIGHMEM64G
  config DIRECT_GBPAGES
        bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
        default y
@@@ -1341,34 -1329,25 +1344,34 @@@ config X86_BOOTPARAM_MEMORY_CORRUPTION_
          Set whether the default state of memory_corruption_check is
          on or off.
  
 -config X86_RESERVE_LOW_64K
 -      bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
 -      default y
 +config X86_RESERVE_LOW
 +      int "Amount of low memory, in kilobytes, to reserve for the BIOS"
 +      default 64
 +      range 4 640
        ---help---
 -        Reserve the first 64K of physical RAM on BIOSes that are known
 -        to potentially corrupt that memory range. A numbers of BIOSes are
 -        known to utilize this area during suspend/resume, so it must not
 -        be used by the kernel.
 +        Specify the amount of low memory to reserve for the BIOS.
  
 -        Set this to N if you are absolutely sure that you trust the BIOS
 -        to get all its memory reservations and usages right.
 +        The first page contains BIOS data structures that the kernel
 +        must not use, so that page must always be reserved.
  
 -        If you have doubts about the BIOS (e.g. suspend/resume does not
 -        work or there's kernel crashes after certain hardware hotplug
 -        events) and it's not AMI or Phoenix, then you might want to enable
 -        X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
 -        corruption patterns.
 +        By default we reserve the first 64K of physical RAM, as a
 +        number of BIOSes are known to corrupt that memory range
 +        during events such as suspend/resume or monitor cable
 +        insertion, so it must not be used by the kernel.
  
 -        Say Y if unsure.
 +        You can set this to 4 if you are absolutely sure that you
 +        trust the BIOS to get all its memory reservations and usages
 +        right.  If you know your BIOS have problems beyond the
 +        default 64K area, you can set this to 640 to avoid using the
 +        entire low memory range.
 +
 +        If you have doubts about the BIOS (e.g. suspend/resume does
 +        not work or there's kernel crashes after certain hardware
 +        hotplug events) then you might want to enable
 +        X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
 +        typical corruption patterns.
 +
 +        Leave this to the default value of 64 if you are unsure.
  
  config MATH_EMULATION
        bool
@@@ -2100,7 -2079,7 +2103,7 @@@ config OLPC_OPENFIRMWAR
  
  endif # X86_32
  
 -config K8_NB
 +config AMD_NB
        def_bool y
        depends on CPU_SUP_AMD && PCI
  
@@@ -2149,10 -2128,6 +2152,10 @@@ config HAVE_ATOMIC_IOMA
        def_bool y
        depends on X86_32
  
 +config HAVE_TEXT_POKE_SMP
 +      bool
 +      select STOP_MACHINE if SMP
 +
  source "net/Kconfig"
  
  source "drivers/Kconfig"
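
(Not part of this diff: the generic consumer of the ARCH_DMA_ADDR_T_64BIT
option added above.  A minimal sketch, assuming the <linux/types.h> layout of
this era; verify against the tree:)

    #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
    typedef u64 dma_addr_t;	/* 64-bit bus addresses with X86_64 or HIGHMEM64G */
    #else
    typedef u32 dma_addr_t;
    #endif

With HIGHMEM64G (PAE) a device can be handed memory above 4GB, so dma_addr_t
must be wide enough to hold such bus addresses even on a 32-bit kernel.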
diff --combined arch/x86/mm/fault.c
index a24c6cfdccc47da8a12f16b2cd4a09caaf825b9a,0cdb8d493f6117a882e8d1a9f835f2c6c6dffeb8..79b0b372d2d033ca35a4bb83295a332c17bbb6c4
@@@ -229,7 -229,16 +229,16 @@@ void vmalloc_sync_all(void
  
                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
-                       if (!vmalloc_sync_one(page_address(page), address))
+                       spinlock_t *pgt_lock;
+                       pmd_t *ret;
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                       spin_lock(pgt_lock);
+                       ret = vmalloc_sync_one(page_address(page), address);
+                       spin_unlock(pgt_lock);
+                       if (!ret)
                                break;
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
@@@ -251,8 -260,6 +260,8 @@@ static noinline __kprobes int vmalloc_f
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;
  
 +      WARN_ON_ONCE(in_nmi());
 +
        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
@@@ -328,29 -335,7 +337,7 @@@ out
  
  void vmalloc_sync_all(void)
  {
-       unsigned long address;
-       for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-            address += PGDIR_SIZE) {
-               const pgd_t *pgd_ref = pgd_offset_k(address);
-               unsigned long flags;
-               struct page *page;
-               if (pgd_none(*pgd_ref))
-                       continue;
-               spin_lock_irqsave(&pgd_lock, flags);
-               list_for_each_entry(page, &pgd_list, lru) {
-                       pgd_t *pgd;
-                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                       if (pgd_none(*pgd))
-                               set_pgd(pgd, *pgd_ref);
-                       else
-                               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-               }
-               spin_unlock_irqrestore(&pgd_lock, flags);
-       }
+       sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
  }
  
  /*
@@@ -371,8 -356,6 +358,8 @@@ static noinline __kprobes int vmalloc_f
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;
  
 +      WARN_ON_ONCE(in_nmi());
 +
        /*
         * Copy kernel mappings over when needed. This can also
         * happen within a race in page table update. In the later
@@@ -898,8 -881,14 +885,14 @@@ spurious_fault(unsigned long error_code
        if (pmd_large(*pmd))
                return spurious_fault_check(error_code, (pte_t *) pmd);
  
+       /*
+        * Note: don't use pte_present() here, since it returns true
+        * if the _PAGE_PROTNONE bit is set.  However, this aliases the
+        * _PAGE_GLOBAL bit, which for kernel pages give false positives
+        * when CONFIG_DEBUG_PAGEALLOC is used.
+        */
        pte = pte_offset_kernel(pmd, address);
-       if (!pte_present(*pte))
+       if (!(pte_flags(*pte) & _PAGE_PRESENT))
                return 0;
  
        ret = spurious_fault_check(error_code, pte);
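
(Not part of this diff: for reference, the x86 pte_present() of this era that
the comment above contrasts with, shown from memory; verify against
arch/x86/include/asm/pgtable.h:)

    static inline int pte_present(pte_t a)
    {
    	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
    }

Because _PAGE_PROTNONE occupies the same bit as _PAGE_GLOBAL, a global kernel
PTE whose PRESENT bit has been cleared by CONFIG_DEBUG_PAGEALLOC still reads
as "present" through this helper, which is why spurious_fault() now tests
_PAGE_PRESENT directly.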
diff --combined arch/x86/mm/init_64.c
index 7c48ad4faca312c2f3e80b2971bb29b774994522,4d323fb770c28937f48496a392e2129fa297754d..c55f900fbf89253b5a568c50feadd08057c1dde9
@@@ -97,6 -97,43 +97,43 @@@ static int __init nonx32_setup(char *st
  }
  __setup("noexec32=", nonx32_setup);
  
+ /*
+  * When memory was added/removed make sure all the processes MM have
+  * suitable PGD entries in the local PGD level page.
+  */
+ void sync_global_pgds(unsigned long start, unsigned long end)
+ {
+       unsigned long address;
+       for (address = start; address <= end; address += PGDIR_SIZE) {
+               const pgd_t *pgd_ref = pgd_offset_k(address);
+               unsigned long flags;
+               struct page *page;
+               if (pgd_none(*pgd_ref))
+                       continue;
+               spin_lock_irqsave(&pgd_lock, flags);
+               list_for_each_entry(page, &pgd_list, lru) {
+                       pgd_t *pgd;
+                       spinlock_t *pgt_lock;
+                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                       spin_lock(pgt_lock);
+                       if (pgd_none(*pgd))
+                               set_pgd(pgd, *pgd_ref);
+                       else
+                               BUG_ON(pgd_page_vaddr(*pgd)
+                                      != pgd_page_vaddr(*pgd_ref));
+                       spin_unlock(pgt_lock);
+               }
+               spin_unlock_irqrestore(&pgd_lock, flags);
+       }
+ }
  /*
   * NOTE: This function is marked __ref because it calls __init function
   * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@@ -293,7 -330,7 +330,7 @@@ static __ref void *alloc_low_page(unsig
                panic("alloc_low_page: ran out of memory");
  
        adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
 -      memset(adr, 0, PAGE_SIZE);
 +      clear_page(adr);
        *phys  = pfn * PAGE_SIZE;
        return adr;
  }
@@@ -534,11 -571,13 +571,13 @@@ kernel_physical_mapping_init(unsigned l
                             unsigned long end,
                             unsigned long page_size_mask)
  {
+       bool pgd_changed = false;
        unsigned long next, last_map_addr = end;
+       unsigned long addr;
  
        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);
+       addr = start;
  
        for (; start < end; start = next) {
                pgd_t *pgd = pgd_offset_k(start);
                spin_lock(&init_mm.page_table_lock);
                pgd_populate(&init_mm, pgd, __va(pud_phys));
                spin_unlock(&init_mm.page_table_lock);
+               pgd_changed = true;
        }
+       if (pgd_changed)
+               sync_global_pgds(addr, end);
        __flush_tlb_all();
  
        return last_map_addr;
@@@ -1003,6 -1047,7 +1047,7 @@@ vmemmap_populate(struct page *start_pag
                }
  
        }
+       sync_global_pgds((unsigned long)start_page, end);
        return 0;
  }
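
(Not part of this diff: a hedged sketch of the pgd_page_get_mm() helper used
by sync_global_pgds() and vmalloc_sync_all() above.  It lives in
arch/x86/mm/pgtable.c and stashes the owning mm in the pgd page's ->index
field when the pgd is allocated; reconstructed from memory, verify against
the tree:)

    static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
    {
    	virt_to_page(pgd)->index = (pgoff_t)mm;
    }

    struct mm_struct *pgd_page_get_mm(struct page *page)
    {
    	return (struct mm_struct *)page->index;
    }

Looking the owner up this way is what lets the loops above take the correct
mm->page_table_lock for each entry on pgd_list.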
  
diff --combined include/asm-generic/vmlinux.lds.h
index ef2af9948eacc9b9f27a706b2febeeb0272b0910,1457b81357afeaec66501de5e642ec2b741acbce..f4229fb315e1d7d81caddb7557cbcb6c65072fed
                                                                        \
        BUG_TABLE                                                       \
                                                                        \
 +      JUMP_TABLE                                                      \
 +                                                                      \
        /* PCI quirks */                                                \
        .pci_fixup        : AT(ADDR(.pci_fixup) - LOAD_OFFSET) {        \
                VMLINUX_SYMBOL(__start_pci_fixups_early) = .;           \
  #define BUG_TABLE
  #endif
  
 +#define JUMP_TABLE                                                    \
 +      . = ALIGN(8);                                                   \
 +      __jump_table : AT(ADDR(__jump_table) - LOAD_OFFSET) {           \
 +              VMLINUX_SYMBOL(__start___jump_table) = .;               \
 +              *(__jump_table)                                         \
 +              VMLINUX_SYMBOL(__stop___jump_table) = .;                \
 +      }
 +
  #ifdef CONFIG_PM_TRACE
  #define TRACEDATA                                                     \
        . = ALIGN(4);                                                   \
                                - LOAD_OFFSET) {                        \
                VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
                *(.data..percpu..first)                                 \
+               . = ALIGN(PAGE_SIZE);                                   \
                *(.data..percpu..page_aligned)                          \
+               *(.data..percpu..readmostly)                            \
                *(.data..percpu)                                        \
                *(.data..percpu..shared_aligned)                        \
                VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
                VMLINUX_SYMBOL(__per_cpu_load) = .;                     \
                VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
                *(.data..percpu..first)                                 \
+               . = ALIGN(PAGE_SIZE);                                   \
                *(.data..percpu..page_aligned)                          \
+               *(.data..percpu..readmostly)                            \
                *(.data..percpu)                                        \
                *(.data..percpu..shared_aligned)                        \
                VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
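
(Not part of this diff: the declarations that feed the new
.data..percpu..readmostly input section ordered above.  A minimal sketch using
the read-mostly percpu API introduced in this merge; "example_flush_vector" is
a made-up name:)

    #include <linux/percpu.h>

    /* Lands in .data..percpu..readmostly, away from frequently written
     * per-cpu data, so hot readers do not suffer false sharing. */
    static DEFINE_PER_CPU_READ_MOSTLY(int, example_flush_vector);

    static int example_read(int cpu)
    {
    	return per_cpu(example_flush_vector, cpu);
    }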
diff --combined mm/memory.c
index 0e18b4d649ec82abc83c208e5f9dce9cbb2cf905,a40da6983961153ca4cfce25991a94da07740a93..98b58fecedeffc236a9c7285689fe4720409bd30
@@@ -2623,7 -2623,7 +2623,7 @@@ static int do_swap_page(struct mm_struc
                unsigned int flags, pte_t orig_pte)
  {
        spinlock_t *ptl;
 -      struct page *page;
 +      struct page *page, *swapcache = NULL;
        swp_entry_t entry;
        pte_t pte;
        struct mem_cgroup *ptr = NULL;
        lock_page(page);
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  
 -      page = ksm_might_need_to_copy(page, vma, address);
 -      if (!page) {
 -              ret = VM_FAULT_OOM;
 -              goto out;
 +      /*
 +       * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
 +       * release the swapcache from under us.  The page pin, and pte_same
 +       * test below, are not enough to exclude that.  Even if it is still
 +       * swapcache, we need to check that the page's swap has not changed.
 +       */
 +      if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
 +              goto out_page;
 +
 +      if (ksm_might_need_to_copy(page, vma, address)) {
 +              swapcache = page;
 +              page = ksm_does_need_to_copy(page, vma, address);
 +
 +              if (unlikely(!page)) {
 +                      ret = VM_FAULT_OOM;
 +                      page = swapcache;
 +                      swapcache = NULL;
 +                      goto out_page;
 +              }
        }
  
        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
        if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
                try_to_free_swap(page);
        unlock_page(page);
 +      if (swapcache) {
 +              /*
 +               * Hold the lock to avoid the swap entry to be reused
 +               * until we take the PT lock for the pte_same() check
 +               * (to avoid false positives from pte_same). For
 +               * further safety release the lock after the swap_free
 +               * so that the swap count won't change under a
 +               * parallel locked swapcache.
 +               */
 +              unlock_page(swapcache);
 +              page_cache_release(swapcache);
 +      }
  
        if (flags & FAULT_FLAG_WRITE) {
                ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@@ -2783,17 -2756,15 +2783,17 @@@ out_page
        unlock_page(page);
  out_release:
        page_cache_release(page);
 +      if (swapcache) {
 +              unlock_page(swapcache);
 +              page_cache_release(swapcache);
 +      }
        return ret;
  }
  
  /*
 - * This is like a special single-page "expand_downwards()",
 - * except we must first make sure that 'address-PAGE_SIZE'
 + * This is like a special single-page "expand_{down|up}wards()",
 + * except we must first make sure that 'address{-|+}PAGE_SIZE'
   * doesn't hit another vma.
 - *
 - * The "find_vma()" will do the right thing even if we wrap
   */
  static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
  {
  
                expand_stack(vma, address - PAGE_SIZE);
        }
 +      if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
 +              struct vm_area_struct *next = vma->vm_next;
 +
 +              /* As VM_GROWSDOWN but s/below/above/ */
 +              if (next && next->vm_start == address + PAGE_SIZE)
 +                      return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
 +
 +              expand_upwards(vma, address + PAGE_SIZE);
 +      }
        return 0;
  }
  
@@@ -3185,7 -3147,7 +3185,7 @@@ static inline int handle_pte_fault(stru
                 * with threads.
                 */
                if (flags & FAULT_FLAG_WRITE)
-                       flush_tlb_page(vma, address);
+                       flush_tlb_fix_spurious_fault(vma, address);
        }
  unlock:
        pte_unmap_unlock(pte, ptl);
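
(Not part of this diff: the hook behind the last hunk.  A hedged sketch of how
"x86, mm: Avoid unnecessary TLB flush" is expected to wire it up; verify
against include/asm-generic/pgtable.h and arch/x86/include/asm/pgtable.h:)

    /* Generic fallback, keeping the old behaviour for other architectures: */
    #ifndef flush_tlb_fix_spurious_fault
    #define flush_tlb_fix_spurious_fault(vma, address) \
    	flush_tlb_page(vma, address)
    #endif

    /* x86 override: do nothing, since a spurious fault resolves itself
     * as soon as the stale TLB entry is refetched from the page tables. */
    #define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)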