#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/page-isolation.h>
+#include <linux/jhash.h>
#include <asm/page.h>
#include <asm/pgtable.h>
*/
DEFINE_SPINLOCK(hugetlb_lock);
+/*
+ * Serializes faults on the same logical page. This is used to
+ * prevent spurious OOMs when the hugepage pool is fully utilized.
+ */
+static int num_fault_mutexes;
+static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
bool free = (spool->count == 0) && (spool->used_hpages == 0);
kfree(resv_map);
}
+static inline struct resv_map *inode_resv_map(struct inode *inode)
+{
+ return inode->i_mapping->private_data;
+}
+
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- if (!(vma->vm_flags & VM_MAYSHARE))
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+
+ return inode_resv_map(inode);
+
+ } else {
return (struct resv_map *)(get_vma_private_data(vma) &
~HPAGE_RESV_MASK);
- return NULL;
+ }
}
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
put_page(page); /* free it into the hugepage allocator */
}
-static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+static void __init prep_compound_gigantic_page(struct page *page,
+ unsigned long order)
{
int i;
int nr_pages = 1 << order;
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
-
- if (vma->vm_flags & VM_MAYSHARE) {
- pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *resv = inode->i_mapping->private_data;
-
- return region_chg(resv, idx, idx + 1);
+ struct resv_map *resv;
+ pgoff_t idx;
+ long chg;
- } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ resv = vma_resv_map(vma);
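+	/*
+	 * A private mapping with no reservation map has nothing reserved;
+	 * report that one new huge page charge is needed.
+	 */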
+ if (!resv)
return 1;
- } else {
- long err;
- pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *resv = vma_resv_map(vma);
+ idx = vma_hugecache_offset(h, vma, addr);
+ chg = region_chg(resv, idx, idx + 1);
- err = region_chg(resv, idx, idx + 1);
- if (err < 0)
- return err;
- return 0;
- }
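+	/*
+	 * Shared mappings report the region charge directly; a private
+	 * mapping that owns its map took its reservation at mmap time,
+	 * so only errors are propagated here.
+	 */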
+ if (vma->vm_flags & VM_MAYSHARE)
+ return chg;
+ else
+ return chg < 0 ? chg : 0;
}
static void vma_commit_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
-
- if (vma->vm_flags & VM_MAYSHARE) {
- pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *resv = inode->i_mapping->private_data;
-
- region_add(resv, idx, idx + 1);
+ struct resv_map *resv;
+ pgoff_t idx;
- } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *resv = vma_resv_map(vma);
+ resv = vma_resv_map(vma);
+ if (!resv)
+ return;
- /* Mark this page used in the map. */
- region_add(resv, idx, idx + 1);
- }
+ idx = vma_hugecache_offset(h, vma, addr);
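+	/* Mark this page used in the map. */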
+ region_add(resv, idx, idx + 1);
}
static struct page *alloc_huge_page(struct vm_area_struct *vma,
return 1;
}
-static void prep_compound_huge_page(struct page *page, int order)
+static void __init prep_compound_huge_page(struct page *page, int order)
{
if (unlikely(order > (MAX_ORDER - 1)))
prep_compound_gigantic_page(page, order);
}
kobject_put(hugepages_kobj);
+ kfree(htlb_fault_mutex_table);
}
module_exit(hugetlb_exit);
static int __init hugetlb_init(void)
{
+ int i;
+
/* Some platform decide whether they support huge pages at boot
* time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
* there is no such support
hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
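+	/*
+	 * The table size is rounded to a power of two so that
+	 * fault_mutex_hash() can pick a slot with a simple mask.
+	 */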
+#ifdef CONFIG_SMP
+ num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
+#else
+ num_fault_mutexes = 1;
+#endif
+ htlb_fault_mutex_table =
+ kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
+ BUG_ON(!htlb_fault_mutex_table);
+
+ for (i = 0; i < num_fault_mutexes; i++)
+ mutex_init(&htlb_fault_mutex_table[i]);
return 0;
}
module_init(hugetlb_init);
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
- if (resv)
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_get(&resv->refs);
}
-static void resv_map_put(struct vm_area_struct *vma)
-{
- struct resv_map *resv = vma_resv_map(vma);
-
- if (!resv)
- return;
- kref_put(&resv->refs, resv_map_release);
-}
-
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
struct resv_map *resv = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
- unsigned long reserve;
- unsigned long start;
- unsigned long end;
+ unsigned long reserve, start, end;
- if (resv) {
- start = vma_hugecache_offset(h, vma, vma->vm_start);
- end = vma_hugecache_offset(h, vma, vma->vm_end);
+ if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ return;
- reserve = (end - start) -
- region_count(resv, start, end);
+ start = vma_hugecache_offset(h, vma, vma->vm_start);
+ end = vma_hugecache_offset(h, vma, vma->vm_end);
- resv_map_put(vma);
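+	/* Reserved pages in this range that were never faulted in */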
+ reserve = (end - start) - region_count(resv, start, end);
- if (reserve) {
- hugetlb_acct_memory(h, -reserve);
- hugepage_subpool_put_pages(spool, reserve);
- }
+ kref_put(&resv->refs, resv_map_release);
+
+ if (reserve) {
+ hugetlb_acct_memory(h, -reserve);
+ hugepage_subpool_put_pages(spool, reserve);
}
}
}
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, unsigned int flags)
+ struct address_space *mapping, pgoff_t idx,
+ unsigned long address, pte_t *ptep, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
int ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
- pgoff_t idx;
unsigned long size;
struct page *page;
- struct address_space *mapping;
pte_t new_pte;
spinlock_t *ptl;
return ret;
}
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
-
/*
* Use page lock to guard against racing truncation
* before we get page_table_lock.
goto out;
}
+#ifdef CONFIG_SMP
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx, unsigned long address)
+{
+ unsigned long key[2];
+ u32 hash;
+
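+	/*
+	 * Shared mappings key on the page cache slot (mapping + index) so
+	 * that all tasks faulting the same file page serialize on one
+	 * mutex; private mappings key on the mm and the faulting address
+	 * scaled to hugepage units.
+	 */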
+ if (vma->vm_flags & VM_SHARED) {
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
+ } else {
+ key[0] = (unsigned long) mm;
+ key[1] = address >> huge_page_shift(h);
+ }
+
+ hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+
+ return hash & (num_fault_mutexes - 1);
+}
+#else
+/*
+ * For uniprocessor systems we always use a single mutex, so just
+ * return 0 and avoid the hashing overhead.
+ */
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx, unsigned long address)
+{
+ return 0;
+}
+#endif
+
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
- pte_t *ptep;
- pte_t entry;
+ pte_t *ptep, entry;
spinlock_t *ptl;
int ret;
+ u32 hash;
+ pgoff_t idx;
struct page *page = NULL;
struct page *pagecache_page = NULL;
- static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ struct address_space *mapping;
address &= huge_page_mask(h);
if (!ptep)
return VM_FAULT_OOM;
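+	/*
+	 * mapping and idx are computed up front: they feed the fault
+	 * mutex hash below and are handed down to hugetlb_no_page().
+	 */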
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- mutex_lock(&hugetlb_instantiation_mutex);
+ hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ mutex_lock(&htlb_fault_mutex_table[hash]);
+
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
- ret = hugetlb_no_page(mm, vma, address, ptep, flags);
+ ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
goto out_mutex;
}
put_page(page);
out_mutex:
- mutex_unlock(&hugetlb_instantiation_mutex);
-
+ mutex_unlock(&htlb_fault_mutex_table[hash]);
return ret;
}
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
- resv_map = inode->i_mapping->private_data;
+ resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to);
region_add(resv_map, from, to);
return 0;
out_err:
- if (vma)
- resv_map_put(vma);
+ if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ kref_put(&resv_map->refs, resv_map_release);
return ret;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
struct hstate *h = hstate_inode(inode);
- struct resv_map *resv_map = inode->i_mapping->private_data;
+ struct resv_map *resv_map = inode_resv_map(inode);
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);