Merge tag 'v3.10.13' into lsk/v3.10/topic/kvm
[firefly-linux-kernel-4.4.55.git] / mm / hugetlb.c
index 7c5eb85ec6450d05dec7e628bd366bf86965e67a..de608bbc8389b553042cfe74c8065a5181166091 100644 (file)
@@ -2312,16 +2312,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
        int cow;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
+       unsigned long mmun_start;       /* For mmu_notifiers */
+       unsigned long mmun_end;         /* For mmu_notifiers */
+       int ret = 0;
 
        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
+       mmun_start = vma->vm_start;
+       mmun_end = vma->vm_end;
+       if (cow)
+               mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+
        for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
                src_pte = huge_pte_offset(src, addr);
                if (!src_pte)
                        continue;
                dst_pte = huge_pte_alloc(dst, addr, sz);
-               if (!dst_pte)
-                       goto nomem;
+               if (!dst_pte) {
+                       ret = -ENOMEM;
+                       break;
+               }
 
                /* If the pagetables are shared don't copy or take references */
                if (dst_pte == src_pte)
@@ -2341,10 +2351,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                spin_unlock(&src->page_table_lock);
                spin_unlock(&dst->page_table_lock);
        }
-       return 0;
 
-nomem:
-       return -ENOMEM;
+       if (cow)
+               mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+
+       return ret;
 }
 
 static int is_hugetlb_entry_migration(pte_t pte)
@@ -2948,15 +2959,6 @@ out_mutex:
        return ret;
 }
 
-/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-              pud_t *pud, int write)
-{
-       BUG();
-       return NULL;
-}
-
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, unsigned long *nr_pages,
@@ -3186,6 +3188,216 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
        hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+                               struct vm_area_struct *vma,
+                               unsigned long addr, pgoff_t idx)
+{
+       unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+                               svma->vm_start;
+       unsigned long sbase = saddr & PUD_MASK;
+       unsigned long s_end = sbase + PUD_SIZE;
+
+       /* Allow segments to share if only one is marked locked */
+       unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+       unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+
+       /*
+        * match the virtual addresses, permission and the alignment of the
+        * page table page.
+        */
+       if (pmd_index(addr) != pmd_index(saddr) ||
+           vm_flags != svm_flags ||
+           sbase < svma->vm_start || svma->vm_end < s_end)
+               return 0;
+
+       return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+       unsigned long base = addr & PUD_MASK;
+       unsigned long end = base + PUD_SIZE;
+
+       /*
+        * check on proper vm_flags and page table alignment
+        */
+       if (vma->vm_flags & VM_MAYSHARE &&
+           vma->vm_start <= base && end <= vma->vm_end)
+               return 1;
+       return 0;
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+       struct vm_area_struct *vma = find_vma(mm, addr);
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+                       vma->vm_pgoff;
+       struct vm_area_struct *svma;
+       unsigned long saddr;
+       pte_t *spte = NULL;
+       pte_t *pte;
+
+       if (!vma_shareable(vma, addr))
+               return (pte_t *)pmd_alloc(mm, pud, addr);
+
+       mutex_lock(&mapping->i_mmap_mutex);
+       vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+               if (svma == vma)
+                       continue;
+
+               saddr = page_table_shareable(svma, vma, addr, idx);
+               if (saddr) {
+                       spte = huge_pte_offset(svma->vm_mm, saddr);
+                       if (spte) {
+                               get_page(virt_to_page(spte));
+                               break;
+                       }
+               }
+       }
+
+       if (!spte)
+               goto out;
+
+       spin_lock(&mm->page_table_lock);
+       if (pud_none(*pud))
+               pud_populate(mm, pud,
+                               (pmd_t *)((unsigned long)spte & PAGE_MASK));
+       else
+               put_page(virt_to_page(spte));
+       spin_unlock(&mm->page_table_lock);
+out:
+       pte = (pte_t *)pmd_alloc(mm, pud, addr);
+       mutex_unlock(&mapping->i_mmap_mutex);
+       return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *         0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       pgd_t *pgd = pgd_offset(mm, *addr);
+       pud_t *pud = pud_offset(pgd, *addr);
+
+       BUG_ON(page_count(virt_to_page(ptep)) == 0);
+       if (page_count(virt_to_page(ptep)) == 1)
+               return 0;
+
+       pud_clear(pud);
+       put_page(virt_to_page(ptep));
+       *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+       return 1;
+}
+#define want_pmd_share()       (1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+       return NULL;
+}
+#define want_pmd_share()       (0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+                       unsigned long addr, unsigned long sz)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pte_t *pte = NULL;
+
+       pgd = pgd_offset(mm, addr);
+       pud = pud_alloc(mm, pgd, addr);
+       if (pud) {
+               if (sz == PUD_SIZE) {
+                       pte = (pte_t *)pud;
+               } else {
+                       BUG_ON(sz != PMD_SIZE);
+                       if (want_pmd_share() && pud_none(*pud))
+                               pte = huge_pmd_share(mm, addr, pud);
+                       else
+                               pte = (pte_t *)pmd_alloc(mm, pud, addr);
+               }
+       }
+       BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+       return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd = NULL;
+
+       pgd = pgd_offset(mm, addr);
+       if (pgd_present(*pgd)) {
+               pud = pud_offset(pgd, addr);
+               if (pud_present(*pud)) {
+                       if (pud_huge(*pud))
+                               return (pte_t *)pud;
+                       pmd = pmd_offset(pud, addr);
+               }
+       }
+       return (pte_t *) pmd;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+               pmd_t *pmd, int write)
+{
+       struct page *page;
+
+       page = pte_page(*(pte_t *)pmd);
+       if (page)
+               page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+       return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+               pud_t *pud, int write)
+{
+       struct page *page;
+
+       page = pte_page(*(pte_t *)pud);
+       if (page)
+               page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+       return page;
+}
+
+#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+              pud_t *pud, int write)
+{
+       BUG();
+       return NULL;
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
 #ifdef CONFIG_MEMORY_FAILURE
 
 /* Should be called in hugetlb_lock */