mm/memory-failure.c  [firefly-linux-kernel-4.4.55.git]
index ceb0c7f1932f2e97c18cb1971f5b0593d57172ab..750b7893ee3ac840205299417ac1600c0cc16ebc 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
  * this code has to be extremely careful. Generally it tries to use 
  * normal locking rules, as in get the standard locks, even if that means 
  * the error handling takes potentially a long time.
+ *
+ * It can be very tempting to add handling for obscure cases here.
+ * In general any code for handling new cases should only be added iff:
+ * - You know how to test it.
+ * - You have a test that can be added to mce-test
+ *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
+ * - The case actually shows up as a frequent (top 10) page state in
+ *   tools/vm/page-types when running a real workload.
  * 
  * There are several operations here with exponential complexity because
  * of unsuitable VM data structures. For example the operation to map back 
  * from RMAP chains to processes has to walk the complete process list and
  * has non linear complexity with the number. But since memory corruptions
  * are rare we hope to get away with this. This avoids impacting the core 
  * VM.
  */
-
-/*
- * Notebook:
- * - hugetlb needs more code
- * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
- * - pass bad pages to kdump next kernel
- */
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
@@ -55,7 +56,9 @@
 #include <linux/memory_hotplug.h>
 #include <linux/mm_inline.h>
 #include <linux/kfifo.h>
+#include <linux/ratelimit.h>
 #include "internal.h"
+#include "ras/ras_event.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
 
@@ -128,31 +131,15 @@ static int hwpoison_filter_flags(struct page *p)
  * can only guarantee that the page either belongs to the memcg tasks, or is
  * a freed page.
  */
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
 u64 hwpoison_filter_memcg;
 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 static int hwpoison_filter_task(struct page *p)
 {
-       struct mem_cgroup *mem;
-       struct cgroup_subsys_state *css;
-       unsigned long ino;
-
        if (!hwpoison_filter_memcg)
                return 0;
 
-       mem = try_get_mem_cgroup_from_page(p);
-       if (!mem)
-               return -EINVAL;
-
-       css = mem_cgroup_css(mem);
-       /* root_mem_cgroup has NULL dentries */
-       if (!css->cgroup->dentry)
-               return -EINVAL;
-
-       ino = css->cgroup->dentry->d_inode->i_ino;
-       css_put(css);
-
-       if (ino != hwpoison_filter_memcg)
+       if (page_cgroup_ino(p) != hwpoison_filter_memcg)
                return -EINVAL;
 
        return 0;
@@ -206,11 +193,11 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
 #ifdef __ARCH_SI_TRAPNO
        si.si_trapno = trapno;
 #endif
-       si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
+       si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
 
-       if ((flags & MF_ACTION_REQUIRED) && t == current) {
+       if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
                si.si_code = BUS_MCEERR_AR;
-               ret = force_sig_info(SIGBUS, &si, t);
+               ret = force_sig_info(SIGBUS, &si, current);
        } else {
                /*
                 * Don't use force here, it's convenient if the signal
@@ -237,27 +224,17 @@ void shake_page(struct page *p, int access)
                lru_add_drain_all();
                if (PageLRU(p))
                        return;
-               drain_all_pages();
+               drain_all_pages(page_zone(p));
                if (PageLRU(p) || is_free_buddy_page(p))
                        return;
        }
 
        /*
-        * Only call shrink_slab here (which would also shrink other caches) if
-        * access is not potentially fatal.
+        * Only call shrink_node_slabs here (which would also shrink
+        * other caches) if access is not potentially fatal.
         */
-       if (access) {
-               int nr;
-               do {
-                       struct shrink_control shrink = {
-                               .gfp_mask = GFP_KERNEL,
-                       };
-
-                       nr = shrink_slab(&shrink, 1000, 1000);
-                       if (page_count(p) == 1)
-                               break;
-               } while (nr > 10);
-       }
+       if (access)
+               drop_slab_node(page_to_nid(p));
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
@@ -382,20 +359,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
        }
 }
 
-static int task_early_kill(struct task_struct *tsk)
+/*
+ * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
+ * on behalf of the thread group. Return task_struct of the (first found)
+ * dedicated thread if found, and return NULL otherwise.
+ *
+ * We already hold read_lock(&tasklist_lock) in the caller, so we don't
+ * have to call rcu_read_lock/unlock() in this function.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
 {
+       struct task_struct *t;
+
+       for_each_thread(tsk, t)
+               if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
+                       return t;
+       return NULL;
+}
+
+/*
+ * Determine whether a given process is "early kill" process which expects
+ * to be signaled when some page under the process is hwpoisoned.
+ * Return task_struct of the dedicated thread (main thread unless explicitly
+ * specified) if the process is "early kill," and otherwise returns NULL.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+                                          int force_early)
+{
+       struct task_struct *t;
        if (!tsk->mm)
-               return 0;
-       if (tsk->flags & PF_MCE_PROCESS)
-               return !!(tsk->flags & PF_MCE_EARLY);
-       return sysctl_memory_failure_early_kill;
+               return NULL;
+       if (force_early)
+               return tsk;
+       t = find_early_kill_thread(tsk);
+       if (t)
+               return t;
+       if (sysctl_memory_failure_early_kill)
+               return tsk;
+       return NULL;
 }
 
 /*
  * Collect processes when the error hit an anonymous page.
  */
 static void collect_procs_anon(struct page *page, struct list_head *to_kill,
-                             struct to_kill **tkc)
+                             struct to_kill **tkc, int force_early)
 {
        struct vm_area_struct *vma;
        struct task_struct *tsk;
@@ -406,20 +414,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
        if (av == NULL) /* Not actually mapped anymore */
                return;
 
-       pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+       pgoff = page_to_pgoff(page);
        read_lock(&tasklist_lock);
        for_each_process (tsk) {
                struct anon_vma_chain *vmac;
+               struct task_struct *t = task_early_kill(tsk, force_early);
 
-               if (!task_early_kill(tsk))
+               if (!t)
                        continue;
                anon_vma_interval_tree_foreach(vmac, &av->rb_root,
                                               pgoff, pgoff) {
                        vma = vmac->vma;
                        if (!page_mapped_in_vma(page, vma))
                                continue;
-                       if (vma->vm_mm == tsk->mm)
-                               add_to_kill(tsk, page, vma, to_kill, tkc);
+                       if (vma->vm_mm == t->mm)
+                               add_to_kill(t, page, vma, to_kill, tkc);
                }
        }
        read_unlock(&tasklist_lock);
@@ -430,20 +439,20 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
  * Collect processes when the error hit a file mapped page.
  */
 static void collect_procs_file(struct page *page, struct list_head *to_kill,
-                             struct to_kill **tkc)
+                             struct to_kill **tkc, int force_early)
 {
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct address_space *mapping = page->mapping;
 
-       mutex_lock(&mapping->i_mmap_mutex);
+       i_mmap_lock_read(mapping);
        read_lock(&tasklist_lock);
        for_each_process(tsk) {
-               pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+               pgoff_t pgoff = page_to_pgoff(page);
+               struct task_struct *t = task_early_kill(tsk, force_early);
 
-               if (!task_early_kill(tsk))
+               if (!t)
                        continue;
-
                vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
                                      pgoff) {
                        /*
@@ -453,12 +462,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
                         * Assume applications who requested early kill want
                         * to be informed of all such data corruptions.
                         */
-                       if (vma->vm_mm == tsk->mm)
-                               add_to_kill(tsk, page, vma, to_kill, tkc);
+                       if (vma->vm_mm == t->mm)
+                               add_to_kill(t, page, vma, to_kill, tkc);
                }
        }
        read_unlock(&tasklist_lock);
-       mutex_unlock(&mapping->i_mmap_mutex);
+       i_mmap_unlock_read(mapping);
 }
 
 /*
@@ -467,7 +476,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
  * First preallocate one tokill structure outside the spin locks,
  * so that we can kill at least one process reasonably reliable.
  */
-static void collect_procs(struct page *page, struct list_head *tokill)
+static void collect_procs(struct page *page, struct list_head *tokill,
+                               int force_early)
 {
        struct to_kill *tk;
 
@@ -478,28 +488,40 @@ static void collect_procs(struct page *page, struct list_head *tokill)
        if (!tk)
                return;
        if (PageAnon(page))
-               collect_procs_anon(page, tokill, &tk);
+               collect_procs_anon(page, tokill, &tk, force_early);
        else
-               collect_procs_file(page, tokill, &tk);
+               collect_procs_file(page, tokill, &tk, force_early);
        kfree(tk);
 }
 
-/*
- * Error handlers for various types of pages.
- */
-
-enum outcome {
-       IGNORED,        /* Error: cannot be handled */
-       FAILED,         /* Error: handling failed */
-       DELAYED,        /* Will be handled later */
-       RECOVERED,      /* Successfully recovered */
+static const char *action_name[] = {
+       [MF_IGNORED] = "Ignored",
+       [MF_FAILED] = "Failed",
+       [MF_DELAYED] = "Delayed",
+       [MF_RECOVERED] = "Recovered",
 };
 
-static const char *action_name[] = {
-       [IGNORED] = "Ignored",
-       [FAILED] = "Failed",
-       [DELAYED] = "Delayed",
-       [RECOVERED] = "Recovered",
+static const char * const action_page_types[] = {
+       [MF_MSG_KERNEL]                 = "reserved kernel page",
+       [MF_MSG_KERNEL_HIGH_ORDER]      = "high-order kernel page",
+       [MF_MSG_SLAB]                   = "kernel slab page",
+       [MF_MSG_DIFFERENT_COMPOUND]     = "different compound page after locking",
+       [MF_MSG_POISONED_HUGE]          = "huge page already hardware poisoned",
+       [MF_MSG_HUGE]                   = "huge page",
+       [MF_MSG_FREE_HUGE]              = "free huge page",
+       [MF_MSG_UNMAP_FAILED]           = "unmapping failed page",
+       [MF_MSG_DIRTY_SWAPCACHE]        = "dirty swapcache page",
+       [MF_MSG_CLEAN_SWAPCACHE]        = "clean swapcache page",
+       [MF_MSG_DIRTY_MLOCKED_LRU]      = "dirty mlocked LRU page",
+       [MF_MSG_CLEAN_MLOCKED_LRU]      = "clean mlocked LRU page",
+       [MF_MSG_DIRTY_UNEVICTABLE_LRU]  = "dirty unevictable LRU page",
+       [MF_MSG_CLEAN_UNEVICTABLE_LRU]  = "clean unevictable LRU page",
+       [MF_MSG_DIRTY_LRU]              = "dirty LRU page",
+       [MF_MSG_CLEAN_LRU]              = "clean LRU page",
+       [MF_MSG_TRUNCATED_LRU]          = "already truncated LRU page",
+       [MF_MSG_BUDDY]                  = "free buddy page",
+       [MF_MSG_BUDDY_2ND]              = "free buddy page (2nd try)",
+       [MF_MSG_UNKNOWN]                = "unknown page",
 };
 
 /*
@@ -533,7 +555,7 @@ static int delete_from_lru_cache(struct page *p)
  */
 static int me_kernel(struct page *p, unsigned long pfn)
 {
-       return IGNORED;
+       return MF_IGNORED;
 }
 
 /*
@@ -542,7 +564,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
 static int me_unknown(struct page *p, unsigned long pfn)
 {
        printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
-       return FAILED;
+       return MF_FAILED;
 }
 
 /*
@@ -551,7 +573,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
 static int me_pagecache_clean(struct page *p, unsigned long pfn)
 {
        int err;
-       int ret = FAILED;
+       int ret = MF_FAILED;
        struct address_space *mapping;
 
        delete_from_lru_cache(p);
@@ -561,7 +583,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
         * should be the one m_f() holds.
         */
        if (PageAnon(p))
-               return RECOVERED;
+               return MF_RECOVERED;
 
        /*
         * Now truncate the page in the page cache. This is really
@@ -575,7 +597,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                /*
                 * Page has been teared down in the meanwhile
                 */
-               return FAILED;
+               return MF_FAILED;
        }
 
        /*
@@ -592,7 +614,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                                !try_to_release_page(p, GFP_NOIO)) {
                        pr_info("MCE %#lx: failed to release buffers\n", pfn);
                } else {
-                       ret = RECOVERED;
+                       ret = MF_RECOVERED;
                }
        } else {
                /*
@@ -600,7 +622,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                 * This fails on dirty or anything with private pages
                 */
                if (invalidate_inode_page(p))
-                       ret = RECOVERED;
+                       ret = MF_RECOVERED;
                else
                        printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
                                pfn);
@@ -609,7 +631,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 }
 
 /*
- * Dirty cache page page
+ * Dirty pagecache page
  * Issues: when the error hit a hole page the error is not properly
  * propagated.
  */
@@ -686,9 +708,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
        ClearPageUptodate(p);
 
        if (!delete_from_lru_cache(p))
-               return DELAYED;
+               return MF_DELAYED;
        else
-               return FAILED;
+               return MF_FAILED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
@@ -696,9 +718,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
        delete_from_swap_cache(p);
 
        if (!delete_from_lru_cache(p))
-               return RECOVERED;
+               return MF_RECOVERED;
        else
-               return FAILED;
+               return MF_FAILED;
 }
 
 /*
@@ -711,6 +733,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 {
        int res = 0;
        struct page *hpage = compound_head(p);
+
+       if (!PageHuge(hpage))
+               return MF_DELAYED;
+
        /*
         * We can safely recover from error on free or reserved (i.e.
         * not in-use) hugepage by dequeuing it from freelist.
@@ -724,9 +750,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
        if (!(page_mapping(hpage) || PageAnon(hpage))) {
                res = dequeue_hwpoisoned_huge_page(hpage);
                if (!res)
-                       return RECOVERED;
+                       return MF_RECOVERED;
        }
-       return DELAYED;
+       return MF_DELAYED;
 }
 
 /*
@@ -750,18 +776,16 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 #define lru            (1UL << PG_lru)
 #define swapbacked     (1UL << PG_swapbacked)
 #define head           (1UL << PG_head)
-#define tail           (1UL << PG_tail)
-#define compound       (1UL << PG_compound)
 #define slab           (1UL << PG_slab)
 #define reserved       (1UL << PG_reserved)
 
 static struct page_state {
        unsigned long mask;
        unsigned long res;
-       char *msg;
+       enum mf_action_page_type type;
        int (*action)(struct page *p, unsigned long pfn);
 } error_states[] = {
-       { reserved,     reserved,       "reserved kernel",      me_kernel },
+       { reserved,     reserved,       MF_MSG_KERNEL,  me_kernel },
        /*
         * free pages are specially detected outside this table:
         * PG_buddy pages only make a small fraction of all free pages.
@@ -772,31 +796,26 @@ static struct page_state {
         * currently unused objects without touching them. But just
         * treat it as standard kernel for now.
         */
-       { slab,         slab,           "kernel slab",  me_kernel },
+       { slab,         slab,           MF_MSG_SLAB,    me_kernel },
 
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
-       { head,         head,           "huge",         me_huge_page },
-       { tail,         tail,           "huge",         me_huge_page },
-#else
-       { compound,     compound,       "huge",         me_huge_page },
-#endif
+       { head,         head,           MF_MSG_HUGE,            me_huge_page },
 
-       { sc|dirty,     sc|dirty,       "dirty swapcache",      me_swapcache_dirty },
-       { sc|dirty,     sc,             "clean swapcache",      me_swapcache_clean },
+       { sc|dirty,     sc|dirty,       MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
+       { sc|dirty,     sc,             MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
 
-       { mlock|dirty,  mlock|dirty,    "dirty mlocked LRU",    me_pagecache_dirty },
-       { mlock|dirty,  mlock,          "clean mlocked LRU",    me_pagecache_clean },
+       { mlock|dirty,  mlock|dirty,    MF_MSG_DIRTY_MLOCKED_LRU,       me_pagecache_dirty },
+       { mlock|dirty,  mlock,          MF_MSG_CLEAN_MLOCKED_LRU,       me_pagecache_clean },
 
-       { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
-       { unevict|dirty, unevict,       "clean unevictable LRU", me_pagecache_clean },
+       { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU,   me_pagecache_dirty },
+       { unevict|dirty, unevict,       MF_MSG_CLEAN_UNEVICTABLE_LRU,   me_pagecache_clean },
 
-       { lru|dirty,    lru|dirty,      "dirty LRU",    me_pagecache_dirty },
-       { lru|dirty,    lru,            "clean LRU",    me_pagecache_clean },
+       { lru|dirty,    lru|dirty,      MF_MSG_DIRTY_LRU,       me_pagecache_dirty },
+       { lru|dirty,    lru,            MF_MSG_CLEAN_LRU,       me_pagecache_clean },
 
        /*
         * Catchall entry: must be at end.
         */
-       { 0,            0,              "unknown page state",   me_unknown },
+       { 0,            0,              MF_MSG_UNKNOWN, me_unknown },
 };
 
 #undef dirty
@@ -816,10 +835,13 @@ static struct page_state {
  * "Dirty/Clean" indication is not 100% accurate due to the possibility of
  * setting PG_dirty outside page lock. See also comment above set_page_dirty().
  */
-static void action_result(unsigned long pfn, char *msg, int result)
+static void action_result(unsigned long pfn, enum mf_action_page_type type,
+                         enum mf_result result)
 {
-       pr_err("MCE %#lx: %s page recovery: %s\n",
-               pfn, msg, action_name[result]);
+       trace_memory_failure_event(pfn, type, result);
+
+       pr_err("MCE %#lx: recovery action for %s: %s\n",
+               pfn, action_page_types[type], action_name[result]);
 }
 
 static int page_action(struct page_state *ps, struct page *p,
@@ -829,43 +851,114 @@ static int page_action(struct page_state *ps, struct page *p,
        int count;
 
        result = ps->action(p, pfn);
-       action_result(pfn, ps->msg, result);
 
        count = page_count(p) - 1;
-       if (ps->action == me_swapcache_dirty && result == DELAYED)
+       if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
                count--;
        if (count != 0) {
                printk(KERN_ERR
-                      "MCE %#lx: %s page still referenced by %d users\n",
-                      pfn, ps->msg, count);
-               result = FAILED;
+                      "MCE %#lx: %s still referenced by %d users\n",
+                      pfn, action_page_types[ps->type], count);
+               result = MF_FAILED;
        }
+       action_result(pfn, ps->type, result);
 
        /* Could do more checks here if page looks ok */
        /*
         * Could adjust zone counters here to correct for the missing page.
         */
 
-       return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
+       return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
 }
 
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling:
+ * @page:      raw error page (hit by memory error)
+ *
+ * Return: return 0 if failed to grab the refcount, otherwise true (some
+ * non-zero value.)
+ */
+int get_hwpoison_page(struct page *page)
+{
+       struct page *head = compound_head(page);
+
+       if (PageHuge(head))
+               return get_page_unless_zero(head);
+
+       /*
+        * Thp tail page has special refcounting rule (refcount of tail pages
+        * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
+        * directly for tail pages.
+        */
+       if (PageTransHuge(head)) {
+               /*
+                * Non anonymous thp exists only in allocation/free time. We
+                * can't handle such a case correctly, so let's give it up.
+                * This should be better than triggering BUG_ON when kernel
+                * tries to touch the "partially handled" page.
+                */
+               if (!PageAnon(head)) {
+                       pr_err("MCE: %#lx: non anonymous thp\n",
+                               page_to_pfn(page));
+                       return 0;
+               }
+
+               if (get_page_unless_zero(head)) {
+                       if (PageTail(page))
+                               get_page(page);
+                       return 1;
+               } else {
+                       return 0;
+               }
+       }
+
+       return get_page_unless_zero(page);
+}
+EXPORT_SYMBOL_GPL(get_hwpoison_page);
+
+/**
+ * put_hwpoison_page() - Put refcount for memory error handling:
+ * @page:      raw error page (hit by memory error)
+ */
+void put_hwpoison_page(struct page *page)
+{
+       struct page *head = compound_head(page);
+
+       if (PageHuge(head)) {
+               put_page(head);
+               return;
+       }
+
+       if (PageTransHuge(head))
+               if (page != head)
+                       put_page(head);
+
+       put_page(page);
+}
+EXPORT_SYMBOL_GPL(put_hwpoison_page);
+
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
  */
 static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
-                                 int trapno, int flags)
+                                 int trapno, int flags, struct page **hpagep)
 {
        enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
        struct address_space *mapping;
        LIST_HEAD(tokill);
        int ret;
        int kill = 1, forcekill;
-       struct page *hpage = compound_head(p);
-       struct page *ppage;
+       struct page *hpage = *hpagep;
 
+       /*
+        * Here we are interested only in user-mapped pages, so skip any
+        * other types of pages.
+        */
        if (PageReserved(p) || PageSlab(p))
                return SWAP_SUCCESS;
+       if (!(PageLRU(hpage) || PageHuge(p)))
+               return SWAP_SUCCESS;
 
        /*
         * This check implies we don't kill processes if their pages
@@ -874,8 +967,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        if (!page_mapped(hpage))
                return SWAP_SUCCESS;
 
-       if (PageKsm(p))
+       if (PageKsm(p)) {
+               pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
                return SWAP_FAIL;
+       }
 
        if (PageSwapCache(p)) {
                printk(KERN_ERR
@@ -903,44 +998,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
                }
        }
 
-       /*
-        * ppage: poisoned page
-        *   if p is regular page(4k page)
-        *        ppage == real poisoned page;
-        *   else p is hugetlb or THP, ppage == head page.
-        */
-       ppage = hpage;
-
-       if (PageTransHuge(hpage)) {
-               /*
-                * Verify that this isn't a hugetlbfs head page, the check for
-                * PageAnon is just for avoid tripping a split_huge_page
-                * internal debug check, as split_huge_page refuses to deal with
-                * anything that isn't an anon page. PageAnon can't go away fro
-                * under us because we hold a refcount on the hpage, without a
-                * refcount on the hpage. split_huge_page can't be safely called
-                * in the first place, having a refcount on the tail isn't
-                * enough * to be safe.
-                */
-               if (!PageHuge(hpage) && PageAnon(hpage)) {
-                       if (unlikely(split_huge_page(hpage))) {
-                               /*
-                                * FIXME: if splitting THP is failed, it is
-                                * better to stop the following operation rather
-                                * than causing panic by unmapping. System might
-                                * survive if the page is freed later.
-                                */
-                               printk(KERN_INFO
-                                       "MCE %#lx: failed to split THP\n", pfn);
-
-                               BUG_ON(!PageHWPoison(p));
-                               return SWAP_FAIL;
-                       }
-                       /* THP is split, so ppage should be the real poisoned page. */
-                       ppage = p;
-               }
-       }
-
        /*
         * First collect all the processes that have the page
         * mapped in dirty form.  This has to be done before try_to_unmap,
@@ -950,18 +1007,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * there's nothing that can be done.
         */
        if (kill)
-               collect_procs(ppage, &tokill);
+               collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-       if (hpage != ppage)
-               lock_page(ppage);
-
-       ret = try_to_unmap(ppage, ttu);
+       ret = try_to_unmap(hpage, ttu);
        if (ret != SWAP_SUCCESS)
                printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-                               pfn, page_mapcount(ppage));
-
-       if (hpage != ppage)
-               unlock_page(ppage);
+                               pfn, page_mapcount(hpage));
 
        /*
         * Now that the dirty bit has been propagated to the
@@ -973,7 +1024,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * use a more force-full uncatchable kill to prevent
         * any accesses to the poisoned memory.
         */
-       forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+       forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
        kill_procs(&tokill, forcekill, trapno,
                      ret != SWAP_SUCCESS, p, pfn, flags);
 
@@ -983,7 +1034,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 static void set_page_hwpoison_huge_page(struct page *hpage)
 {
        int i;
-       int nr_pages = 1 << compound_trans_order(hpage);
+       int nr_pages = 1 << compound_order(hpage);
        for (i = 0; i < nr_pages; i++)
                SetPageHWPoison(hpage + i);
 }
@@ -991,7 +1042,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
 static void clear_page_hwpoison_huge_page(struct page *hpage)
 {
        int i;
-       int nr_pages = 1 << compound_trans_order(hpage);
+       int nr_pages = 1 << compound_order(hpage);
        for (i = 0; i < nr_pages; i++)
                ClearPageHWPoison(hpage + i);
 }
@@ -1019,6 +1070,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        struct page_state *ps;
        struct page *p;
        struct page *hpage;
+       struct page *orig_head;
        int res;
        unsigned int nr_pages;
        unsigned long page_flags;
@@ -1034,7 +1086,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        }
 
        p = pfn_to_page(pfn);
-       hpage = compound_head(p);
+       orig_head = hpage = compound_head(p);
        if (TestSetPageHWPoison(p)) {
                printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
                return 0;
@@ -1051,7 +1103,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                nr_pages = 1 << compound_order(hpage);
        else /* normal page or thp */
                nr_pages = 1;
-       atomic_long_add(nr_pages, &num_poisoned_pages);
+       num_poisoned_pages_add(nr_pages);
 
        /*
         * We need/can do nothing about count=0 pages.
@@ -1067,33 +1119,48 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * In fact it's dangerous to directly bump up page count from 0,
         * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
         */
-       if (!(flags & MF_COUNT_INCREASED) &&
-               !get_page_unless_zero(hpage)) {
+       if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
                if (is_free_buddy_page(p)) {
-                       action_result(pfn, "free buddy", DELAYED);
+                       action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
                        return 0;
                } else if (PageHuge(hpage)) {
                        /*
-                        * Check "just unpoisoned", "filter hit", and
-                        * "race with other subpage."
+                        * Check "filter hit" and "race with other subpage."
                         */
                        lock_page(hpage);
-                       if (!PageHWPoison(hpage)
-                           || (hwpoison_filter(p) && TestClearPageHWPoison(p))
-                           || (p != hpage && TestSetPageHWPoison(hpage))) {
-                               atomic_long_sub(nr_pages, &num_poisoned_pages);
-                               return 0;
+                       if (PageHWPoison(hpage)) {
+                               if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+                                   || (p != hpage && TestSetPageHWPoison(hpage))) {
+                                       num_poisoned_pages_sub(nr_pages);
+                                       unlock_page(hpage);
+                                       return 0;
+                               }
                        }
                        set_page_hwpoison_huge_page(hpage);
                        res = dequeue_hwpoisoned_huge_page(hpage);
-                       action_result(pfn, "free huge",
-                                     res ? IGNORED : DELAYED);
+                       action_result(pfn, MF_MSG_FREE_HUGE,
+                                     res ? MF_IGNORED : MF_DELAYED);
                        unlock_page(hpage);
                        return res;
                } else {
-                       action_result(pfn, "high order kernel", IGNORED);
+                       action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
+                       return -EBUSY;
+               }
+       }
+
+       if (!PageHuge(p) && PageTransHuge(hpage)) {
+               if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+                       if (!PageAnon(hpage))
+                               pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+                       else
+                               pr_err("MCE: %#lx: thp split failed\n", pfn);
+                       if (TestClearPageHWPoison(p))
+                               num_poisoned_pages_sub(nr_pages);
+                       put_hwpoison_page(p);
                        return -EBUSY;
                }
+               VM_BUG_ON_PAGE(!page_count(p), p);
+               hpage = compound_head(p);
        }
 
        /*
@@ -1104,7 +1171,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * The check (unnecessarily) ignores LRU pages being isolated and
         * walked by the page reclaim code, however that's not a big loss.
         */
-       if (!PageHuge(p) && !PageTransTail(p)) {
+       if (!PageHuge(p)) {
                if (!PageLRU(p))
                        shake_page(p, 0);
                if (!PageLRU(p)) {
@@ -1112,22 +1179,27 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                         * shake_page could have turned it free.
                         */
                        if (is_free_buddy_page(p)) {
-                               action_result(pfn, "free buddy, 2nd try",
-                                               DELAYED);
+                               if (flags & MF_COUNT_INCREASED)
+                                       action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
+                               else
+                                       action_result(pfn, MF_MSG_BUDDY_2ND,
+                                                     MF_DELAYED);
                                return 0;
                        }
-                       action_result(pfn, "non LRU", IGNORED);
-                       put_page(p);
-                       return -EBUSY;
                }
        }
 
+       lock_page(hpage);
+
        /*
-        * Lock the page and wait for writeback to finish.
-        * It's very difficult to mess with pages currently under IO
-        * and in many cases impossible, so we just avoid it here.
+        * The page could have changed compound pages during the locking.
+        * If this happens just bail out.
         */
-       lock_page(hpage);
+       if (PageCompound(p) && compound_head(p) != orig_head) {
+               action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
+               res = -EBUSY;
+               goto out;
+       }
 
        /*
         * We use page flags to determine what action should be taken, but
@@ -1143,26 +1215,30 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         */
        if (!PageHWPoison(p)) {
                printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
-               res = 0;
-               goto out;
+               num_poisoned_pages_sub(nr_pages);
+               unlock_page(hpage);
+               put_hwpoison_page(hpage);
+               return 0;
        }
        if (hwpoison_filter(p)) {
                if (TestClearPageHWPoison(p))
-                       atomic_long_sub(nr_pages, &num_poisoned_pages);
+                       num_poisoned_pages_sub(nr_pages);
                unlock_page(hpage);
-               put_page(hpage);
+               put_hwpoison_page(hpage);
                return 0;
        }
 
+       if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+               goto identify_page_state;
+
        /*
         * For error on the tail page, we should set PG_hwpoison
         * on the head page to show that the hugepage is hwpoisoned
         */
        if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
-               action_result(pfn, "hugepage already hardware poisoned",
-                               IGNORED);
+               action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
                unlock_page(hpage);
-               put_page(hpage);
+               put_hwpoison_page(hpage);
                return 0;
        }
        /*
@@ -1174,14 +1250,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        if (PageHuge(p))
                set_page_hwpoison_huge_page(hpage);
 
+       /*
+        * It's very difficult to mess with pages currently under IO
+        * and in many cases impossible, so we just avoid it here.
+        */
        wait_on_page_writeback(p);
 
        /*
         * Now take care of user space mappings.
         * Abort on fail: __delete_from_page_cache() assumes unmapped page.
+        *
+        * When the raw error page is thp tail page, hpage points to the raw
+        * page after thp split.
         */
-       if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
-               printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
+       if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+           != SWAP_SUCCESS) {
+               action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
                res = -EBUSY;
                goto out;
        }
@@ -1190,11 +1274,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * Torn down by someone else?
         */
        if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
-               action_result(pfn, "already truncated LRU", IGNORED);
+               action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
                res = -EBUSY;
                goto out;
        }
 
+identify_page_state:
        res = -EBUSY;
        /*
         * The first check uses the current page flags which may not have any
@@ -1204,6 +1289,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        for (ps = error_states;; ps++)
                if ((p->flags & ps->mask) == ps->res)
                        break;
+
+       page_flags |= (p->flags & (1UL << PG_dirty));
+
        if (!ps->mask)
                for (ps = error_states;; ps++)
                        if ((page_flags & ps->mask) == ps->res)
@@ -1262,10 +1350,10 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags)
 
        mf_cpu = &get_cpu_var(memory_failure_cpu);
        spin_lock_irqsave(&mf_cpu->lock, proc_flags);
-       if (kfifo_put(&mf_cpu->fifo, &entry))
+       if (kfifo_put(&mf_cpu->fifo, entry))
                schedule_work_on(smp_processor_id(), &mf_cpu->work);
        else
-               pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
+               pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
                       pfn);
        spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
        put_cpu_var(memory_failure_cpu);
@@ -1279,14 +1367,17 @@ static void memory_failure_work_func(struct work_struct *work)
        unsigned long proc_flags;
        int gotten;
 
-       mf_cpu = &__get_cpu_var(memory_failure_cpu);
+       mf_cpu = this_cpu_ptr(&memory_failure_cpu);
        for (;;) {
                spin_lock_irqsave(&mf_cpu->lock, proc_flags);
                gotten = kfifo_get(&mf_cpu->fifo, &entry);
                spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
                if (!gotten)
                        break;
-               memory_failure(entry.pfn, entry.trapno, entry.flags);
+               if (entry.flags & MF_SOFT_OFFLINE)
+                       soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+               else
+                       memory_failure(entry.pfn, entry.trapno, entry.flags);
        }
 }
 
@@ -1306,6 +1397,12 @@ static int __init memory_failure_init(void)
 }
 core_initcall(memory_failure_init);
 
+#define unpoison_pr_info(fmt, pfn, rs)                 \
+({                                                     \
+       if (__ratelimit(rs))                            \
+               pr_info(fmt, pfn);                      \
+})
+
 /**
  * unpoison_memory - Unpoison a previously poisoned page
  * @pfn: Page number of the to be unpoisoned page
@@ -1324,6 +1421,8 @@ int unpoison_memory(unsigned long pfn)
        struct page *p;
        int freeit = 0;
        unsigned int nr_pages;
+       static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                       DEFAULT_RATELIMIT_BURST);
 
        if (!pfn_valid(pfn))
                return -ENXIO;
@@ -1332,13 +1431,43 @@ int unpoison_memory(unsigned long pfn)
        page = compound_head(p);
 
        if (!PageHWPoison(p)) {
-               pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
+               unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
+                                pfn, &unpoison_rs);
+               return 0;
+       }
+
+       if (page_count(page) > 1) {
+               unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
+                                pfn, &unpoison_rs);
+               return 0;
+       }
+
+       if (page_mapped(page)) {
+               unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
+                                pfn, &unpoison_rs);
                return 0;
        }
 
-       nr_pages = 1 << compound_trans_order(page);
+       if (page_mapping(page)) {
+               unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+                                pfn, &unpoison_rs);
+               return 0;
+       }
 
-       if (!get_page_unless_zero(page)) {
+       /*
+        * unpoison_memory() can encounter thp only when the thp is being
+        * worked by memory_failure() and the page lock is not held yet.
+        * In such case, we yield to memory_failure() and make unpoison fail.
+        */
+       if (!PageHuge(page) && PageTransHuge(page)) {
+               unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
+                                pfn, &unpoison_rs);
+               return 0;
+       }
+
+       nr_pages = 1 << compound_order(page);
+
+       if (!get_hwpoison_page(p)) {
                /*
                 * Since HWPoisoned hugepage should have non-zero refcount,
                 * race between memory failure and unpoison seems to happen.
@@ -1346,12 +1475,14 @@ int unpoison_memory(unsigned long pfn)
                 * to the end.
                 */
                if (PageHuge(page)) {
-                       pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+                       unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
+                                        pfn, &unpoison_rs);
                        return 0;
                }
                if (TestClearPageHWPoison(p))
-                       atomic_long_sub(nr_pages, &num_poisoned_pages);
-               pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
+                       num_poisoned_pages_dec();
+               unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
+                                pfn, &unpoison_rs);
                return 0;
        }
 
@@ -1363,17 +1494,18 @@ int unpoison_memory(unsigned long pfn)
         * the free buddy page pool.
         */
        if (TestClearPageHWPoison(page)) {
-               pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-               atomic_long_sub(nr_pages, &num_poisoned_pages);
+               unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
+                                pfn, &unpoison_rs);
+               num_poisoned_pages_sub(nr_pages);
                freeit = 1;
                if (PageHuge(page))
                        clear_page_hwpoison_huge_page(page);
        }
        unlock_page(page);
 
-       put_page(page);
-       if (freeit)
-               put_page(page);
+       put_hwpoison_page(page);
+       if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
+               put_hwpoison_page(page);
 
        return 0;
 }
@@ -1386,7 +1518,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
                return alloc_huge_page_node(page_hstate(compound_head(p)),
                                                   nid);
        else
-               return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+               return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -1402,22 +1534,11 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
        if (flags & MF_COUNT_INCREASED)
                return 1;
 
-       /*
-        * The lock_memory_hotplug prevents a race with memory hotplug.
-        * This is a big hammer, a better would be nicer.
-        */
-       lock_memory_hotplug();
-
-       /*
-        * Isolate the page, so that it doesn't get reallocated if it
-        * was free.
-        */
-       set_migratetype_isolate(p, true);
        /*
         * When the target page is a free hugepage, just remove it
         * from free hugepage list.
         */
-       if (!get_page_unless_zero(compound_head(p))) {
+       if (!get_hwpoison_page(p)) {
                if (PageHuge(p)) {
                        pr_info("%s: %#lx free huge page\n", __func__, pfn);
                        ret = 0;
@@ -1433,8 +1554,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
                /* Not a free page */
                ret = 1;
        }
-       unset_migratetype_isolate(p, MIGRATE_MOVABLE);
-       unlock_memory_hotplug();
        return ret;
 }
 
@@ -1446,14 +1565,16 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
                /*
                 * Try to free it.
                 */
-               put_page(page);
+               put_hwpoison_page(page);
                shake_page(page, 1);
 
                /*
                 * Did it turn free?
                 */
                ret = __get_any_page(page, pfn, 0);
-               if (!PageLRU(page)) {
+               if (ret == 1 && !PageLRU(page)) {
+                       /* Drop page reference which is from __get_any_page() */
+                       put_hwpoison_page(page);
                        pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
                                pfn, page->flags);
                        return -EIO;
@@ -1467,6 +1588,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
        int ret;
        unsigned long pfn = page_to_pfn(page);
        struct page *hpage = compound_head(page);
+       LIST_HEAD(pagelist);
 
        /*
         * This double-check of PageHWPoison is to avoid the race with
@@ -1475,91 +1597,47 @@ static int soft_offline_huge_page(struct page *page, int flags)
        lock_page(hpage);
        if (PageHWPoison(hpage)) {
                unlock_page(hpage);
-               put_page(hpage);
+               put_hwpoison_page(hpage);
                pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
                return -EBUSY;
        }
        unlock_page(hpage);
 
-       /* Keep page count to indicate a given hugepage is isolated. */
-       ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
-                               MIGRATE_SYNC);
-       put_page(hpage);
+       ret = isolate_huge_page(hpage, &pagelist);
+       /*
+        * get_any_page() and isolate_huge_page() takes a refcount each,
+        * so need to drop one here.
+        */
+       put_hwpoison_page(hpage);
+       if (!ret) {
+               pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
+               return -EBUSY;
+       }
+
+       ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+                               MIGRATE_SYNC, MR_MEMORY_FAILURE);
        if (ret) {
                pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
                        pfn, ret, page->flags);
+               /*
+                * We know that soft_offline_huge_page() tries to migrate
+                * only one hugepage pointed to by hpage, so we need not
+                * run through the pagelist here.
+                */
+               putback_active_hugepage(hpage);
+               if (ret > 0)
+                       ret = -EIO;
        } else {
-               set_page_hwpoison_huge_page(hpage);
-               dequeue_hwpoisoned_huge_page(hpage);
-               atomic_long_add(1 << compound_trans_order(hpage),
-                               &num_poisoned_pages);
-       }
-       /* keep elevated page count for bad page */
-       return ret;
-}
-
-static int __soft_offline_page(struct page *page, int flags);
-
-/**
- * soft_offline_page - Soft offline a page.
- * @page: page to offline
- * @flags: flags. Same as memory_failure().
- *
- * Returns 0 on success, otherwise negated errno.
- *
- * Soft offline a page, by migration or invalidation,
- * without killing anything. This is for the case when
- * a page is not corrupted yet (so it's still valid to access),
- * but has had a number of corrected errors and is better taken
- * out.
- *
- * The actual policy on when to do that is maintained by
- * user space.
- *
- * This should never impact any application or cause data loss,
- * however it might take some time.
- *
- * This is not a 100% solution for all memory, but tries to be
- * ``good enough'' for the majority of memory.
- */
-int soft_offline_page(struct page *page, int flags)
-{
-       int ret;
-       unsigned long pfn = page_to_pfn(page);
-       struct page *hpage = compound_trans_head(page);
-
-       if (PageHWPoison(page)) {
-               pr_info("soft offline: %#lx page already poisoned\n", pfn);
-               return -EBUSY;
-       }
-       if (!PageHuge(page) && PageTransHuge(hpage)) {
-               if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
-                       pr_info("soft offline: %#lx: failed to split THP\n",
-                               pfn);
-                       return -EBUSY;
-               }
-       }
-
-       ret = get_any_page(page, pfn, flags);
-       if (ret < 0)
-               return ret;
-       if (ret) { /* for in-use pages */
-               if (PageHuge(page))
-                       ret = soft_offline_huge_page(page, flags);
-               else
-                       ret = __soft_offline_page(page, flags);
-       } else { /* for free pages */
+               /* overcommit hugetlb page will be freed to buddy */
                if (PageHuge(page)) {
                        set_page_hwpoison_huge_page(hpage);
                        dequeue_hwpoisoned_huge_page(hpage);
-                       atomic_long_add(1 << compound_trans_order(hpage),
-                                       &num_poisoned_pages);
+                       num_poisoned_pages_add(1 << compound_order(hpage));
                } else {
                        SetPageHWPoison(page);
-                       atomic_long_inc(&num_poisoned_pages);
+                       num_poisoned_pages_inc();
                }
        }
-       /* keep elevated page count for bad page */
        return ret;
 }
 
@@ -1578,7 +1656,7 @@ static int __soft_offline_page(struct page *page, int flags)
        wait_on_page_writeback(page);
        if (PageHWPoison(page)) {
                unlock_page(page);
-               put_page(page);
+               put_hwpoison_page(page);
                pr_info("soft offline: %#lx page already poisoned\n", pfn);
                return -EBUSY;
        }
@@ -1593,10 +1671,10 @@ static int __soft_offline_page(struct page *page, int flags)
         * would need to fix isolation locking first.
         */
        if (ret == 1) {
-               put_page(page);
+               put_hwpoison_page(page);
                pr_info("soft_offline: %#lx: invalidated\n", pfn);
                SetPageHWPoison(page);
-               atomic_long_inc(&num_poisoned_pages);
+               num_poisoned_pages_inc();
                return 0;
        }
 
@@ -1610,23 +1688,26 @@ static int __soft_offline_page(struct page *page, int flags)
         * Drop page reference which is came from get_any_page()
         * successful isolate_lru_page() already took another one.
         */
-       put_page(page);
+       put_hwpoison_page(page);
        if (!ret) {
                LIST_HEAD(pagelist);
                inc_zone_page_state(page, NR_ISOLATED_ANON +
                                        page_is_file_cache(page));
                list_add(&page->lru, &pagelist);
-               ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+               ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
                                        MIGRATE_SYNC, MR_MEMORY_FAILURE);
                if (ret) {
-                       putback_lru_pages(&pagelist);
+                       if (!list_empty(&pagelist)) {
+                               list_del(&page->lru);
+                               dec_zone_page_state(page, NR_ISOLATED_ANON +
+                                               page_is_file_cache(page));
+                               putback_lru_page(page);
+                       }
+
                        pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
                                pfn, ret, page->flags);
                        if (ret > 0)
                                ret = -EIO;
-               } else {
-                       SetPageHWPoison(page);
-                       atomic_long_inc(&num_poisoned_pages);
                }
        } else {
                pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1634,3 +1715,69 @@ static int __soft_offline_page(struct page *page, int flags)
        }
        return ret;
 }
+
+/**
+ * soft_offline_page - Soft offline a page.
+ * @page: page to offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ */
+int soft_offline_page(struct page *page, int flags)
+{
+       int ret;
+       unsigned long pfn = page_to_pfn(page);
+       struct page *hpage = compound_head(page);
+
+       if (PageHWPoison(page)) {
+               pr_info("soft offline: %#lx page already poisoned\n", pfn);
+               if (flags & MF_COUNT_INCREASED)
+                       put_hwpoison_page(page);
+               return -EBUSY;
+       }
+       if (!PageHuge(page) && PageTransHuge(hpage)) {
+               if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+                       pr_info("soft offline: %#lx: failed to split THP\n",
+                               pfn);
+                       if (flags & MF_COUNT_INCREASED)
+                               put_hwpoison_page(page);
+                       return -EBUSY;
+               }
+       }
+
+       get_online_mems();
+
+       ret = get_any_page(page, pfn, flags);
+       put_online_mems();
+       if (ret > 0) { /* for in-use pages */
+               if (PageHuge(page))
+                       ret = soft_offline_huge_page(page, flags);
+               else
+                       ret = __soft_offline_page(page, flags);
+       } else if (ret == 0) { /* for free pages */
+               if (PageHuge(page)) {
+                       set_page_hwpoison_huge_page(hpage);
+                       if (!dequeue_hwpoisoned_huge_page(hpage))
+                               num_poisoned_pages_add(1 << compound_order(hpage));
+               } else {
+                       if (!TestSetPageHWPoison(page))
+                               num_poisoned_pages_inc();
+               }
+       }
+       return ret;
+}
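
A minimal user-space sketch of how the paths touched by this diff are usually exercised, in the spirit of the mce-test guidance added to the header comment above. The program below is an assumption, not part of the patch: it presumes a kernel built with CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN, and libc headers that may lack the two madvise values (hence the fallback defines, which match the kernel ABI numbers). madvise(MADV_HWPOISON) ends up in memory_failure(), madvise(MADV_SOFT_OFFLINE) in soft_offline_page().

/* hwpoison-smoke.c: inject a soft-offline and a poison request on one page. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* kernel ABI value; migrate data away, then offline */
#endif
#ifndef MADV_HWPOISON
#define MADV_HWPOISON     100	/* kernel ABI value; poison as if a real MCE hit */
#endif

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, pagesz);	/* fault the page in so it is really mapped */

	/* Soft offline: contents are migrated, nothing is killed. */
	if (madvise(p, pagesz, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");

	/*
	 * Hard poison: memory_failure() runs on the (new) backing page;
	 * a later load through p is expected to raise SIGBUS (BUS_MCEERR_AR).
	 */
	if (madvise(p, pagesz, MADV_HWPOISON))
		perror("madvise(MADV_HWPOISON)");

	return 0;
}

For pages the test process cannot conveniently map, the hwpoison-inject debugfs files (hwpoison/corrupt-pfn and hwpoison/unpoison-pfn) offer a pfn-based injection path; unpoison-pfn is what drives the unpoison_memory() changes above.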