mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
   20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
   64    make bind policy root only? It can trigger oom much faster and the
   65    kernel does not always handle that gracefully.
  66 */
  67
  68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70 #include <linux/mempolicy.h>
  71 #include <linux/mm.h>
  72 #include <linux/highmem.h>
  73 #include <linux/hugetlb.h>
  74 #include <linux/kernel.h>
  75 #include <linux/sched.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/slab.h>
  79 #include <linux/string.h>
  80 #include <linux/export.h>
  81 #include <linux/nsproxy.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/swap.h>
  86 #include <linux/seq_file.h>
  87 #include <linux/proc_fs.h>
  88 #include <linux/migrate.h>
  89 #include <linux/ksm.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92 #include <linux/syscalls.h>
  93 #include <linux/ctype.h>
  94 #include <linux/mm_inline.h>
  95 #include <linux/mmu_notifier.h>
  96 #include <linux/printk.h>
  97
  98 #include <asm/tlbflush.h>
  99 #include <asm/uaccess.h>
 100 #include <linux/random.h>
 101
 102 #include "internal.h"
 103
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

/*
 * Slab caches.  policy_cache backs every struct mempolicy allocated and
 * freed in this file; sn_cache is not referenced in this part of the file.
 */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

/*
 * One policy per node, indexed by node id.  get_task_policy() treats an
 * entry whose ->mode is still zero as "not set up yet".
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 125
 126 struct mempolicy *get_task_policy(struct task_struct *p)
 127 {
 128         struct mempolicy *pol = p->mempolicy;
 129         int node;
 130
 131         if (pol)
 132                 return pol;
 133
 134         node = numa_node_id();
 135         if (node != NUMA_NO_NODE) {
 136                 pol = &preferred_node_policy[node];
 137                 /* preferred_node_policy is not initialised early in boot */
 138                 if (pol->mode)
 139                         return pol;
 140         }
 141
 142         return &default_policy;
 143 }
 144
/*
 * Per-mode operations, indexed by policy mode.  This is only the forward
 * declaration of mpol_ops[]; the initialised table follows the rebind
 * helpers below.
 */
static const struct mempolicy_operations {
	/* Install @nodes into a newly allocated policy; returns 0 or -errno. */
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task rebinds task->mempolicy in two steps.  The first
	 * step sets all the newly allowed nodes, and the second step clears
	 * all the disallowed nodes.  This way an allocation never finds an
	 * empty set of nodes.
	 * If the read side does hold a lock protecting task->mempolicy, the
	 * rebind is done directly.
	 *
	 * step:
	 *      MPOL_REBIND_ONCE - do rebind work at once
	 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
	 *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
 164
 165 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 166 {
 167         return pol->flags & MPOL_MODE_FLAGS;
 168 }
 169
 170 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 171                                    const nodemask_t *rel)
 172 {
 173         nodemask_t tmp;
 174         nodes_fold(tmp, *orig, nodes_weight(*rel));
 175         nodes_onto(*ret, tmp, *rel);
 176 }
 177
 178 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 179 {
 180         if (nodes_empty(*nodes))
 181                 return -EINVAL;
 182         pol->v.nodes = *nodes;
 183         return 0;
 184 }
 185
 186 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 187 {
 188         if (!nodes)
 189                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 190         else if (nodes_empty(*nodes))
 191                 return -EINVAL;                 /*  no allowed nodes */
 192         else
 193                 pol->v.preferred_node = first_node(*nodes);
 194         return 0;
 195 }
 196
 197 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 198 {
 199         if (nodes_empty(*nodes))
 200                 return -EINVAL;
 201         pol->v.nodes = *nodes;
 202         return 0;
 203 }
 204
 205 /*
 206  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 207  * any, for the new policy.  mpol_new() has already validated the nodes
 208  * parameter with respect to the policy mode and flags.  But, we need to
 209  * handle an empty nodemask with MPOL_PREFERRED here.
 210  *
 211  * Must be called holding task's alloc_lock to protect task's mems_allowed
 212  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 213  */
 214 static int mpol_set_nodemask(struct mempolicy *pol,
 215                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 216 {
 217         int ret;
 218
 219         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 220         if (pol == NULL)
 221                 return 0;
 222         /* Check N_MEMORY */
 223         nodes_and(nsc->mask1,
 224                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 225
 226         VM_BUG_ON(!nodes);
 227         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 228                 nodes = NULL;   /* explicit local allocation */
 229         else {
 230                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 231                         mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 232                 else
 233                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 234
 235                 if (mpol_store_user_nodemask(pol))
 236                         pol->w.user_nodemask = *nodes;
 237                 else
 238                         pol->w.cpuset_mems_allowed =
 239                                                 cpuset_current_mems_allowed;
 240         }
 241
 242         if (nodes)
 243                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 244         else
 245                 ret = mpol_ops[pol->mode].create(pol, NULL);
 246         return ret;
 247 }
 248
 249 /*
 250  * This function just creates a new policy, does some check and simple
 251  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 252  */
 253 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 254                                   nodemask_t *nodes)
 255 {
 256         struct mempolicy *policy;
 257
 258         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 259                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 260
 261         if (mode == MPOL_DEFAULT) {
 262                 if (nodes && !nodes_empty(*nodes))
 263                         return ERR_PTR(-EINVAL);
 264                 return NULL;
 265         }
 266         VM_BUG_ON(!nodes);
 267
 268         /*
 269          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 270          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 271          * All other modes require a valid pointer to a non-empty nodemask.
 272          */
 273         if (mode == MPOL_PREFERRED) {
 274                 if (nodes_empty(*nodes)) {
 275                         if (((flags & MPOL_F_STATIC_NODES) ||
 276                              (flags & MPOL_F_RELATIVE_NODES)))
 277                                 return ERR_PTR(-EINVAL);
 278                 }
 279         } else if (mode == MPOL_LOCAL) {
 280                 if (!nodes_empty(*nodes))
 281                         return ERR_PTR(-EINVAL);
 282                 mode = MPOL_PREFERRED;
 283         } else if (nodes_empty(*nodes))
 284                 return ERR_PTR(-EINVAL);
 285         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 286         if (!policy)
 287                 return ERR_PTR(-ENOMEM);
 288         atomic_set(&policy->refcnt, 1);
 289         policy->mode = mode;
 290         policy->flags = flags;
 291
 292         return policy;
 293 }
 294
 295 /* Slow path of a mpol destructor. */
 296 void __mpol_put(struct mempolicy *p)
 297 {
 298         if (!atomic_dec_and_test(&p->refcnt))
 299                 return;
 300         kmem_cache_free(policy_cache, p);
 301 }
 302
/*
 * ->rebind() for MPOL_DEFAULT: intentionally empty, all arguments are
 * ignored.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
				enum mpol_rebind_step step)
{
}
 307
/*
 * ->rebind() for the modes that keep a nodemask in pol->v.nodes
 * (MPOL_INTERLEAVE and MPOL_BIND in mpol_ops[]).
 *
 * The new mask is derived from the user nodemask for MPOL_F_STATIC_NODES
 * and MPOL_F_RELATIVE_NODES policies, and by remapping pol->v.nodes from
 * the old cpuset_mems_allowed to @nodes otherwise.  If the result is
 * empty, @nodes itself is used.
 *
 * step:
 *      MPOL_REBIND_ONCE  - do rebind work at once
 *      MPOL_REBIND_STEP1 - set all the newly allowed nodes
 *      MPOL_REBIND_STEP2 - clear all the disallowed nodes
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
				 enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/*
		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
		 * result
		 */
		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
			nodes_remap(tmp, pol->v.nodes,
					pol->w.cpuset_mems_allowed, *nodes);
			/*
			 * A non-zero step caches the remapped mask, a zero
			 * step stores @nodes.
			 * NOTE(review): this relies on MPOL_REBIND_ONCE being
			 * zero - confirm against the enum definition.
			 */
			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
		} else if (step == MPOL_REBIND_STEP2) {
			/* Pick up the mask cached by step 1. */
			tmp = pol->w.cpuset_mems_allowed;
			pol->w.cpuset_mems_allowed = *nodes;
		} else
			BUG();
	}

	/* Never leave the policy without any node. */
	if (nodes_empty(tmp))
		tmp = *nodes;

	/* Step 1 only adds nodes; the other steps replace the mask. */
	if (step == MPOL_REBIND_STEP1)
		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
		pol->v.nodes = tmp;
	else
		BUG();

	/*
	 * Move current's interleave cursor into the new mask if it fell
	 * outside: next node after it, else the first node, else the local
	 * node.
	 */
	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}
 357
 358 static void mpol_rebind_preferred(struct mempolicy *pol,
 359                                   const nodemask_t *nodes,
 360                                   enum mpol_rebind_step step)
 361 {
 362         nodemask_t tmp;
 363
 364         if (pol->flags & MPOL_F_STATIC_NODES) {
 365                 int node = first_node(pol->w.user_nodemask);
 366
 367                 if (node_isset(node, *nodes)) {
 368                         pol->v.preferred_node = node;
 369                         pol->flags &= ~MPOL_F_LOCAL;
 370                 } else
 371                         pol->flags |= MPOL_F_LOCAL;
 372         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 373                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 374                 pol->v.preferred_node = first_node(tmp);
 375         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 376                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 377                                                    pol->w.cpuset_mems_allowed,
 378                                                    *nodes);
 379                 pol->w.cpuset_mems_allowed = *nodes;
 380         }
 381 }
 382
 383 /*
 384  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 385  *
 386  * If read-side task has no lock to protect task->mempolicy, write-side
 387  * task will rebind the task->mempolicy by two step. The first step is
 388  * setting all the newly nodes, and the second step is cleaning all the
 389  * disallowed nodes. In this way, we can avoid finding no node to alloc
 390  * page.
 391  * If we have a lock to protect task->mempolicy in read-side, we do
 392  * rebind directly.
 393  *
 394  * step:
 395  *      MPOL_REBIND_ONCE  - do rebind work at once
 396  *      MPOL_REBIND_STEP1 - set all the newly nodes
 397  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 398  */
 399 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 400                                 enum mpol_rebind_step step)
 401 {
 402         if (!pol)
 403                 return;
 404         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 405             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 406                 return;
 407
 408         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 409                 return;
 410
 411         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 412                 BUG();
 413
 414         if (step == MPOL_REBIND_STEP1)
 415                 pol->flags |= MPOL_F_REBINDING;
 416         else if (step == MPOL_REBIND_STEP2)
 417                 pol->flags &= ~MPOL_F_REBINDING;
 418         else if (step >= MPOL_REBIND_NSTEP)
 419                 BUG();
 420
 421         mpol_ops[pol->mode].rebind(pol, newmask, step);
 422 }
 423
 424 /*
 425  * Wrapper for mpol_rebind_policy() that just requires task
 426  * pointer, and updates task mempolicy.
 427  *
 428  * Called with task's alloc_lock held.
 429  */
 430
 431 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 432                         enum mpol_rebind_step step)
 433 {
 434         mpol_rebind_policy(tsk->mempolicy, new, step);
 435 }
 436
 437 /*
 438  * Rebind each vma in mm to new nodemask.
 439  *
 440  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 441  */
 442
 443 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 444 {
 445         struct vm_area_struct *vma;
 446
 447         down_write(&mm->mmap_sem);
 448         for (vma = mm->mmap; vma; vma = vma->vm_next)
 449                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 450         up_write(&mm->mmap_sem);
 451 }
 452
/*
 * Per-mode operation table, indexed by policy mode.  MPOL_DEFAULT has no
 * ->create() and a no-op ->rebind(); MPOL_INTERLEAVE and MPOL_BIND share
 * the nodemask-based rebind.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};
 470
/* Forward declaration: needed by queue_pages_pte_range() below. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);
 473
 474 /*
 475  * Scan through pages checking if pages follow certain conditions,
 476  * and move them to the pagelist if they do.
 477  */
 478 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 479                 unsigned long addr, unsigned long end,
 480                 const nodemask_t *nodes, unsigned long flags,
 481                 void *private)
 482 {
 483         pte_t *orig_pte;
 484         pte_t *pte;
 485         spinlock_t *ptl;
 486
 487         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 488         do {
 489                 struct page *page;
 490                 int nid;
 491
 492                 if (!pte_present(*pte))
 493                         continue;
 494                 page = vm_normal_page(vma, addr, *pte);
 495                 if (!page)
 496                         continue;
 497                 /*
 498                  * vm_normal_page() filters out zero pages, but there might
 499                  * still be PageReserved pages to skip, perhaps in a VDSO.
 500                  */
 501                 if (PageReserved(page))
 502                         continue;
 503                 nid = page_to_nid(page);
 504                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 505                         continue;
 506
 507                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 508                         migrate_page_add(page, private, flags);
 509                 else
 510                         break;
 511         } while (pte++, addr += PAGE_SIZE, addr != end);
 512         pte_unmap_unlock(orig_pte, ptl);
 513         return addr != end;
 514 }
 515
 516 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
 517                 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
 518                                     void *private)
 519 {
 520 #ifdef CONFIG_HUGETLB_PAGE
 521         int nid;
 522         struct page *page;
 523         spinlock_t *ptl;
 524         pte_t entry;
 525
 526         ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
 527         entry = huge_ptep_get((pte_t *)pmd);
 528         if (!pte_present(entry))
 529                 goto unlock;
 530         page = pte_page(entry);
 531         nid = page_to_nid(page);
 532         if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 533                 goto unlock;
 534         /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 535         if (flags & (MPOL_MF_MOVE_ALL) ||
 536             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 537                 isolate_huge_page(page, private);
 538 unlock:
 539         spin_unlock(ptl);
 540 #else
 541         BUG();
 542 #endif
 543 }
 544
/*
 * Walk the pmd entries under @pud that cover [@addr, @end).
 *
 * Entries that are not present are skipped.  Huge entries in a hugetlb vma
 * go to queue_pages_hugetlb_pmd_range(); everything else is passed to
 * split_huge_page_pmd() first and then scanned pte by pte.
 *
 * Returns -EIO as soon as queue_pages_pte_range() reports that it stopped
 * early, 0 otherwise.
 */
static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (!pmd_present(*pmd))
			continue;
		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
						flags, private);
			continue;
		}
		split_huge_page_pmd(vma, addr, pmd);
		/* Re-check the entry after the split attempt. */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
 572
 573 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 574                 unsigned long addr, unsigned long end,
 575                 const nodemask_t *nodes, unsigned long flags,
 576                 void *private)
 577 {
 578         pud_t *pud;
 579         unsigned long next;
 580
 581         pud = pud_offset(pgd, addr);
 582         do {
 583                 next = pud_addr_end(addr, end);
 584                 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
 585                         continue;
 586                 if (pud_none_or_clear_bad(pud))
 587                         continue;
 588                 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
 589                                     flags, private))
 590                         return -EIO;
 591         } while (pud++, addr = next, addr != end);
 592         return 0;
 593 }
 594
 595 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
 596                 unsigned long addr, unsigned long end,
 597                 const nodemask_t *nodes, unsigned long flags,
 598                 void *private)
 599 {
 600         pgd_t *pgd;
 601         unsigned long next;
 602
 603         pgd = pgd_offset(vma->vm_mm, addr);
 604         do {
 605                 next = pgd_addr_end(addr, end);
 606                 if (pgd_none_or_clear_bad(pgd))
 607                         continue;
 608                 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
 609                                     flags, private))
 610                         return -EIO;
 611         } while (pgd++, addr = next, addr != end);
 612         return 0;
 613 }
 614
 615 #ifdef CONFIG_NUMA_BALANCING
 616 /*
 617  * This is used to mark a range of virtual addresses to be inaccessible.
 618  * These are later cleared by a NUMA hinting fault. Depending on these
 619  * faults, pages may be migrated for better NUMA placement.
 620  *
 621  * This is assuming that NUMA faults are handled using PROT_NONE. If
 622  * an architecture makes a different choice, it will need further
 623  * changes to the core.
 624  */
 625 unsigned long change_prot_numa(struct vm_area_struct *vma,
 626                         unsigned long addr, unsigned long end)
 627 {
 628         int nr_updated;
 629
 630         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 631         if (nr_updated)
 632                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 633
 634         return nr_updated;
 635 }
 636 #else
 637 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 638                         unsigned long addr, unsigned long end)
 639 {
 640         return 0;
 641 }
 642 #endif /* CONFIG_NUMA_BALANCING */
 643
 644 /*
 645  * Walk through page tables and collect pages to be migrated.
 646  *
 647  * If pages found in a given range are on a set of nodes (determined by
 648  * @nodes and @flags,) it's isolated and queued to the pagelist which is
 649  * passed via @private.)
 650  */
 651 static int
 652 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 653                 const nodemask_t *nodes, unsigned long flags, void *private)
 654 {
 655         int err = 0;
 656         struct vm_area_struct *vma, *prev;
 657
 658         vma = find_vma(mm, start);
 659         if (!vma)
 660                 return -EFAULT;
 661         prev = NULL;
 662         for (; vma && vma->vm_start < end; vma = vma->vm_next) {
 663                 unsigned long endvma = vma->vm_end;
 664
 665                 if (endvma > end)
 666                         endvma = end;
 667                 if (vma->vm_start > start)
 668                         start = vma->vm_start;
 669
 670                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 671                         if (!vma->vm_next && vma->vm_end < end)
 672                                 return -EFAULT;
 673                         if (prev && prev->vm_end < vma->vm_start)
 674                                 return -EFAULT;
 675                 }
 676
 677                 if (flags & MPOL_MF_LAZY) {
 678                         /* Similar to task_numa_work, skip inaccessible VMAs */
 679                         if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
 680                                 change_prot_numa(vma, start, endvma);
 681                         goto next;
 682                 }
 683
 684                 if ((flags & MPOL_MF_STRICT) ||
 685                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 686                       vma_migratable(vma))) {
 687
 688                         err = queue_pages_pgd_range(vma, start, endvma, nodes,
 689                                                 flags, private);
 690                         if (err)
 691                                 break;
 692                 }
 693 next:
 694                 prev = vma;
 695         }
 696         return err;
 697 }
 698
 699 /*
 700  * Apply policy to a single VMA
 701  * This must be called with the mmap_sem held for writing.
 702  */
 703 static int vma_replace_policy(struct vm_area_struct *vma,
 704                                                 struct mempolicy *pol)
 705 {
 706         int err;
 707         struct mempolicy *old;
 708         struct mempolicy *new;
 709
 710         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 711                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 712                  vma->vm_ops, vma->vm_file,
 713                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 714
 715         new = mpol_dup(pol);
 716         if (IS_ERR(new))
 717                 return PTR_ERR(new);
 718
 719         if (vma->vm_ops && vma->vm_ops->set_policy) {
 720                 err = vma->vm_ops->set_policy(vma, new);
 721                 if (err)
 722                         goto err_out;
 723         }
 724
 725         old = vma->vm_policy;
 726         vma->vm_policy = new; /* protected by mmap_sem */
 727         mpol_put(old);
 728
 729         return 0;
 730  err_out:
 731         mpol_put(new);
 732         return err;
 733 }
 734
/*
 * Step 2: apply policy to a range and do splits.
 *
 * Applies @new_pol to every vma intersecting [@start, @end) of @mm.  For
 * each vma whose policy differs, a merge with its neighbours is tried
 * first; failing that, the vma is split at the range boundaries so that
 * only the covered part gets the new policy.
 *
 * Returns 0 on success, -EFAULT if @start is not inside a vma, or the
 * error from split_vma() / vma_replace_policy().
 */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	/*
	 * If the range begins inside the first vma, that vma itself is used
	 * as the "previous" vma for the merge attempt.
	 */
	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		/* Remember the successor now; vma may be replaced below. */
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		/* File offset, in pages, at which the affected part starts. */
		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				  vma->anon_vma, vma->vm_file, pgoff,
				  new_pol);
		if (prev) {
			/* Merged: continue from the resulting vma. */
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		/* No merge: split off whatever lies outside the range. */
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}
 795
 796 /* Set the process memory policy */
 797 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 798                              nodemask_t *nodes)
 799 {
 800         struct mempolicy *new, *old;
 801         NODEMASK_SCRATCH(scratch);
 802         int ret;
 803
 804         if (!scratch)
 805                 return -ENOMEM;
 806
 807         new = mpol_new(mode, flags, nodes);
 808         if (IS_ERR(new)) {
 809                 ret = PTR_ERR(new);
 810                 goto out;
 811         }
 812
 813         task_lock(current);
 814         ret = mpol_set_nodemask(new, nodes, scratch);
 815         if (ret) {
 816                 task_unlock(current);
 817                 mpol_put(new);
 818                 goto out;
 819         }
 820         old = current->mempolicy;
 821         current->mempolicy = new;
 822         if (new && new->mode == MPOL_INTERLEAVE &&
 823             nodes_weight(new->v.nodes))
 824                 current->il_next = first_node(new->v.nodes);
 825         task_unlock(current);
 826         mpol_put(old);
 827         ret = 0;
 828 out:
 829         NODEMASK_SCRATCH_FREE(scratch);
 830         return ret;
 831 }
 832
 833 /*
 834  * Return nodemask for policy for get_mempolicy() query
 835  *
 836  * Called with task's alloc_lock held
 837  */
 838 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 839 {
 840         nodes_clear(*nodes);
 841         if (p == &default_policy)
 842                 return;
 843
 844         switch (p->mode) {
 845         case MPOL_BIND:
 846                 /* Fall through */
 847         case MPOL_INTERLEAVE:
 848                 *nodes = p->v.nodes;
 849                 break;
 850         case MPOL_PREFERRED:
 851                 if (!(p->flags & MPOL_F_LOCAL))
 852                         node_set(p->v.preferred_node, *nodes);
 853                 /* else return empty node mask for local allocation */
 854                 break;
 855         default:
 856                 BUG();
 857         }
 858 }
 859
 860 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 861 {
 862         struct page *p;
 863         int err;
 864
 865         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 866         if (err >= 0) {
 867                 err = page_to_nid(p);
 868                 put_page(p);
 869         }
 870         return err;
 871 }
 872
 873 /* Retrieve NUMA policy */
 874 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 875                              unsigned long addr, unsigned long flags)
 876 {
 877         int err;
 878         struct mm_struct *mm = current->mm;
 879         struct vm_area_struct *vma = NULL;
 880         struct mempolicy *pol = current->mempolicy;
 881
 882         if (flags &
 883                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 884                 return -EINVAL;
 885
 886         if (flags & MPOL_F_MEMS_ALLOWED) {
 887                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 888                         return -EINVAL;
 889                 *policy = 0;    /* just so it's initialized */
 890                 task_lock(current);
 891                 *nmask  = cpuset_current_mems_allowed;
 892                 task_unlock(current);
 893                 return 0;
 894         }
 895
 896         if (flags & MPOL_F_ADDR) {
 897                 /*
 898                  * Do NOT fall back to task policy if the
 899                  * vma/shared policy at addr is NULL.  We
 900                  * want to return MPOL_DEFAULT in this case.
 901                  */
 902                 down_read(&mm->mmap_sem);
 903                 vma = find_vma_intersection(mm, addr, addr+1);
 904                 if (!vma) {
 905                         up_read(&mm->mmap_sem);
 906                         return -EFAULT;
 907                 }
 908                 if (vma->vm_ops && vma->vm_ops->get_policy)
 909                         pol = vma->vm_ops->get_policy(vma, addr);
 910                 else
 911                         pol = vma->vm_policy;
 912         } else if (addr)
 913                 return -EINVAL;
 914
 915         if (!pol)
 916                 pol = &default_policy;  /* indicates default behavior */
 917
 918         if (flags & MPOL_F_NODE) {
 919                 if (flags & MPOL_F_ADDR) {
 920                         err = lookup_node(mm, addr);
 921                         if (err < 0)
 922                                 goto out;
 923                         *policy = err;
 924                 } else if (pol == current->mempolicy &&
 925                                 pol->mode == MPOL_INTERLEAVE) {
 926                         *policy = current->il_next;
 927                 } else {
 928                         err = -EINVAL;
 929                         goto out;
 930                 }
 931         } else {
 932                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 933                                                 pol->mode;
 934                 /*
 935                  * Internal mempolicy flags must be masked off before exposing
 936                  * the policy to userspace.
 937                  */
 938                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 939         }
 940
 941         if (vma) {
 942                 up_read(&current->mm->mmap_sem);
 943                 vma = NULL;
 944         }
 945
 946         err = 0;
 947         if (nmask) {
 948                 if (mpol_store_user_nodemask(pol)) {
 949                         *nmask = pol->w.user_nodemask;
 950                 } else {
 951                         task_lock(current);
 952                         get_policy_nodemask(pol, nmask);
 953                         task_unlock(current);
 954                 }
 955         }
 956
 957  out:
 958         mpol_cond_put(pol);
 959         if (vma)
 960                 up_read(&current->mm->mmap_sem);
 961         return err;
 962 }
 963
 964 #ifdef CONFIG_MIGRATION
 965 /*
 966  * page migration
 967  */
 968 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 969                                 unsigned long flags)
 970 {
 971         /*
 972          * Avoid migrating a page that is shared with others.
 973          */
 974         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 975                 if (!isolate_lru_page(page)) {
 976                         list_add_tail(&page->lru, pagelist);
 977                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 978                                             page_is_file_cache(page));
 979                 }
 980         }
 981 }
 982
 983 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 984 {
 985         if (PageHuge(page))
 986                 return alloc_huge_page_node(page_hstate(compound_head(page)),
 987                                         node);
 988         else
 989                 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 990 }
 991
 992 /*
 993  * Migrate pages from one node to a target node.
 994  * Returns error or the number of pages not migrated.
 995  */
 996 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 997                            int flags)
 998 {
 999         nodemask_t nmask;
1000         LIST_HEAD(pagelist);
1001         int err = 0;
1002
1003         nodes_clear(nmask);
1004         node_set(source, nmask);
1005
1006         /*
1007          * This does not "check" the range but isolates all pages that
1008          * need migration.  Between passing in the full user address
1009          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1010          */
1011         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1012         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1013                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1014
1015         if (!list_empty(&pagelist)) {
1016                 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1017                                         MIGRATE_SYNC, MR_SYSCALL);
1018                 if (err)
1019                         putback_movable_pages(&pagelist);
1020         }
1021
1022         return err;
1023 }
1024
1025 /*
1026  * Move pages between the two nodesets so as to preserve the physical
1027  * layout as much as possible.
1028  *
1029  * Returns the number of page that could not be moved.
1030  */
1031 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1032                      const nodemask_t *to, int flags)
1033 {
1034         int busy = 0;
1035         int err;
1036         nodemask_t tmp;
1037
1038         err = migrate_prep();
1039         if (err)
1040                 return err;
1041
1042         down_read(&mm->mmap_sem);
1043
1044         /*
1045          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1046          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1047          * bit in 'tmp', and return that <source, dest> pair for migration.
1048          * The pair of nodemasks 'to' and 'from' define the map.
1049          *
1050          * If no pair of bits is found that way, fallback to picking some
1051          * pair of 'source' and 'dest' bits that are not the same.  If the
1052          * 'source' and 'dest' bits are the same, this represents a node
1053          * that will be migrating to itself, so no pages need move.
1054          *
1055          * If no bits are left in 'tmp', or if all remaining bits left
1056          * in 'tmp' correspond to the same bit in 'to', return false
1057          * (nothing left to migrate).
1058          *
1059          * This lets us pick a pair of nodes to migrate between, such that
1060          * if possible the dest node is not already occupied by some other
1061          * source node, minimizing the risk of overloading the memory on a
1062          * node that would happen if we migrated incoming memory to a node
1063          * before migrating outgoing memory source that same node.
1064          *
1065          * A single scan of tmp is sufficient.  As we go, we remember the
1066          * most recent <s, d> pair that moved (s != d).  If we find a pair
1067          * that not only moved, but what's better, moved to an empty slot
1068          * (d is not set in tmp), then we break out then, with that pair.
1069          * Otherwise when we finish scanning from_tmp, we at least have the
1070          * most recent <s, d> pair that moved.  If we get all the way through
1071          * the scan of tmp without finding any node that moved, much less
1072          * moved to an empty node, then there is nothing left worth migrating.
1073          */
1074
1075         tmp = *from;
1076         while (!nodes_empty(tmp)) {
1077                 int s,d;
1078                 int source = NUMA_NO_NODE;
1079                 int dest = 0;
1080
1081                 for_each_node_mask(s, tmp) {
1082
1083                         /*
1084                          * do_migrate_pages() tries to maintain the relative
1085                          * node relationship of the pages established between
1086                          * threads and memory areas.
1087                          *
1088                          * However if the number of source nodes is not equal to
1089                          * the number of destination nodes we can not preserve
1090                          * this node relative relationship.  In that case, skip
1091                          * copying memory from a node that is in the destination
1092                          * mask.
1093                          *
1094                          * Example: [2,3,4] -> [3,4,5] moves everything.
1095                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1096                          */
1097
1098                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1099                                                 (node_isset(s, *to)))
1100                                 continue;
1101
1102                         d = node_remap(s, *from, *to);
1103                         if (s == d)
1104                                 continue;
1105
1106                         source = s;     /* Node moved. Memorize */
1107                         dest = d;
1108
1109                         /* dest not in remaining from nodes? */
1110                         if (!node_isset(dest, tmp))
1111                                 break;
1112                 }
1113                 if (source == NUMA_NO_NODE)
1114                         break;
1115
1116                 node_clear(source, tmp);
1117                 err = migrate_to_node(mm, source, dest, flags);
1118                 if (err > 0)
1119                         busy += err;
1120                 if (err < 0)
1121                         break;
1122         }
1123         up_read(&mm->mmap_sem);
1124         if (err < 0)
1125                 return err;
1126         return busy;
1127
1128 }
1129
1130 /*
1131  * Allocate a new page for page migration based on vma policy.
1132  * Start by assuming the page is mapped by the same vma as contains @start.
1133  * Search forward from there, if not.  N.B., this assumes that the
1134  * list of pages handed to migrate_pages()--which is how we get here--
1135  * is in virtual address order.
1136  */
1137 static struct page *new_page(struct page *page, unsigned long start, int **x)
1138 {
1139         struct vm_area_struct *vma;
1140         unsigned long uninitialized_var(address);
1141
1142         vma = find_vma(current->mm, start);
1143         while (vma) {
1144                 address = page_address_in_vma(page, vma);
1145                 if (address != -EFAULT)
1146                         break;
1147                 vma = vma->vm_next;
1148         }
1149
1150         if (PageHuge(page)) {
1151                 BUG_ON(!vma);
1152                 return alloc_huge_page_noerr(vma, address, 1);
1153         }
1154         /*
1155          * if !vma, alloc_page_vma() will use task or system default policy
1156          */
1157         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1158 }
1159 #else
1160
/* Without CONFIG_MIGRATION there is nothing to queue: do nothing. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
}
1165
1166 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1167                      const nodemask_t *to, int flags)
1168 {
1169         return -ENOSYS;
1170 }
1171
1172 static struct page *new_page(struct page *page, unsigned long start, int **x)
1173 {
1174         return NULL;
1175 }
1176 #endif
1177
1178 static long do_mbind(unsigned long start, unsigned long len,
1179                      unsigned short mode, unsigned short mode_flags,
1180                      nodemask_t *nmask, unsigned long flags)
1181 {
1182         struct mm_struct *mm = current->mm;
1183         struct mempolicy *new;
1184         unsigned long end;
1185         int err;
1186         LIST_HEAD(pagelist);
1187
1188         if (flags & ~(unsigned long)MPOL_MF_VALID)
1189                 return -EINVAL;
1190         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1191                 return -EPERM;
1192
1193         if (start & ~PAGE_MASK)
1194                 return -EINVAL;
1195
1196         if (mode == MPOL_DEFAULT)
1197                 flags &= ~MPOL_MF_STRICT;
1198
1199         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1200         end = start + len;
1201
1202         if (end < start)
1203                 return -EINVAL;
1204         if (end == start)
1205                 return 0;
1206
1207         new = mpol_new(mode, mode_flags, nmask);
1208         if (IS_ERR(new))
1209                 return PTR_ERR(new);
1210
1211         if (flags & MPOL_MF_LAZY)
1212                 new->flags |= MPOL_F_MOF;
1213
1214         /*
1215          * If we are using the default policy then operation
1216          * on discontinuous address spaces is okay after all
1217          */
1218         if (!new)
1219                 flags |= MPOL_MF_DISCONTIG_OK;
1220
1221         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1222                  start, start + len, mode, mode_flags,
1223                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1224
1225         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1226
1227                 err = migrate_prep();
1228                 if (err)
1229                         goto mpol_out;
1230         }
1231         {
1232                 NODEMASK_SCRATCH(scratch);
1233                 if (scratch) {
1234                         down_write(&mm->mmap_sem);
1235                         task_lock(current);
1236                         err = mpol_set_nodemask(new, nmask, scratch);
1237                         task_unlock(current);
1238                         if (err)
1239                                 up_write(&mm->mmap_sem);
1240                 } else
1241                         err = -ENOMEM;
1242                 NODEMASK_SCRATCH_FREE(scratch);
1243         }
1244         if (err)
1245                 goto mpol_out;
1246
1247         err = queue_pages_range(mm, start, end, nmask,
1248                           flags | MPOL_MF_INVERT, &pagelist);
1249         if (!err)
1250                 err = mbind_range(mm, start, end, new);
1251
1252         if (!err) {
1253                 int nr_failed = 0;
1254
1255                 if (!list_empty(&pagelist)) {
1256                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1257                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
1258                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1259                         if (nr_failed)
1260                                 putback_movable_pages(&pagelist);
1261                 }
1262
1263                 if (nr_failed && (flags & MPOL_MF_STRICT))
1264                         err = -EIO;
1265         } else
1266                 putback_movable_pages(&pagelist);
1267
1268         up_write(&mm->mmap_sem);
1269  mpol_out:
1270         mpol_put(new);
1271         return err;
1272 }
1273
1274 /*
1275  * User space interface with variable sized bitmaps for nodelists.
1276  */
1277
1278 /* Copy a node mask from user space. */
1279 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1280                      unsigned long maxnode)
1281 {
1282         unsigned long k;
1283         unsigned long nlongs;
1284         unsigned long endmask;
1285
1286         --maxnode;
1287         nodes_clear(*nodes);
1288         if (maxnode == 0 || !nmask)
1289                 return 0;
1290         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1291                 return -EINVAL;
1292
1293         nlongs = BITS_TO_LONGS(maxnode);
1294         if ((maxnode % BITS_PER_LONG) == 0)
1295                 endmask = ~0UL;
1296         else
1297                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1298
1299         /* When the user specified more nodes than supported just check
1300            if the non supported part is all zero. */
1301         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1302                 if (nlongs > PAGE_SIZE/sizeof(long))
1303                         return -EINVAL;
1304                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1305                         unsigned long t;
1306                         if (get_user(t, nmask + k))
1307                                 return -EFAULT;
1308                         if (k == nlongs - 1) {
1309                                 if (t & endmask)
1310                                         return -EINVAL;
1311                         } else if (t)
1312                                 return -EINVAL;
1313                 }
1314                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1315                 endmask = ~0UL;
1316         }
1317
1318         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1319                 return -EFAULT;
1320         nodes_addr(*nodes)[nlongs-1] &= endmask;
1321         return 0;
1322 }
1323
1324 /* Copy a kernel node mask to user space */
1325 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1326                               nodemask_t *nodes)
1327 {
1328         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1329         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1330
1331         if (copy > nbytes) {
1332                 if (copy > PAGE_SIZE)
1333                         return -EINVAL;
1334                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1335                         return -EFAULT;
1336                 copy = nbytes;
1337         }
1338         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1339 }
1340
1341 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1342                 unsigned long, mode, const unsigned long __user *, nmask,
1343                 unsigned long, maxnode, unsigned, flags)
1344 {
1345         nodemask_t nodes;
1346         int err;
1347         unsigned short mode_flags;
1348
1349         mode_flags = mode & MPOL_MODE_FLAGS;
1350         mode &= ~MPOL_MODE_FLAGS;
1351         if (mode >= MPOL_MAX)
1352                 return -EINVAL;
1353         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1354             (mode_flags & MPOL_F_RELATIVE_NODES))
1355                 return -EINVAL;
1356         err = get_nodes(&nodes, nmask, maxnode);
1357         if (err)
1358                 return err;
1359         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1360 }
1361
1362 /* Set the process memory policy */
1363 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1364                 unsigned long, maxnode)
1365 {
1366         int err;
1367         nodemask_t nodes;
1368         unsigned short flags;
1369
1370         flags = mode & MPOL_MODE_FLAGS;
1371         mode &= ~MPOL_MODE_FLAGS;
1372         if ((unsigned int)mode >= MPOL_MAX)
1373                 return -EINVAL;
1374         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1375                 return -EINVAL;
1376         err = get_nodes(&nodes, nmask, maxnode);
1377         if (err)
1378                 return err;
1379         return do_set_mempolicy(mode, flags, &nodes);
1380 }
1381
1382 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1383                 const unsigned long __user *, old_nodes,
1384                 const unsigned long __user *, new_nodes)
1385 {
1386         const struct cred *cred = current_cred(), *tcred;
1387         struct mm_struct *mm = NULL;
1388         struct task_struct *task;
1389         nodemask_t task_nodes;
1390         int err;
1391         nodemask_t *old;
1392         nodemask_t *new;
1393         NODEMASK_SCRATCH(scratch);
1394
1395         if (!scratch)
1396                 return -ENOMEM;
1397
1398         old = &scratch->mask1;
1399         new = &scratch->mask2;
1400
1401         err = get_nodes(old, old_nodes, maxnode);
1402         if (err)
1403                 goto out;
1404
1405         err = get_nodes(new, new_nodes, maxnode);
1406         if (err)
1407                 goto out;
1408
1409         /* Find the mm_struct */
1410         rcu_read_lock();
1411         task = pid ? find_task_by_vpid(pid) : current;
1412         if (!task) {
1413                 rcu_read_unlock();
1414                 err = -ESRCH;
1415                 goto out;
1416         }
1417         get_task_struct(task);
1418
1419         err = -EINVAL;
1420
1421         /*
1422          * Check if this process has the right to modify the specified
1423          * process. The right exists if the process has administrative
1424          * capabilities, superuser privileges or the same
1425          * userid as the target process.
1426          */
1427         tcred = __task_cred(task);
1428         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1429             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1430             !capable(CAP_SYS_NICE)) {
1431                 rcu_read_unlock();
1432                 err = -EPERM;
1433                 goto out_put;
1434         }
1435         rcu_read_unlock();
1436
1437         task_nodes = cpuset_mems_allowed(task);
1438         /* Is the user allowed to access the target nodes? */
1439         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1440                 err = -EPERM;
1441                 goto out_put;
1442         }
1443
1444         if (!nodes_subset(*new, node_states[N_MEMORY])) {
1445                 err = -EINVAL;
1446                 goto out_put;
1447         }
1448
1449         err = security_task_movememory(task);
1450         if (err)
1451                 goto out_put;
1452
1453         mm = get_task_mm(task);
1454         put_task_struct(task);
1455
1456         if (!mm) {
1457                 err = -EINVAL;
1458                 goto out;
1459         }
1460
1461         err = do_migrate_pages(mm, old, new,
1462                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1463
1464         mmput(mm);
1465 out:
1466         NODEMASK_SCRATCH_FREE(scratch);
1467
1468         return err;
1469
1470 out_put:
1471         put_task_struct(task);
1472         goto out;
1473
1474 }
1475
1476
1477 /* Retrieve NUMA policy */
1478 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1479                 unsigned long __user *, nmask, unsigned long, maxnode,
1480                 unsigned long, addr, unsigned long, flags)
1481 {
1482         int err;
1483         int uninitialized_var(pval);
1484         nodemask_t nodes;
1485
1486         if (nmask != NULL && maxnode < MAX_NUMNODES)
1487                 return -EINVAL;
1488
1489         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1490
1491         if (err)
1492                 return err;
1493
1494         if (policy && put_user(pval, policy))
1495                 return -EFAULT;
1496
1497         if (nmask)
1498                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1499
1500         return err;
1501 }
1502
1503 #ifdef CONFIG_COMPAT
1504
1505 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1506                        compat_ulong_t __user *, nmask,
1507                        compat_ulong_t, maxnode,
1508                        compat_ulong_t, addr, compat_ulong_t, flags)
1509 {
1510         long err;
1511         unsigned long __user *nm = NULL;
1512         unsigned long nr_bits, alloc_size;
1513         DECLARE_BITMAP(bm, MAX_NUMNODES);
1514
1515         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1516         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1517
1518         if (nmask)
1519                 nm = compat_alloc_user_space(alloc_size);
1520
1521         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1522
1523         if (!err && nmask) {
1524                 unsigned long copy_size;
1525                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1526                 err = copy_from_user(bm, nm, copy_size);
1527                 /* ensure entire bitmap is zeroed */
1528                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1529                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1530         }
1531
1532         return err;
1533 }
1534
1535 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1536                        compat_ulong_t, maxnode)
1537 {
1538         long err = 0;
1539         unsigned long __user *nm = NULL;
1540         unsigned long nr_bits, alloc_size;
1541         DECLARE_BITMAP(bm, MAX_NUMNODES);
1542
1543         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1544         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1545
1546         if (nmask) {
1547                 err = compat_get_bitmap(bm, nmask, nr_bits);
1548                 nm = compat_alloc_user_space(alloc_size);
1549                 err |= copy_to_user(nm, bm, alloc_size);
1550         }
1551
1552         if (err)
1553                 return -EFAULT;
1554
1555         return sys_set_mempolicy(mode, nm, nr_bits+1);
1556 }
1557
1558 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1559                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1560                        compat_ulong_t, maxnode, compat_ulong_t, flags)
1561 {
1562         long err = 0;
1563         unsigned long __user *nm = NULL;
1564         unsigned long nr_bits, alloc_size;
1565         nodemask_t bm;
1566
1567         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1568         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1569
1570         if (nmask) {
1571                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1572                 nm = compat_alloc_user_space(alloc_size);
1573                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1574         }
1575
1576         if (err)
1577                 return -EFAULT;
1578
1579         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1580 }
1581
1582 #endif
1583
1584 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1585                                                 unsigned long addr)
1586 {
1587         struct mempolicy *pol = NULL;
1588
1589         if (vma) {
1590                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1591                         pol = vma->vm_ops->get_policy(vma, addr);
1592                 } else if (vma->vm_policy) {
1593                         pol = vma->vm_policy;
1594
1595                         /*
1596                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1597                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1598                          * count on these policies which will be dropped by
1599                          * mpol_cond_put() later
1600                          */
1601                         if (mpol_needs_cond_ref(pol))
1602                                 mpol_get(pol);
1603                 }
1604         }
1605
1606         return pol;
1607 }
1608
1609 /*
1610  * get_vma_policy(@vma, @addr)
1611  * @vma: virtual memory area whose policy is sought
1612  * @addr: address in @vma for shared policy lookup
1613  *
1614  * Returns effective policy for a VMA at specified address.
1615  * Falls back to current->mempolicy or system default policy, as necessary.
1616  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1617  * count--added by the get_policy() vm_op, as appropriate--to protect against
1618  * freeing by another task.  It is the caller's responsibility to free the
1619  * extra reference for shared policies.
1620  */
1621 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1622                                                 unsigned long addr)
1623 {
1624         struct mempolicy *pol = __get_vma_policy(vma, addr);
1625
1626         if (!pol)
1627                 pol = get_task_policy(current);
1628
1629         return pol;
1630 }
1631
1632 bool vma_policy_mof(struct vm_area_struct *vma)
1633 {
1634         struct mempolicy *pol;
1635
1636         if (vma->vm_ops && vma->vm_ops->get_policy) {
1637                 bool ret = false;
1638
1639                 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1640                 if (pol && (pol->flags & MPOL_F_MOF))
1641                         ret = true;
1642                 mpol_cond_put(pol);
1643
1644                 return ret;
1645         }
1646
1647         pol = vma->vm_policy;
1648         if (!pol)
1649                 pol = get_task_policy(current);
1650
1651         return pol->flags & MPOL_F_MOF;
1652 }
1653
1654 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1655 {
1656         enum zone_type dynamic_policy_zone = policy_zone;
1657
1658         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1659
1660         /*
1661          * if policy->v.nodes has movable memory only,
1662          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1663          *
1664          * policy->v.nodes is intersect with node_states[N_MEMORY].
1665          * so if the following test faile, it implies
1666          * policy->v.nodes has movable memory only.
1667          */
1668         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1669                 dynamic_policy_zone = ZONE_MOVABLE;
1670
1671         return zone >= dynamic_policy_zone;
1672 }
1673
1674 /*
1675  * Return a nodemask representing a mempolicy for filtering nodes for
1676  * page allocation
1677  */
1678 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1679 {
1680         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1681         if (unlikely(policy->mode == MPOL_BIND) &&
1682                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1683                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1684                 return &policy->v.nodes;
1685
1686         return NULL;
1687 }
1688
1689 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1690 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1691         int nd)
1692 {
1693         switch (policy->mode) {
1694         case MPOL_PREFERRED:
1695                 if (!(policy->flags & MPOL_F_LOCAL))
1696                         nd = policy->v.preferred_node;
1697                 break;
1698         case MPOL_BIND:
1699                 /*
1700                  * Normally, MPOL_BIND allocations are node-local within the
1701                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1702                  * current node isn't part of the mask, we use the zonelist for
1703                  * the first node in the mask instead.
1704                  */
1705                 if (unlikely(gfp & __GFP_THISNODE) &&
1706                                 unlikely(!node_isset(nd, policy->v.nodes)))
1707                         nd = first_node(policy->v.nodes);
1708                 break;
1709         default:
1710                 BUG();
1711         }
1712         return node_zonelist(nd, gfp);
1713 }
1714
1715 /* Do dynamic interleaving for a process */
1716 static unsigned interleave_nodes(struct mempolicy *policy)
1717 {
1718         unsigned nid, next;
1719         struct task_struct *me = current;
1720
1721         nid = me->il_next;
1722         next = next_node(nid, policy->v.nodes);
1723         if (next >= MAX_NUMNODES)
1724                 next = first_node(policy->v.nodes);
1725         if (next < MAX_NUMNODES)
1726                 me->il_next = next;
1727         return nid;
1728 }
1729
1730 /*
1731  * Depending on the memory policy provide a node from which to allocate the
1732  * next slab entry.
1733  */
1734 unsigned int mempolicy_slab_node(void)
1735 {
1736         struct mempolicy *policy;
1737         int node = numa_mem_id();
1738
1739         if (in_interrupt())
1740                 return node;
1741
1742         policy = current->mempolicy;
1743         if (!policy || policy->flags & MPOL_F_LOCAL)
1744                 return node;
1745
1746         switch (policy->mode) {
1747         case MPOL_PREFERRED:
1748                 /*
1749                  * handled MPOL_F_LOCAL above
1750                  */
1751                 return policy->v.preferred_node;
1752
1753         case MPOL_INTERLEAVE:
1754                 return interleave_nodes(policy);
1755
1756         case MPOL_BIND: {
1757                 /*
1758                  * Follow bind policy behavior and start allocation at the
1759                  * first node.
1760                  */
1761                 struct zonelist *zonelist;
1762                 struct zone *zone;
1763                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1764                 zonelist = &NODE_DATA(node)->node_zonelists[0];
1765                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1766                                                         &policy->v.nodes,
1767                                                         &zone);
1768                 return zone ? zone->node : node;
1769         }
1770
1771         default:
1772                 BUG();
1773         }
1774 }
1775
1776 /* Do static interleaving for a VMA with known offset. */
1777 static unsigned offset_il_node(struct mempolicy *pol,
1778                 struct vm_area_struct *vma, unsigned long off)
1779 {
1780         unsigned nnodes = nodes_weight(pol->v.nodes);
1781         unsigned target;
1782         int c;
1783         int nid = NUMA_NO_NODE;
1784
1785         if (!nnodes)
1786                 return numa_node_id();
1787         target = (unsigned int)off % nnodes;
1788         c = 0;
1789         do {
1790                 nid = next_node(nid, pol->v.nodes);
1791                 c++;
1792         } while (c <= target);
1793         return nid;
1794 }
1795
1796 /* Determine a node number for interleave */
1797 static inline unsigned interleave_nid(struct mempolicy *pol,
1798                  struct vm_area_struct *vma, unsigned long addr, int shift)
1799 {
1800         if (vma) {
1801                 unsigned long off;
1802
1803                 /*
1804                  * for small pages, there is no difference between
1805                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1806                  * for huge pages, since vm_pgoff is in units of small
1807                  * pages, we need to shift off the always 0 bits to get
1808                  * a useful offset.
1809                  */
1810                 BUG_ON(shift < PAGE_SHIFT);
1811                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1812                 off += (addr - vma->vm_start) >> shift;
1813                 return offset_il_node(pol, vma, off);
1814         } else
1815                 return interleave_nodes(pol);
1816 }
1817
1818 /*
1819  * Return the bit number of a random bit set in the nodemask.
1820  * (returns NUMA_NO_NODE if nodemask is empty)
1821  */
1822 int node_random(const nodemask_t *maskp)
1823 {
1824         int w, bit = NUMA_NO_NODE;
1825
1826         w = nodes_weight(*maskp);
1827         if (w)
1828                 bit = bitmap_ord_to_pos(maskp->bits,
1829                         get_random_int() % w, MAX_NUMNODES);
1830         return bit;
1831 }
1832
1833 #ifdef CONFIG_HUGETLBFS
1834 /*
1835  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1836  * @vma: virtual memory area whose policy is sought
1837  * @addr: address in @vma for shared policy lookup and interleave policy
1838  * @gfp_flags: for requested zone
1839  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1840  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1841  *
1842  * Returns a zonelist suitable for a huge page allocation and a pointer
1843  * to the struct mempolicy for conditional unref after allocation.
1844  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1845  * @nodemask for filtering the zonelist.
1846  *
1847  * Must be protected by read_mems_allowed_begin()
1848  */
1849 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1850                                 gfp_t gfp_flags, struct mempolicy **mpol,
1851                                 nodemask_t **nodemask)
1852 {
1853         struct zonelist *zl;
1854
1855         *mpol = get_vma_policy(vma, addr);
1856         *nodemask = NULL;       /* assume !MPOL_BIND */
1857
1858         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1859                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1860                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1861         } else {
1862                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1863                 if ((*mpol)->mode == MPOL_BIND)
1864                         *nodemask = &(*mpol)->v.nodes;
1865         }
1866         return zl;
1867 }
1868
1869 /*
1870  * init_nodemask_of_mempolicy
1871  *
1872  * If the current task's mempolicy is "default" [NULL], return 'false'
1873  * to indicate default policy.  Otherwise, extract the policy nodemask
1874  * for 'bind' or 'interleave' policy into the argument nodemask, or
1875  * initialize the argument nodemask to contain the single node for
1876  * 'preferred' or 'local' policy and return 'true' to indicate presence
1877  * of non-default mempolicy.
1878  *
1879  * We don't bother with reference counting the mempolicy [mpol_get/put]
1880  * because the current task is examining it's own mempolicy and a task's
1881  * mempolicy is only ever changed by the task itself.
1882  *
1883  * N.B., it is the caller's responsibility to free a returned nodemask.
1884  */
1885 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1886 {
1887         struct mempolicy *mempolicy;
1888         int nid;
1889
1890         if (!(mask && current->mempolicy))
1891                 return false;
1892
1893         task_lock(current);
1894         mempolicy = current->mempolicy;
1895         switch (mempolicy->mode) {
1896         case MPOL_PREFERRED:
1897                 if (mempolicy->flags & MPOL_F_LOCAL)
1898                         nid = numa_node_id();
1899                 else
1900                         nid = mempolicy->v.preferred_node;
1901                 init_nodemask_of_node(mask, nid);
1902                 break;
1903
1904         case MPOL_BIND:
1905                 /* Fall through */
1906         case MPOL_INTERLEAVE:
1907                 *mask =  mempolicy->v.nodes;
1908                 break;
1909
1910         default:
1911                 BUG();
1912         }
1913         task_unlock(current);
1914
1915         return true;
1916 }
1917 #endif
1918
1919 /*
1920  * mempolicy_nodemask_intersects
1921  *
1922  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1923  * policy.  Otherwise, check for intersection between mask and the policy
1924  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1925  * policy, always return true since it may allocate elsewhere on fallback.
1926  *
1927  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1928  */
1929 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1930                                         const nodemask_t *mask)
1931 {
1932         struct mempolicy *mempolicy;
1933         bool ret = true;
1934
1935         if (!mask)
1936                 return ret;
1937         task_lock(tsk);
1938         mempolicy = tsk->mempolicy;
1939         if (!mempolicy)
1940                 goto out;
1941
1942         switch (mempolicy->mode) {
1943         case MPOL_PREFERRED:
1944                 /*
1945                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1946                  * allocate from, they may fallback to other nodes when oom.
1947                  * Thus, it's possible for tsk to have allocated memory from
1948                  * nodes in mask.
1949                  */
1950                 break;
1951         case MPOL_BIND:
1952         case MPOL_INTERLEAVE:
1953                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1954                 break;
1955         default:
1956                 BUG();
1957         }
1958 out:
1959         task_unlock(tsk);
1960         return ret;
1961 }
1962
1963 /* Allocate a page in interleaved policy.
1964    Own path because it needs to do special accounting. */
1965 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1966                                         unsigned nid)
1967 {
1968         struct zonelist *zl;
1969         struct page *page;
1970
1971         zl = node_zonelist(nid, gfp);
1972         page = __alloc_pages(gfp, order, zl);
1973         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1974                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1975         return page;
1976 }
1977
1978 /**
1979  *      alloc_pages_vma - Allocate a page for a VMA.
1980  *
1981  *      @gfp:
1982  *      %GFP_USER    user allocation.
1983  *      %GFP_KERNEL  kernel allocations,
1984  *      %GFP_HIGHMEM highmem/user allocations,
1985  *      %GFP_FS      allocation should not call back into a file system.
1986  *      %GFP_ATOMIC  don't sleep.
1987  *
1988  *      @order:Order of the GFP allocation.
1989  *      @vma:  Pointer to VMA or NULL if not available.
1990  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1991  *      @node: Which node to prefer for allocation (modulo policy).
1992  *      @hugepage: for hugepages try only the preferred node if possible
1993  *
1994  *      This function allocates a page from the kernel page pool and applies
1995  *      a NUMA policy associated with the VMA or the current process.
1996  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1997  *      mm_struct of the VMA to prevent it from going away. Should be used for
1998  *      all allocations for pages that will be mapped into user space. Returns
1999  *      NULL when no page can be allocated.
2000  */
2001 struct page *
2002 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2003                 unsigned long addr, int node, bool hugepage)
2004 {
2005         struct mempolicy *pol;
2006         struct page *page;
2007         unsigned int cpuset_mems_cookie;
2008         struct zonelist *zl;
2009         nodemask_t *nmask;
2010
2011 retry_cpuset:
2012         pol = get_vma_policy(vma, addr);
2013         cpuset_mems_cookie = read_mems_allowed_begin();
2014
2015         if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
2016                                         pol->mode != MPOL_INTERLEAVE)) {
2017                 /*
2018                  * For hugepage allocation and non-interleave policy which
2019                  * allows the current node, we only try to allocate from the
2020                  * current node and don't fall back to other nodes, as the
2021                  * cost of remote accesses would likely offset THP benefits.
2022                  *
2023                  * If the policy is interleave, or does not allow the current
2024                  * node in its nodemask, we allocate the standard way.
2025                  */
2026                 nmask = policy_nodemask(gfp, pol);
2027                 if (!nmask || node_isset(node, *nmask)) {
2028                         mpol_cond_put(pol);
2029                         page = alloc_pages_exact_node(node, gfp, order);
2030                         goto out;
2031                 }
2032         }
2033
2034         if (pol->mode == MPOL_INTERLEAVE) {
2035                 unsigned nid;
2036
2037                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2038                 mpol_cond_put(pol);
2039                 page = alloc_page_interleave(gfp, order, nid);
2040                 goto out;
2041         }
2042
2043         nmask = policy_nodemask(gfp, pol);
2044         zl = policy_zonelist(gfp, pol, node);
2045         mpol_cond_put(pol);
2046         page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2047 out:
2048         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2049                 goto retry_cpuset;
2050         return page;
2051 }
2052
2053 /**
2054  *      alloc_pages_current - Allocate pages.
2055  *
2056  *      @gfp:
2057  *              %GFP_USER   user allocation,
2058  *              %GFP_KERNEL kernel allocation,
2059  *              %GFP_HIGHMEM highmem allocation,
2060  *              %GFP_FS     don't call back into a file system.
2061  *              %GFP_ATOMIC don't sleep.
2062  *      @order: Power of two of allocation size in pages. 0 is a single page.
2063  *
2064  *      Allocate a page from the kernel page pool.  When not in
2065  *      interrupt context and apply the current process NUMA policy.
2066  *      Returns NULL when no page can be allocated.
2067  *
2068  *      Don't call cpuset_update_task_memory_state() unless
2069  *      1) it's ok to take cpuset_sem (can WAIT), and
2070  *      2) allocating for current task (not interrupt).
2071  */
2072 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2073 {
2074         struct mempolicy *pol = &default_policy;
2075         struct page *page;
2076         unsigned int cpuset_mems_cookie;
2077
2078         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2079                 pol = get_task_policy(current);
2080
2081 retry_cpuset:
2082         cpuset_mems_cookie = read_mems_allowed_begin();
2083
2084         /*
2085          * No reference counting needed for current->mempolicy
2086          * nor system default_policy
2087          */
2088         if (pol->mode == MPOL_INTERLEAVE)
2089                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2090         else
2091                 page = __alloc_pages_nodemask(gfp, order,
2092                                 policy_zonelist(gfp, pol, numa_node_id()),
2093                                 policy_nodemask(gfp, pol));
2094
2095         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2096                 goto retry_cpuset;
2097
2098         return page;
2099 }
2100 EXPORT_SYMBOL(alloc_pages_current);
2101
2102 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2103 {
2104         struct mempolicy *pol = mpol_dup(vma_policy(src));
2105
2106         if (IS_ERR(pol))
2107                 return PTR_ERR(pol);
2108         dst->vm_policy = pol;
2109         return 0;
2110 }
2111
2112 /*
2113  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2114  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2115  * with the mems_allowed returned by cpuset_mems_allowed().  This
2116  * keeps mempolicies cpuset relative after its cpuset moves.  See
2117  * further kernel/cpuset.c update_nodemask().
2118  *
2119  * current's mempolicy may be rebinded by the other task(the task that changes
2120  * cpuset's mems), so we needn't do rebind work for current task.
2121  */
2122
2123 /* Slow path of a mempolicy duplicate */
2124 struct mempolicy *__mpol_dup(struct mempolicy *old)
2125 {
2126         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2127
2128         if (!new)
2129                 return ERR_PTR(-ENOMEM);
2130
2131         /* task's mempolicy is protected by alloc_lock */
2132         if (old == current->mempolicy) {
2133                 task_lock(current);
2134                 *new = *old;
2135                 task_unlock(current);
2136         } else
2137                 *new = *old;
2138
2139         if (current_cpuset_is_being_rebound()) {
2140                 nodemask_t mems = cpuset_mems_allowed(current);
2141                 if (new->flags & MPOL_F_REBINDING)
2142                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2143                 else
2144                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2145         }
2146         atomic_set(&new->refcnt, 1);
2147         return new;
2148 }
2149
2150 /* Slow path of a mempolicy comparison */
2151 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2152 {
2153         if (!a || !b)
2154                 return false;
2155         if (a->mode != b->mode)
2156                 return false;
2157         if (a->flags != b->flags)
2158                 return false;
2159         if (mpol_store_user_nodemask(a))
2160                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2161                         return false;
2162
2163         switch (a->mode) {
2164         case MPOL_BIND:
2165                 /* Fall through */
2166         case MPOL_INTERLEAVE:
2167                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2168         case MPOL_PREFERRED:
2169                 return a->v.preferred_node == b->v.preferred_node;
2170         default:
2171                 BUG();
2172                 return false;
2173         }
2174 }
2175
2176 /*
2177  * Shared memory backing store policy support.
2178  *
2179  * Remember policies even when nobody has shared memory mapped.
2180  * The policies are kept in Red-Black tree linked from the inode.
2181  * They are protected by the sp->lock spinlock, which should be held
2182  * for any accesses to the tree.
2183  */
2184
2185 /* lookup first element intersecting start-end */
2186 /* Caller holds sp->lock */
2187 static struct sp_node *
2188 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2189 {
2190         struct rb_node *n = sp->root.rb_node;
2191
2192         while (n) {
2193                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2194
2195                 if (start >= p->end)
2196                         n = n->rb_right;
2197                 else if (end <= p->start)
2198                         n = n->rb_left;
2199                 else
2200                         break;
2201         }
2202         if (!n)
2203                 return NULL;
2204         for (;;) {
2205                 struct sp_node *w = NULL;
2206                 struct rb_node *prev = rb_prev(n);
2207                 if (!prev)
2208                         break;
2209                 w = rb_entry(prev, struct sp_node, nd);
2210                 if (w->end <= start)
2211                         break;
2212                 n = prev;
2213         }
2214         return rb_entry(n, struct sp_node, nd);
2215 }
2216
2217 /* Insert a new shared policy into the list. */
2218 /* Caller holds sp->lock */
2219 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2220 {
2221         struct rb_node **p = &sp->root.rb_node;
2222         struct rb_node *parent = NULL;
2223         struct sp_node *nd;
2224
2225         while (*p) {
2226                 parent = *p;
2227                 nd = rb_entry(parent, struct sp_node, nd);
2228                 if (new->start < nd->start)
2229                         p = &(*p)->rb_left;
2230                 else if (new->end > nd->end)
2231                         p = &(*p)->rb_right;
2232                 else
2233                         BUG();
2234         }
2235         rb_link_node(&new->nd, parent, p);
2236         rb_insert_color(&new->nd, &sp->root);
2237         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2238                  new->policy ? new->policy->mode : 0);
2239 }
2240
2241 /* Find shared policy intersecting idx */
2242 struct mempolicy *
2243 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2244 {
2245         struct mempolicy *pol = NULL;
2246         struct sp_node *sn;
2247
2248         if (!sp->root.rb_node)
2249                 return NULL;
2250         spin_lock(&sp->lock);
2251         sn = sp_lookup(sp, idx, idx+1);
2252         if (sn) {
2253                 mpol_get(sn->policy);
2254                 pol = sn->policy;
2255         }
2256         spin_unlock(&sp->lock);
2257         return pol;
2258 }
2259
2260 static void sp_free(struct sp_node *n)
2261 {
2262         mpol_put(n->policy);
2263         kmem_cache_free(sn_cache, n);
2264 }
2265
2266 /**
2267  * mpol_misplaced - check whether current page node is valid in policy
2268  *
2269  * @page: page to be checked
2270  * @vma: vm area where page mapped
2271  * @addr: virtual address where page mapped
2272  *
2273  * Lookup current policy node id for vma,addr and "compare to" page's
2274  * node id.
2275  *
2276  * Returns:
2277  *      -1      - not misplaced, page is in the right node
2278  *      node    - node id where the page should be
2279  *
2280  * Policy determination "mimics" alloc_page_vma().
2281  * Called from fault path where we know the vma and faulting address.
2282  */
2283 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2284 {
2285         struct mempolicy *pol;
2286         struct zone *zone;
2287         int curnid = page_to_nid(page);
2288         unsigned long pgoff;
2289         int thiscpu = raw_smp_processor_id();
2290         int thisnid = cpu_to_node(thiscpu);
2291         int polnid = -1;
2292         int ret = -1;
2293
2294         BUG_ON(!vma);
2295
2296         pol = get_vma_policy(vma, addr);
2297         if (!(pol->flags & MPOL_F_MOF))
2298                 goto out;
2299
2300         switch (pol->mode) {
2301         case MPOL_INTERLEAVE:
2302                 BUG_ON(addr >= vma->vm_end);
2303                 BUG_ON(addr < vma->vm_start);
2304
2305                 pgoff = vma->vm_pgoff;
2306                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2307                 polnid = offset_il_node(pol, vma, pgoff);
2308                 break;
2309
2310         case MPOL_PREFERRED:
2311                 if (pol->flags & MPOL_F_LOCAL)
2312                         polnid = numa_node_id();
2313                 else
2314                         polnid = pol->v.preferred_node;
2315                 break;
2316
2317         case MPOL_BIND:
2318                 /*
2319                  * allows binding to multiple nodes.
2320                  * use current page if in policy nodemask,
2321                  * else select nearest allowed node, if any.
2322                  * If no allowed nodes, use current [!misplaced].
2323                  */
2324                 if (node_isset(curnid, pol->v.nodes))
2325                         goto out;
2326                 (void)first_zones_zonelist(
2327                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2328                                 gfp_zone(GFP_HIGHUSER),
2329                                 &pol->v.nodes, &zone);
2330                 polnid = zone->node;
2331                 break;
2332
2333         default:
2334                 BUG();
2335         }
2336
2337         /* Migrate the page towards the node whose CPU is referencing it */
2338         if (pol->flags & MPOL_F_MORON) {
2339                 polnid = thisnid;
2340
2341                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2342                         goto out;
2343         }
2344
2345         if (curnid != polnid)
2346                 ret = polnid;
2347 out:
2348         mpol_cond_put(pol);
2349
2350         return ret;
2351 }
2352
2353 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2354 {
2355         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2356         rb_erase(&n->nd, &sp->root);
2357         sp_free(n);
2358 }
2359
2360 static void sp_node_init(struct sp_node *node, unsigned long start,
2361                         unsigned long end, struct mempolicy *pol)
2362 {
2363         node->start = start;
2364         node->end = end;
2365         node->policy = pol;
2366 }
2367
2368 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2369                                 struct mempolicy *pol)
2370 {
2371         struct sp_node *n;
2372         struct mempolicy *newpol;
2373
2374         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2375         if (!n)
2376                 return NULL;
2377
2378         newpol = mpol_dup(pol);
2379         if (IS_ERR(newpol)) {
2380                 kmem_cache_free(sn_cache, n);
2381                 return NULL;
2382         }
2383         newpol->flags |= MPOL_F_SHARED;
2384         sp_node_init(n, start, end, newpol);
2385
2386         return n;
2387 }
2388
2389 /* Replace a policy range. */
2390 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2391                                  unsigned long end, struct sp_node *new)
2392 {
2393         struct sp_node *n;
2394         struct sp_node *n_new = NULL;
2395         struct mempolicy *mpol_new = NULL;
2396         int ret = 0;
2397
2398 restart:
2399         spin_lock(&sp->lock);
2400         n = sp_lookup(sp, start, end);
2401         /* Take care of old policies in the same range. */
2402         while (n && n->start < end) {
2403                 struct rb_node *next = rb_next(&n->nd);
2404                 if (n->start >= start) {
2405                         if (n->end <= end)
2406                                 sp_delete(sp, n);
2407                         else
2408                                 n->start = end;
2409                 } else {
2410                         /* Old policy spanning whole new range. */
2411                         if (n->end > end) {
2412                                 if (!n_new)
2413                                         goto alloc_new;
2414
2415                                 *mpol_new = *n->policy;
2416                                 atomic_set(&mpol_new->refcnt, 1);
2417                                 sp_node_init(n_new, end, n->end, mpol_new);
2418                                 n->end = start;
2419                                 sp_insert(sp, n_new);
2420                                 n_new = NULL;
2421                                 mpol_new = NULL;
2422                                 break;
2423                         } else
2424                                 n->end = start;
2425                 }
2426                 if (!next)
2427                         break;
2428                 n = rb_entry(next, struct sp_node, nd);
2429         }
2430         if (new)
2431                 sp_insert(sp, new);
2432         spin_unlock(&sp->lock);
2433         ret = 0;
2434
2435 err_out:
2436         if (mpol_new)
2437                 mpol_put(mpol_new);
2438         if (n_new)
2439                 kmem_cache_free(sn_cache, n_new);
2440
2441         return ret;
2442
2443 alloc_new:
2444         spin_unlock(&sp->lock);
2445         ret = -ENOMEM;
2446         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2447         if (!n_new)
2448                 goto err_out;
2449         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2450         if (!mpol_new)
2451                 goto err_out;
2452         goto restart;
2453 }
2454
2455 /**
2456  * mpol_shared_policy_init - initialize shared policy for inode
2457  * @sp: pointer to inode shared policy
2458  * @mpol:  struct mempolicy to install
2459  *
2460  * Install non-NULL @mpol in inode's shared policy rb-tree.
2461  * On entry, the current task has a reference on a non-NULL @mpol.
2462  * This must be released on exit.
2463  * This is called at get_inode() calls and we can use GFP_KERNEL.
2464  */
2465 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2466 {
2467         int ret;
2468
2469         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2470         spin_lock_init(&sp->lock);
2471
2472         if (mpol) {
2473                 struct vm_area_struct pvma;
2474                 struct mempolicy *new;
2475                 NODEMASK_SCRATCH(scratch);
2476
2477                 if (!scratch)
2478                         goto put_mpol;
2479                 /* contextualize the tmpfs mount point mempolicy */
2480                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2481                 if (IS_ERR(new))
2482                         goto free_scratch; /* no valid nodemask intersection */
2483
2484                 task_lock(current);
2485                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2486                 task_unlock(current);
2487                 if (ret)
2488                         goto put_new;
2489
2490                 /* Create pseudo-vma that contains just the policy */
2491                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2492                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2493                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2494
2495 put_new:
2496                 mpol_put(new);                  /* drop initial ref */
2497 free_scratch:
2498                 NODEMASK_SCRATCH_FREE(scratch);
2499 put_mpol:
2500                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2501         }
2502 }
2503
2504 int mpol_set_shared_policy(struct shared_policy *info,
2505                         struct vm_area_struct *vma, struct mempolicy *npol)
2506 {
2507         int err;
2508         struct sp_node *new = NULL;
2509         unsigned long sz = vma_pages(vma);
2510
2511         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2512                  vma->vm_pgoff,
2513                  sz, npol ? npol->mode : -1,
2514                  npol ? npol->flags : -1,
2515                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2516
2517         if (npol) {
2518                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2519                 if (!new)
2520                         return -ENOMEM;
2521         }
2522         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2523         if (err && new)
2524                 sp_free(new);
2525         return err;
2526 }
2527
2528 /* Free a backing policy store on inode delete. */
2529 void mpol_free_shared_policy(struct shared_policy *p)
2530 {
2531         struct sp_node *n;
2532         struct rb_node *next;
2533
2534         if (!p->root.rb_node)
2535                 return;
2536         spin_lock(&p->lock);
2537         next = rb_first(&p->root);
2538         while (next) {
2539                 n = rb_entry(next, struct sp_node, nd);
2540                 next = rb_next(&n->nd);
2541                 sp_delete(p, n);
2542         }
2543         spin_unlock(&p->lock);
2544 }
2545
2546 #ifdef CONFIG_NUMA_BALANCING
2547 static int __initdata numabalancing_override;
2548
2549 static void __init check_numabalancing_enable(void)
2550 {
2551         bool numabalancing_default = false;
2552
2553         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2554                 numabalancing_default = true;
2555
2556         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2557         if (numabalancing_override)
2558                 set_numabalancing_state(numabalancing_override == 1);
2559
2560         if (nr_node_ids > 1 && !numabalancing_override) {
2561                 pr_info("%s automatic NUMA balancing. "
2562                         "Configure with numa_balancing= or the "
2563                         "kernel.numa_balancing sysctl",
2564                         numabalancing_default ? "Enabling" : "Disabling");
2565                 set_numabalancing_state(numabalancing_default);
2566         }
2567 }
2568
2569 static int __init setup_numabalancing(char *str)
2570 {
2571         int ret = 0;
2572         if (!str)
2573                 goto out;
2574
2575         if (!strcmp(str, "enable")) {
2576                 numabalancing_override = 1;
2577                 ret = 1;
2578         } else if (!strcmp(str, "disable")) {
2579                 numabalancing_override = -1;
2580                 ret = 1;
2581         }
2582 out:
2583         if (!ret)
2584                 pr_warn("Unable to parse numa_balancing=\n");
2585
2586         return ret;
2587 }
2588 __setup("numa_balancing=", setup_numabalancing);
2589 #else
2590 static inline void __init check_numabalancing_enable(void)
2591 {
2592 }
2593 #endif /* CONFIG_NUMA_BALANCING */
2594
2595 /* assumes fs == KERNEL_DS */
2596 void __init numa_policy_init(void)
2597 {
2598         nodemask_t interleave_nodes;
2599         unsigned long largest = 0;
2600         int nid, prefer = 0;
2601
2602         policy_cache = kmem_cache_create("numa_policy",
2603                                          sizeof(struct mempolicy),
2604                                          0, SLAB_PANIC, NULL);
2605
2606         sn_cache = kmem_cache_create("shared_policy_node",
2607                                      sizeof(struct sp_node),
2608                                      0, SLAB_PANIC, NULL);
2609
2610         for_each_node(nid) {
2611                 preferred_node_policy[nid] = (struct mempolicy) {
2612                         .refcnt = ATOMIC_INIT(1),
2613                         .mode = MPOL_PREFERRED,
2614                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2615                         .v = { .preferred_node = nid, },
2616                 };
2617         }
2618
2619         /*
2620          * Set interleaving policy for system init. Interleaving is only
2621          * enabled across suitably sized nodes (default is >= 16MB), or
2622          * fall back to the largest node if they're all smaller.
2623          */
2624         nodes_clear(interleave_nodes);
2625         for_each_node_state(nid, N_MEMORY) {
2626                 unsigned long total_pages = node_present_pages(nid);
2627
2628                 /* Preserve the largest node */
2629                 if (largest < total_pages) {
2630                         largest = total_pages;
2631                         prefer = nid;
2632                 }
2633
2634                 /* Interleave this node? */
2635                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2636                         node_set(nid, interleave_nodes);
2637         }
2638
2639         /* All too small, use the largest */
2640         if (unlikely(nodes_empty(interleave_nodes)))
2641                 node_set(prefer, interleave_nodes);
2642
2643         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2644                 pr_err("%s: interleaving failed\n", __func__);
2645
2646         check_numabalancing_enable();
2647 }
2648
2649 /* Reset policy of current process to default */
2650 void numa_default_policy(void)
2651 {
2652         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2653 }
2654
2655 /*
2656  * Parse and format mempolicy from/to strings
2657  */
2658
2659 /*
2660  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2661  */
2662 static const char * const policy_modes[] =
2663 {
2664         [MPOL_DEFAULT]    = "default",
2665         [MPOL_PREFERRED]  = "prefer",
2666         [MPOL_BIND]       = "bind",
2667         [MPOL_INTERLEAVE] = "interleave",
2668         [MPOL_LOCAL]      = "local",
2669 };
2670
2671
2672 #ifdef CONFIG_TMPFS
2673 /**
2674  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2675  * @str:  string containing mempolicy to parse
2676  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2677  *
2678  * Format of input:
2679  *      <mode>[=<flags>][:<nodelist>]
2680  *
2681  * On success, returns 0, else 1
2682  */
2683 int mpol_parse_str(char *str, struct mempolicy **mpol)
2684 {
2685         struct mempolicy *new = NULL;
2686         unsigned short mode;
2687         unsigned short mode_flags;
2688         nodemask_t nodes;
2689         char *nodelist = strchr(str, ':');
2690         char *flags = strchr(str, '=');
2691         int err = 1;
2692
2693         if (nodelist) {
2694                 /* NUL-terminate mode or flags string */
2695                 *nodelist++ = '\0';
2696                 if (nodelist_parse(nodelist, nodes))
2697                         goto out;
2698                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2699                         goto out;
2700         } else
2701                 nodes_clear(nodes);
2702
2703         if (flags)
2704                 *flags++ = '\0';        /* terminate mode string */
2705
2706         for (mode = 0; mode < MPOL_MAX; mode++) {
2707                 if (!strcmp(str, policy_modes[mode])) {
2708                         break;
2709                 }
2710         }
2711         if (mode >= MPOL_MAX)
2712                 goto out;
2713
2714         switch (mode) {
2715         case MPOL_PREFERRED:
2716                 /*
2717                  * Insist on a nodelist of one node only
2718                  */
2719                 if (nodelist) {
2720                         char *rest = nodelist;
2721                         while (isdigit(*rest))
2722                                 rest++;
2723                         if (*rest)
2724                                 goto out;
2725                 }
2726                 break;
2727         case MPOL_INTERLEAVE:
2728                 /*
2729                  * Default to online nodes with memory if no nodelist
2730                  */
2731                 if (!nodelist)
2732                         nodes = node_states[N_MEMORY];
2733                 break;
2734         case MPOL_LOCAL:
2735                 /*
2736                  * Don't allow a nodelist;  mpol_new() checks flags
2737                  */
2738                 if (nodelist)
2739                         goto out;
2740                 mode = MPOL_PREFERRED;
2741                 break;
2742         case MPOL_DEFAULT:
2743                 /*
2744                  * Insist on a empty nodelist
2745                  */
2746                 if (!nodelist)
2747                         err = 0;
2748                 goto out;
2749         case MPOL_BIND:
2750                 /*
2751                  * Insist on a nodelist
2752                  */
2753                 if (!nodelist)
2754                         goto out;
2755         }
2756
2757         mode_flags = 0;
2758         if (flags) {
2759                 /*
2760                  * Currently, we only support two mutually exclusive
2761                  * mode flags.
2762                  */
2763                 if (!strcmp(flags, "static"))
2764                         mode_flags |= MPOL_F_STATIC_NODES;
2765                 else if (!strcmp(flags, "relative"))
2766                         mode_flags |= MPOL_F_RELATIVE_NODES;
2767                 else
2768                         goto out;
2769         }
2770
2771         new = mpol_new(mode, mode_flags, &nodes);
2772         if (IS_ERR(new))
2773                 goto out;
2774
2775         /*
2776          * Save nodes for mpol_to_str() to show the tmpfs mount options
2777          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2778          */
2779         if (mode != MPOL_PREFERRED)
2780                 new->v.nodes = nodes;
2781         else if (nodelist)
2782                 new->v.preferred_node = first_node(nodes);
2783         else
2784                 new->flags |= MPOL_F_LOCAL;
2785
2786         /*
2787          * Save nodes for contextualization: this will be used to "clone"
2788          * the mempolicy in a specific context [cpuset] at a later time.
2789          */
2790         new->w.user_nodemask = nodes;
2791
2792         err = 0;
2793
2794 out:
2795         /* Restore string for error message */
2796         if (nodelist)
2797                 *--nodelist = ':';
2798         if (flags)
2799                 *--flags = '=';
2800         if (!err)
2801                 *mpol = new;
2802         return err;
2803 }
2804 #endif /* CONFIG_TMPFS */
2805
2806 /**
2807  * mpol_to_str - format a mempolicy structure for printing
2808  * @buffer:  to contain formatted mempolicy string
2809  * @maxlen:  length of @buffer
2810  * @pol:  pointer to mempolicy to be formatted
2811  *
2812  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2813  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2814  * longest flag, "relative", and to display at least a few node ids.
2815  */
2816 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2817 {
2818         char *p = buffer;
2819         nodemask_t nodes = NODE_MASK_NONE;
2820         unsigned short mode = MPOL_DEFAULT;
2821         unsigned short flags = 0;
2822
2823         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2824                 mode = pol->mode;
2825                 flags = pol->flags;
2826         }
2827
2828         switch (mode) {
2829         case MPOL_DEFAULT:
2830                 break;
2831         case MPOL_PREFERRED:
2832                 if (flags & MPOL_F_LOCAL)
2833                         mode = MPOL_LOCAL;
2834                 else
2835                         node_set(pol->v.preferred_node, nodes);
2836                 break;
2837         case MPOL_BIND:
2838         case MPOL_INTERLEAVE:
2839                 nodes = pol->v.nodes;
2840                 break;
2841         default:
2842                 WARN_ON_ONCE(1);
2843                 snprintf(p, maxlen, "unknown");
2844                 return;
2845         }
2846
2847         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2848
2849         if (flags & MPOL_MODE_FLAGS) {
2850                 p += snprintf(p, buffer + maxlen - p, "=");
2851
2852                 /*
2853                  * Currently, the only defined flags are mutually exclusive
2854                  */
2855                 if (flags & MPOL_F_STATIC_NODES)
2856                         p += snprintf(p, buffer + maxlen - p, "static");
2857                 else if (flags & MPOL_F_RELATIVE_NODES)
2858                         p += snprintf(p, buffer + maxlen - p, "relative");
2859         }
2860
2861         if (!nodes_empty(nodes)) {
2862                 p += snprintf(p, buffer + maxlen - p, ":");
2863                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2864         }
2865 }