mm/swap.c

   1 /*
   2  *  linux/mm/swap.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file contains the default values for the operation of the
   9  * Linux VM subsystem. Fine-tuning documentation can be found in
  10  * Documentation/sysctl/vm.txt.
  11  * Started 18.12.91
  12  * Swap aging added 23.2.95, Stephen Tweedie.
  13  * Buffermem limits added 12.3.98, Rik van Riel.
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/sched.h>
  18 #include <linux/kernel_stat.h>
  19 #include <linux/swap.h>
  20 #include <linux/mman.h>
  21 #include <linux/pagemap.h>
  22 #include <linux/pagevec.h>
  23 #include <linux/init.h>
  24 #include <linux/export.h>
  25 #include <linux/mm_inline.h>
  26 #include <linux/percpu_counter.h>
  27 #include <linux/percpu.h>
  28 #include <linux/cpu.h>
  29 #include <linux/notifier.h>
  30 #include <linux/backing-dev.h>
  31 #include <linux/memcontrol.h>
  32 #include <linux/gfp.h>
  33 #include <linux/uio.h>
  34 #include <linux/hugetlb.h>
  35
  36 #include "internal.h"
  37
  38 /* How many pages do we try to swap or page in/out together? */
     /* Assigned by swap_setup(): 2 below 16MB of RAM, 3 otherwise. */
  39 int page_cluster;
  40
     /*
      * Per-CPU batches of pages waiting for an LRU operation:
      *  - lru_add_pvecs: one pagevec per LRU list, filled by __lru_cache_add()
      *  - lru_rotate_pvecs: filled by rotate_reclaimable_page()
      *  - lru_deactivate_pvecs: filled by deactivate_page()
      * All of them are flushed by lru_add_drain_cpu().
      */
  41 static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
  42 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
  43 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
  44
  45 /*
  46  * This path almost never happens for VM activity - pages are normally
  47  * freed via pagevecs.  But it gets used by networking.
  48  */
  49 static void __page_cache_release(struct page *page)
  50 {
  51         if (PageLRU(page)) {
  52                 struct zone *zone = page_zone(page);
  53                 struct lruvec *lruvec;
  54                 unsigned long flags;
  55
  56                 spin_lock_irqsave(&zone->lru_lock, flags);
  57                 lruvec = mem_cgroup_page_lruvec(page, zone);
  58                 VM_BUG_ON(!PageLRU(page));
  59                 __ClearPageLRU(page);
  60                 del_page_from_lru_list(page, lruvec, page_off_lru(page));
  61                 spin_unlock_irqrestore(&zone->lru_lock, flags);
  62         }
  63 }
  64
  65 static void __put_single_page(struct page *page)
  66 {
  67         __page_cache_release(page);
  68         free_hot_cold_page(page, 0);
  69 }
  70
  71 static void __put_compound_page(struct page *page)
  72 {
  73         compound_page_dtor *dtor;
  74
  75         __page_cache_release(page);
  76         dtor = get_compound_page_dtor(page);
  77         (*dtor)(page);
  78 }
  79
     /*
      * Drop one reference on @page, which belongs to a compound page.
      *
      * Three cases are handled:
      *  - hugetlbfs page: the reference is dropped on the head page.
      *  - tail page: a temporary reference is taken on the head page found
      *    through compound_trans_head() so it cannot be freed while we look
      *    at it; slab pages then skip the compound lock, everything else
      *    takes it and rechecks PageTail() in case the huge page was split.
      *  - anything else: plain put_page_testzero() on @page, freeing it as a
      *    compound or a single page depending on PageHead().
      */
  80 static void put_compound_page(struct page *page)
  81 {
  82         /*
  83          * hugetlbfs pages cannot be split from under us.  If this is a
  84          * hugetlbfs page, check refcount on head page and release the page if
  85          * the refcount becomes zero.
  86          */
  87         if (PageHuge(page)) {
  88                 page = compound_head(page);
  89                 if (put_page_testzero(page))
  90                         __put_compound_page(page);
  91
  92                 return;
  93         }
  94
  95         if (unlikely(PageTail(page))) {
  96                 /* __split_huge_page_refcount can run under us */
  97                 struct page *page_head = compound_trans_head(page);
  98
  99                 if (likely(page != page_head &&
 100                            get_page_unless_zero(page_head))) {
 101                         unsigned long flags;
 102
 103                         /*
 104                          * THP can not break up slab pages so avoid taking
 105                          * compound_lock().  Slab performs non-atomic bit ops
 106                          * on page->flags for better performance.  In particular
 107                          * slab_unlock() in slub used to be a hot path.  It is
 108                          * still hot on arches that do not support
 109                          * this_cpu_cmpxchg_double().
 110                          */
 111                         if (PageSlab(page_head)) {
 112                                 if (PageTail(page)) {
                                             /*
                                              * Drop the temporary reference taken by
                                              * get_page_unless_zero() above; it must
                                              * not be the last one.
                                              */
 113                                         if (put_page_testzero(page_head))
 114                                                 VM_BUG_ON(1);
 115
 116                                         atomic_dec(&page->_mapcount);
 117                                         goto skip_lock_tail;
 118                                 } else
 119                                         goto skip_lock;
 120                         }
 121                         /*
 122                          * page_head wasn't a dangling pointer but it
 123                          * may not be a head page anymore by the time
 124                          * we obtain the lock. That is ok as long as it
 125                          * can't be freed from under us.
 126                          */
 127                         flags = compound_lock_irqsave(page_head);
 128                         if (unlikely(!PageTail(page))) {
 129                                 /* __split_huge_page_refcount run before us */
 130                                 compound_unlock_irqrestore(page_head, flags);
                                     /*
                                      * The compound lock is not held from here on:
                                      * it was either just released or, for the slab
                                      * case jumping in, never taken.  Release the
                                      * temporary reference on page_head.
                                      */
 131 skip_lock:
 132                                 if (put_page_testzero(page_head))
 133                                         __put_single_page(page_head);
                                     /*
                                      * @page was seen not to be a tail page on every
                                      * route that reaches this label, so the put is
                                      * done on @page itself.
                                      */
 134 out_put_single:
 135                                 if (put_page_testzero(page))
 136                                         __put_single_page(page);
 137                                 return;
 138                         }
 139                         VM_BUG_ON(page_head != page->first_page);
 140                         /*
 141                          * We can release the refcount taken by
 142                          * get_page_unless_zero() now that
 143                          * __split_huge_page_refcount() is blocked on
 144                          * the compound_lock.
 145                          */
 146                         if (put_page_testzero(page_head))
 147                                 VM_BUG_ON(1);
 148                         /* __split_huge_page_refcount will wait now */
 149                         VM_BUG_ON(page_mapcount(page) <= 0);
 150                         atomic_dec(&page->_mapcount);
 151                         VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
 152                         VM_BUG_ON(atomic_read(&page->_count) != 0);
 153                         compound_unlock_irqrestore(page_head, flags);
 154
                             /*
                              * Second put on page_head (the first one above only
                              * released the get_page_unless_zero() reference).
                              * Free the page if this was the last reference.
                              */
 155 skip_lock_tail:
 156                         if (put_page_testzero(page_head)) {
 157                                 if (PageHead(page_head))
 158                                         __put_compound_page(page_head);
 159                                 else
 160                                         __put_single_page(page_head);
 161                         }
 162                 } else {
 163                         /* page_head is a dangling pointer */
 164                         VM_BUG_ON(PageTail(page));
 165                         goto out_put_single;
 166                 }
 167         } else if (put_page_testzero(page)) {
 168                 if (PageHead(page))
 169                         __put_compound_page(page);
 170                 else
 171                         __put_single_page(page);
 172         }
 173 }
 174
 175 void put_page(struct page *page)
 176 {
 177         if (unlikely(PageCompound(page)))
 178                 put_compound_page(page);
 179         else if (put_page_testzero(page))
 180                 __put_single_page(page);
 181 }
 182 EXPORT_SYMBOL(put_page);
 183
 184 /*
 185  * This function is exported but must not be called by anything other
 186  * than get_page(). It implements the slow path of get_page().
 187  */
 188 bool __get_page_tail(struct page *page)
 189 {
 190         /*
 191          * This takes care of get_page() if run on a tail page
 192          * returned by one of the get_user_pages/follow_page variants.
 193          * get_user_pages/follow_page itself doesn't need the compound
 194          * lock because it runs __get_page_tail_foll() under the
 195          * proper PT lock that already serializes against
 196          * split_huge_page().
 197          */
 198         bool got = false;
 199         struct page *page_head;
 200
 201         /*
 202          * If this is a hugetlbfs page it cannot be split under us.  Simply
 203          * increment refcount for the head page.
 204          */
 205         if (PageHuge(page)) {
 206                 page_head = compound_head(page);
 207                 atomic_inc(&page_head->_count);
 208                 got = true;
 209         } else {
 210                 unsigned long flags;
 211
 212                 page_head = compound_trans_head(page);
 213                 if (likely(page != page_head &&
 214                                         get_page_unless_zero(page_head))) {
 215
 216                         /* Ref to put_compound_page() comment. */
 217                         if (PageSlab(page_head)) {
 218                                 if (likely(PageTail(page))) {
 219                                         __get_page_tail_foll(page, false);
 220                                         return true;
 221                                 } else {
 222                                         put_page(page_head);
 223                                         return false;
 224                                 }
 225                         }
 226
 227                         /*
 228                          * page_head wasn't a dangling pointer but it
 229                          * may not be a head page anymore by the time
 230                          * we obtain the lock. That is ok as long as it
 231                          * can't be freed from under us.
 232                          */
 233                         flags = compound_lock_irqsave(page_head);
 234                         /* here __split_huge_page_refcount won't run anymore */
 235                         if (likely(PageTail(page))) {
 236                                 __get_page_tail_foll(page, false);
 237                                 got = true;
 238                         }
 239                         compound_unlock_irqrestore(page_head, flags);
 240                         if (unlikely(!got))
 241                                 put_page(page_head);
 242                 }
 243         }
 244         return got;
 245 }
 246 EXPORT_SYMBOL(__get_page_tail);
 247
 248 /**
 249  * put_pages_list() - release a list of pages
 250  * @pages: list of pages threaded on page->lru
 251  *
 252  * Release a list of pages which are strung together on page.lru.  Currently
 253  * used by read_cache_pages() and related error recovery code.
 254  */
 255 void put_pages_list(struct list_head *pages)
 256 {
 257         while (!list_empty(pages)) {
 258                 struct page *victim;
 259
 260                 victim = list_entry(pages->prev, struct page, lru);
 261                 list_del(&victim->lru);
 262                 page_cache_release(victim);
 263         }
 264 }
 265 EXPORT_SYMBOL(put_pages_list);
 266
 267 /*
 268  * get_kernel_pages() - pin kernel pages in memory
 269  * @kiov:       An array of struct kvec structures
 270  * @nr_segs:    number of segments to pin
 271  * @write:      pinning for read/write, currently ignored
 272  * @pages:      array that receives pointers to the pages pinned.
 273  *              Should be at least nr_segs long.
 274  *
 275  * Returns number of pages pinned. This may be fewer than the number
 276  * requested. If nr_pages is 0 or negative, returns 0. If no pages
 277  * were pinned, returns -errno. Each page returned must be released
 278  * with a put_page() call when it is finished with.
 279  */
 280 int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
 281                 struct page **pages)
 282 {
 283         int seg;
 284
 285         for (seg = 0; seg < nr_segs; seg++) {
 286                 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
 287                         return seg;
 288
 289                 pages[seg] = kmap_to_page(kiov[seg].iov_base);
 290                 page_cache_get(pages[seg]);
 291         }
 292
 293         return seg;
 294 }
 295 EXPORT_SYMBOL_GPL(get_kernel_pages);
 296
 297 /*
 298  * get_kernel_page() - pin a kernel page in memory
 299  * @start:      starting kernel address
 300  * @write:      pinning for read/write, currently ignored
 301  * @pages:      array that receives pointer to the page pinned.
 302  *              Must be at least nr_segs long.
 303  *
 304  * Returns 1 if page is pinned. If the page was not pinned, returns
 305  * -errno. The page returned must be released with a put_page() call
 306  * when it is finished with.
 307  */
 308 int get_kernel_page(unsigned long start, int write, struct page **pages)
 309 {
 310         const struct kvec kiov = {
 311                 .iov_base = (void *)start,
 312                 .iov_len = PAGE_SIZE
 313         };
 314
 315         return get_kernel_pages(&kiov, 1, write, pages);
 316 }
 317 EXPORT_SYMBOL_GPL(get_kernel_page);
 318
 319 static void pagevec_lru_move_fn(struct pagevec *pvec,
 320         void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
 321         void *arg)
 322 {
 323         int i;
 324         struct zone *zone = NULL;
 325         struct lruvec *lruvec;
 326         unsigned long flags = 0;
 327
 328         for (i = 0; i < pagevec_count(pvec); i++) {
 329                 struct page *page = pvec->pages[i];
 330                 struct zone *pagezone = page_zone(page);
 331
 332                 if (pagezone != zone) {
 333                         if (zone)
 334                                 spin_unlock_irqrestore(&zone->lru_lock, flags);
 335                         zone = pagezone;
 336                         spin_lock_irqsave(&zone->lru_lock, flags);
 337                 }
 338
 339                 lruvec = mem_cgroup_page_lruvec(page, zone);
 340                 (*move_fn)(page, lruvec, arg);
 341         }
 342         if (zone)
 343                 spin_unlock_irqrestore(&zone->lru_lock, flags);
 344         release_pages(pvec->pages, pvec->nr, pvec->cold);
 345         pagevec_reinit(pvec);
 346 }
 347
 348 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
 349                                  void *arg)
 350 {
 351         int *pgmoved = arg;
 352
 353         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 354                 enum lru_list lru = page_lru_base_type(page);
 355                 list_move_tail(&page->lru, &lruvec->lists[lru]);
 356                 (*pgmoved)++;
 357         }
 358 }
 359
 360 /*
 361  * pagevec_move_tail() must be called with IRQ disabled.
 362  * Otherwise this may cause nasty races.
 363  */
 364 static void pagevec_move_tail(struct pagevec *pvec)
 365 {
 366         int pgmoved = 0;
 367
 368         pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
 369         __count_vm_events(PGROTATED, pgmoved);
 370 }
 371
 372 /*
 373  * Writeback is about to end against a page which has been marked for immediate
 374  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 375  * inactive list.
 376  */
 377 void rotate_reclaimable_page(struct page *page)
 378 {
 379         if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
 380             !PageUnevictable(page) && PageLRU(page)) {
 381                 struct pagevec *pvec;
 382                 unsigned long flags;
 383
 384                 page_cache_get(page);
 385                 local_irq_save(flags);
 386                 pvec = &__get_cpu_var(lru_rotate_pvecs);
 387                 if (!pagevec_add(pvec, page))
 388                         pagevec_move_tail(pvec);
 389                 local_irq_restore(flags);
 390         }
 391 }
 392
 393 static void update_page_reclaim_stat(struct lruvec *lruvec,
 394                                      int file, int rotated)
 395 {
 396         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 397
 398         reclaim_stat->recent_scanned[file]++;
 399         if (rotated)
 400                 reclaim_stat->recent_rotated[file]++;
 401 }
 402
 403 static void __activate_page(struct page *page, struct lruvec *lruvec,
 404                             void *arg)
 405 {
 406         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 407                 int file = page_is_file_cache(page);
 408                 int lru = page_lru_base_type(page);
 409
 410                 del_page_from_lru_list(page, lruvec, lru);
 411                 SetPageActive(page);
 412                 lru += LRU_ACTIVE;
 413                 add_page_to_lru_list(page, lruvec, lru);
 414
 415                 __count_vm_event(PGACTIVATE);
 416                 update_page_reclaim_stat(lruvec, file, 1);
 417         }
 418 }
 419
 420 #ifdef CONFIG_SMP
 421 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
 422
 423 static void activate_page_drain(int cpu)
 424 {
 425         struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
 426
 427         if (pagevec_count(pvec))
 428                 pagevec_lru_move_fn(pvec, __activate_page, NULL);
 429 }
 430
 431 void activate_page(struct page *page)
 432 {
 433         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 434                 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
 435
 436                 page_cache_get(page);
 437                 if (!pagevec_add(pvec, page))
 438                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
 439                 put_cpu_var(activate_page_pvecs);
 440         }
 441 }
 442
 443 #else
 444 static inline void activate_page_drain(int cpu)
 445 {
 446 }
 447
 448 void activate_page(struct page *page)
 449 {
 450         struct zone *zone = page_zone(page);
 451
 452         spin_lock_irq(&zone->lru_lock);
 453         __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
 454         spin_unlock_irq(&zone->lru_lock);
 455 }
 456 #endif
 457
 458 /*
 459  * Mark a page as having seen activity.
 460  *
 461  * inactive,unreferenced        ->      inactive,referenced
 462  * inactive,referenced          ->      active,unreferenced
 463  * active,unreferenced          ->      active,referenced
 464  */
 465 void mark_page_accessed(struct page *page)
 466 {
 467         if (!PageActive(page) && !PageUnevictable(page) &&
 468                         PageReferenced(page) && PageLRU(page)) {
 469                 activate_page(page);
 470                 ClearPageReferenced(page);
 471         } else if (!PageReferenced(page)) {
 472                 SetPageReferenced(page);
 473         }
 474 }
 475 EXPORT_SYMBOL(mark_page_accessed);
 476
 477 /*
 478  * Order of operations is important: flush the pagevec when it's already
 479  * full, not when adding the last page, to make sure that last page is
 480  * not added to the LRU directly when passed to this function. Because
 481  * mark_page_accessed() (called after this when writing) only activates
 482  * pages that are on the LRU, linear writes in subpage chunks would see
 483  * every PAGEVEC_SIZE page activated, which is unexpected.
 484  */
 485 void __lru_cache_add(struct page *page, enum lru_list lru)
 486 {
 487         struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
 488
 489         page_cache_get(page);
 490         if (!pagevec_space(pvec))
 491                 __pagevec_lru_add(pvec, lru);
 492         pagevec_add(pvec, page);
 493         put_cpu_var(lru_add_pvecs);
 494 }
 495 EXPORT_SYMBOL(__lru_cache_add);
 496
 497 /**
 498  * lru_cache_add_lru - add a page to a page list
 499  * @page: the page to be added to the LRU.
 500  * @lru: the LRU list to which the page is added.
 501  */
 502 void lru_cache_add_lru(struct page *page, enum lru_list lru)
 503 {
 504         if (PageActive(page)) {
 505                 VM_BUG_ON(PageUnevictable(page));
 506                 ClearPageActive(page);
 507         } else if (PageUnevictable(page)) {
 508                 VM_BUG_ON(PageActive(page));
 509                 ClearPageUnevictable(page);
 510         }
 511
 512         VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
 513         __lru_cache_add(page, lru);
 514 }
 515
 516 /**
 517  * add_page_to_unevictable_list - add a page to the unevictable list
 518  * @page:  the page to be added to the unevictable list
 519  *
 520  * Add page directly to its zone's unevictable list.  To avoid races with
 521  * tasks that might be making the page evictable, through eg. munlock,
 522  * munmap or exit, while it's not on the lru, we want to add the page
 523  * while it's locked or otherwise "invisible" to other tasks.  This is
 524  * difficult to do when using the pagevec cache, so bypass that.
 525  */
 526 void add_page_to_unevictable_list(struct page *page)
 527 {
 528         struct zone *zone = page_zone(page);
 529         struct lruvec *lruvec;
 530
 531         spin_lock_irq(&zone->lru_lock);
 532         lruvec = mem_cgroup_page_lruvec(page, zone);
 533         SetPageUnevictable(page);
 534         SetPageLRU(page);
 535         add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
 536         spin_unlock_irq(&zone->lru_lock);
 537 }
 538
 539 /*
 540  * If the page can not be invalidated, it is moved to the
 541  * inactive list to speed up its reclaim.  It is moved to the
 542  * head of the list, rather than the tail, to give the flusher
 543  * threads some time to write it out, as this is much more
 544  * effective than the single-page writeout from reclaim.
 545  *
 546  * If the page isn't page_mapped and dirty/writeback, the page
 547  * could reclaim asap using PG_reclaim.
 548  *
 549  * 1. active, mapped page -> none
 550  * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 551  * 3. inactive, mapped page -> none
 552  * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 553  * 5. inactive, clean -> inactive, tail
 554  * 6. Others -> none
 555  *
 556  * In 4, why it moves inactive's head, the VM expects the page would
 557  * be write it out by flusher threads as this is much more effective
 558  * than the single-page writeout from reclaim.
 559  */
 560 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
 561                               void *arg)
 562 {
 563         int lru, file;
 564         bool active;
 565
 566         if (!PageLRU(page))
 567                 return;
 568
 569         if (PageUnevictable(page))
 570                 return;
 571
 572         /* Some processes are using the page */
 573         if (page_mapped(page))
 574                 return;
 575
 576         active = PageActive(page);
 577         file = page_is_file_cache(page);
 578         lru = page_lru_base_type(page);
 579
 580         del_page_from_lru_list(page, lruvec, lru + active);
 581         ClearPageActive(page);
 582         ClearPageReferenced(page);
 583         add_page_to_lru_list(page, lruvec, lru);
 584
 585         if (PageWriteback(page) || PageDirty(page)) {
 586                 /*
 587                  * PG_reclaim could be raced with end_page_writeback
 588                  * It can make readahead confusing.  But race window
 589                  * is _really_ small and  it's non-critical problem.
 590                  */
 591                 SetPageReclaim(page);
 592         } else {
 593                 /*
 594                  * The page's writeback ends up during pagevec
 595                  * We moves tha page into tail of inactive.
 596                  */
 597                 list_move_tail(&page->lru, &lruvec->lists[lru]);
 598                 __count_vm_event(PGROTATED);
 599         }
 600
 601         if (active)
 602                 __count_vm_event(PGDEACTIVATE);
 603         update_page_reclaim_stat(lruvec, file, 0);
 604 }
 605
 606 /*
 607  * Drain pages out of the cpu's pagevecs.
 608  * Either "cpu" is the current CPU, and preemption has already been
 609  * disabled; or "cpu" is being hot-unplugged, and is already dead.
 610  */
 611 void lru_add_drain_cpu(int cpu)
 612 {
 613         struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
 614         struct pagevec *pvec;
 615         int lru;
 616
 617         for_each_lru(lru) {
 618                 pvec = &pvecs[lru - LRU_BASE];
 619                 if (pagevec_count(pvec))
 620                         __pagevec_lru_add(pvec, lru);
 621         }
 622
 623         pvec = &per_cpu(lru_rotate_pvecs, cpu);
 624         if (pagevec_count(pvec)) {
 625                 unsigned long flags;
 626
 627                 /* No harm done if a racing interrupt already did this */
 628                 local_irq_save(flags);
 629                 pagevec_move_tail(pvec);
 630                 local_irq_restore(flags);
 631         }
 632
 633         pvec = &per_cpu(lru_deactivate_pvecs, cpu);
 634         if (pagevec_count(pvec))
 635                 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
 636
 637         activate_page_drain(cpu);
 638 }
 639
 640 /**
 641  * deactivate_page - forcefully deactivate a page
 642  * @page: page to deactivate
 643  *
 644  * This function hints the VM that @page is a good reclaim candidate,
 645  * for example if its invalidation fails due to the page being dirty
 646  * or under writeback.
 647  */
 648 void deactivate_page(struct page *page)
 649 {
 650         /*
 651          * In a workload with many unevictable page such as mprotect, unevictable
 652          * page deactivation for accelerating reclaim is pointless.
 653          */
 654         if (PageUnevictable(page))
 655                 return;
 656
 657         if (likely(get_page_unless_zero(page))) {
 658                 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
 659
 660                 if (!pagevec_add(pvec, page))
 661                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
 662                 put_cpu_var(lru_deactivate_pvecs);
 663         }
 664 }
 665
 666 void lru_add_drain(void)
 667 {
 668         lru_add_drain_cpu(get_cpu());
 669         put_cpu();
 670 }
 671
 672 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 673 {
 674         lru_add_drain();
 675 }
 676
 677 /*
 678  * Returns 0 for success
 679  */
 680 int lru_add_drain_all(void)
 681 {
 682         return schedule_on_each_cpu(lru_add_drain_per_cpu);
 683 }
 684
 685 /*
 686  * Batched page_cache_release().  Decrement the reference count on all the
 687  * passed pages.  If it fell to zero then remove the page from the LRU and
 688  * free it.
 689  *
 690  * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 691  * for the remainder of the operation.
 692  *
 693  * The locking in this function is against shrink_inactive_list(): we recheck
 694  * the page count inside the lock to see whether shrink_inactive_list()
 695  * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 696  * will free it.
 697  */
 698 void release_pages(struct page **pages, int nr, int cold)
 699 {
 700         int i;
 701         LIST_HEAD(pages_to_free);
 702         struct zone *zone = NULL;
 703         struct lruvec *lruvec;
 704         unsigned long uninitialized_var(flags);
 705
 706         for (i = 0; i < nr; i++) {
 707                 struct page *page = pages[i];
 708
 709                 if (unlikely(PageCompound(page))) {
 710                         if (zone) {
 711                                 spin_unlock_irqrestore(&zone->lru_lock, flags);
 712                                 zone = NULL;
 713                         }
 714                         put_compound_page(page);
 715                         continue;
 716                 }
 717
 718                 if (!put_page_testzero(page))
 719                         continue;
 720
 721                 if (PageLRU(page)) {
 722                         struct zone *pagezone = page_zone(page);
 723
 724                         if (pagezone != zone) {
 725                                 if (zone)
 726                                         spin_unlock_irqrestore(&zone->lru_lock,
 727                                                                         flags);
 728                                 zone = pagezone;
 729                                 spin_lock_irqsave(&zone->lru_lock, flags);
 730                         }
 731
 732                         lruvec = mem_cgroup_page_lruvec(page, zone);
 733                         VM_BUG_ON(!PageLRU(page));
 734                         __ClearPageLRU(page);
 735                         del_page_from_lru_list(page, lruvec, page_off_lru(page));
 736                 }
 737
 738                 list_add(&page->lru, &pages_to_free);
 739         }
 740         if (zone)
 741                 spin_unlock_irqrestore(&zone->lru_lock, flags);
 742
 743         free_hot_cold_page_list(&pages_to_free, cold);
 744 }
 745 EXPORT_SYMBOL(release_pages);
 746
 747 /*
 748  * The pages which we're about to release may be in the deferred lru-addition
 749  * queues.  That would prevent them from really being freed right now.  That's
 750  * OK from a correctness point of view but is inefficient - those pages may be
 751  * cache-warm and we want to give them back to the page allocator ASAP.
 752  *
 753  * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 754  * and __pagevec_lru_add_active() call release_pages() directly to avoid
 755  * mutual recursion.
 756  */
 757 void __pagevec_release(struct pagevec *pvec)
 758 {
 759         lru_add_drain();
 760         release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
 761         pagevec_reinit(pvec);
 762 }
 763 EXPORT_SYMBOL(__pagevec_release);
 764
 765 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 766 /* used by __split_huge_page_refcount() */
 767 void lru_add_page_tail(struct page *page, struct page *page_tail,
 768                        struct lruvec *lruvec, struct list_head *list)
 769 {
 770         int uninitialized_var(active);
 771         enum lru_list lru;
 772         const int file = 0;
 773
 774         VM_BUG_ON(!PageHead(page));
 775         VM_BUG_ON(PageCompound(page_tail));
 776         VM_BUG_ON(PageLRU(page_tail));
 777         VM_BUG_ON(NR_CPUS != 1 &&
 778                   !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
 779
 780         if (!list)
 781                 SetPageLRU(page_tail);
 782
 783         if (page_evictable(page_tail)) {
 784                 if (PageActive(page)) {
 785                         SetPageActive(page_tail);
 786                         active = 1;
 787                         lru = LRU_ACTIVE_ANON;
 788                 } else {
 789                         active = 0;
 790                         lru = LRU_INACTIVE_ANON;
 791                 }
 792         } else {
 793                 SetPageUnevictable(page_tail);
 794                 lru = LRU_UNEVICTABLE;
 795         }
 796
 797         if (likely(PageLRU(page)))
 798                 list_add_tail(&page_tail->lru, &page->lru);
 799         else if (list) {
 800                 /* page reclaim is reclaiming a huge page */
 801                 get_page(page_tail);
 802                 list_add_tail(&page_tail->lru, list);
 803         } else {
 804                 struct list_head *list_head;
 805                 /*
 806                  * Head page has not yet been counted, as an hpage,
 807                  * so we must account for each subpage individually.
 808                  *
 809                  * Use the standard add function to put page_tail on the list,
 810                  * but then correct its position so they all end up in order.
 811                  */
 812                 add_page_to_lru_list(page_tail, lruvec, lru);
 813                 list_head = page_tail->lru.prev;
 814                 list_move_tail(&page_tail->lru, list_head);
 815         }
 816
 817         if (!PageUnevictable(page))
 818                 update_page_reclaim_stat(lruvec, file, active);
 819 }
 820 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 821
 822 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 823                                  void *arg)
 824 {
 825         enum lru_list lru = (enum lru_list)arg;
 826         int file = is_file_lru(lru);
 827         int active = is_active_lru(lru);
 828
 829         VM_BUG_ON(PageActive(page));
 830         VM_BUG_ON(PageUnevictable(page));
 831         VM_BUG_ON(PageLRU(page));
 832
 833         SetPageLRU(page);
 834         if (active)
 835                 SetPageActive(page);
 836         add_page_to_lru_list(page, lruvec, lru);
 837         update_page_reclaim_stat(lruvec, file, active);
 838 }
 839
 840 /*
 841  * Add the passed pages to the LRU, then drop the caller's refcount
 842  * on them.  Reinitialises the caller's pagevec.
 843  */
 844 void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 845 {
 846         VM_BUG_ON(is_unevictable_lru(lru));
 847
 848         pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
 849 }
 850 EXPORT_SYMBOL(__pagevec_lru_add);
 851
 852 /**
 853  * pagevec_lookup - gang pagecache lookup
 854  * @pvec:       Where the resulting pages are placed
 855  * @mapping:    The address_space to search
 856  * @start:      The starting page index
 857  * @nr_pages:   The maximum number of pages
 858  *
 859  * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 860  * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 861  * reference against the pages in @pvec.
 862  *
 863  * The search returns a group of mapping-contiguous pages with ascending
 864  * indexes.  There may be holes in the indices due to not-present pages.
 865  *
 866  * pagevec_lookup() returns the number of pages which were found.
 867  */
 868 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 869                 pgoff_t start, unsigned nr_pages)
 870 {
 871         pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 872         return pagevec_count(pvec);
 873 }
 874 EXPORT_SYMBOL(pagevec_lookup);
 875
 876 unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 877                 pgoff_t *index, int tag, unsigned nr_pages)
 878 {
 879         pvec->nr = find_get_pages_tag(mapping, index, tag,
 880                                         nr_pages, pvec->pages);
 881         return pagevec_count(pvec);
 882 }
 883 EXPORT_SYMBOL(pagevec_lookup_tag);
 884
 885 /*
 886  * Perform any setup for the swap system
 887  */
 888 void __init swap_setup(void)
 889 {
 890         unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
 891 #ifdef CONFIG_SWAP
 892         int i;
 893
 894         bdi_init(swapper_spaces[0].backing_dev_info);
 895         for (i = 0; i < MAX_SWAPFILES; i++) {
 896                 spin_lock_init(&swapper_spaces[i].tree_lock);
 897                 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
 898         }
 899 #endif
 900
 901         /* Use a smaller cluster for small-memory machines */
 902         if (megs < 16)
 903                 page_cluster = 2;
 904         else
 905                 page_cluster = 3;
 906         /*
 907          * Right now other parts of the system means that we
 908          * _really_ don't want to cluster much more
 909          */
 910 }