Merge tag 'cleanup-for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/arm...

[firefly-linux-kernel-4.4.55.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 20d17f8266fed9e482055c5c68034b8149f3329c..18cee0d4c8a20705a4b3e7dd73e7d1d8a8b8d595 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
         int migratetype = 0;
         int batch_free = 0;
         int to_free = count;
+       unsigned long nr_scanned;
  
         spin_lock(&zone->lock);
-       zone->pages_scanned = 0;
+       nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+       if (nr_scanned)
+               __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
  
         while (to_free) {
                 struct page *page;
@@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone,
                                 unsigned int order,
                                 int migratetype)
  {
+       unsigned long nr_scanned;
         spin_lock(&zone->lock);
-       zone->pages_scanned = 0;
+       nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+       if (nr_scanned)
+               __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
  
         __free_one_page(page, pfn, zone, order, migratetype);
         if (unlikely(!is_migrate_isolate(migratetype)))
@@ -816,9 +822,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
                 set_page_count(p, 0);
         } while (++p, --i);
  
-       set_page_refcounted(page);
         set_pageblock_migratetype(page, MIGRATE_CMA);
-       __free_pages(page, pageblock_order);
+
+       if (pageblock_order >= MAX_ORDER) {
+               i = pageblock_nr_pages;
+               p = page;
+               do {
+                       set_page_refcounted(p);
+                       __free_pages(p, MAX_ORDER - 1);
+                       p += MAX_ORDER_NR_PAGES;
+               } while (i -= MAX_ORDER_NR_PAGES);
+       } else {
+               set_page_refcounted(page);
+               __free_pages(page, pageblock_order);
+       }
+
         adjust_managed_page_count(page, pageblock_nr_pages);
  }
  #endif
@@ -1245,15 +1263,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  {
         unsigned long flags;
-       int to_drain;
-       unsigned long batch;
+       int to_drain, batch;
  
         local_irq_save(flags);
         batch = ACCESS_ONCE(pcp->batch);
-       if (pcp->count >= batch)
-               to_drain = batch;
-       else
-               to_drain = pcp->count;
+       to_drain = min(pcp->count, batch);
         if (to_drain > 0) {
                 free_pcppages_bulk(zone, to_drain, pcp);
                 pcp->count -= to_drain;
@@ -1598,6 +1612,9 @@ again:
         }
  
         __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+       if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
+           !zone_is_fair_depleted(zone))
+               zone_set_flag(zone, ZONE_FAIR_DEPLETED);
  
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
         zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1700,7 +1717,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
  {
         /* free_pages my go negative - that's OK */
         long min = mark;
-       long lowmem_reserve = z->lowmem_reserve[classzone_idx];
         int o;
         long free_cma = 0;
  
@@ -1715,7 +1731,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
                 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
  #endif
  
-       if (free_pages - free_cma <= min + lowmem_reserve)
+       if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
                 return false;
         for (o = 0; o < order; o++) {
                 /* At the next order, this order's pages become unavailable */
@@ -1910,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  
  #endif /* CONFIG_NUMA */
  
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+       struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+       do {
+               mod_zone_page_state(zone, NR_ALLOC_BATCH,
+                       high_wmark_pages(zone) - low_wmark_pages(zone) -
+                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+               zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+       } while (zone++ != preferred_zone);
+}
+
  /*
   * get_page_from_freelist goes through the zonelist trying to allocate
   * a page.
@@ -1927,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
         int did_zlc_setup = 0;          /* just call zlc_setup() one time */
         bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
                                 (gfp_mask & __GFP_WRITE);
+       int nr_fair_skipped = 0;
+       bool zonelist_rescan;
  
  zonelist_scan:
+       zonelist_rescan = false;
+
         /*
          * Scan zonelist, looking for a zone with enough free.
          * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1952,9 +1984,11 @@ zonelist_scan:
                  */
                 if (alloc_flags & ALLOC_FAIR) {
                         if (!zone_local(preferred_zone, zone))
+                               break;
+                       if (zone_is_fair_depleted(zone)) {
+                               nr_fair_skipped++;
                                 continue;
-                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-                               continue;
+                       }
                 }
                 /*
                  * When allocating a page cache page for writing, we
@@ -2060,13 +2094,7 @@ this_zone_full:
                         zlc_mark_zone_full(zonelist, z);
         }
  
-       if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
-               /* Disable zlc cache for second zonelist scan */
-               zlc_active = 0;
-               goto zonelist_scan;
-       }
-
-       if (page)
+       if (page) {
                 /*
                  * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
                  * necessary to allocate the page. The expectation is
@@ -2075,8 +2103,37 @@ this_zone_full:
                  * for !PFMEMALLOC purposes.
                  */
                 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+               return page;
+       }
  
-       return page;
+       /*
+        * The first pass makes sure allocations are spread fairly within the
+        * local node.  However, the local node might have free pages left
+        * after the fairness batches are exhausted, and remote zones haven't
+        * even been considered yet.  Try once more without fairness, and
+        * include remote zones now, before entering the slowpath and waking
+        * kswapd: prefer spilling to a remote zone over swapping locally.
+        */
+       if (alloc_flags & ALLOC_FAIR) {
+               alloc_flags &= ~ALLOC_FAIR;
+               if (nr_fair_skipped) {
+                       zonelist_rescan = true;
+                       reset_alloc_batches(preferred_zone);
+               }
+               if (nr_online_nodes > 1)
+                       zonelist_rescan = true;
+       }
+
+       if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+               /* Disable zlc cache for second zonelist scan */
+               zlc_active = 0;
+               zonelist_rescan = true;
+       }
+
+       if (zonelist_rescan)
+               goto zonelist_scan;
+
+       return NULL;
  }
  
  /*
@@ -2189,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
  {
         struct page *page;
  
-       /* Acquire the OOM killer lock for the zones in zonelist */
-       if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
+       /* Acquire the per-zone oom lock for each zone */
+       if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
                 schedule_timeout_uninterruptible(1);
                 return NULL;
         }
@@ -2228,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
         out_of_memory(zonelist, gfp_mask, order, nodemask, false);
  
  out:
-       clear_zonelist_oom(zonelist, gfp_mask);
+       oom_zonelist_unlock(zonelist, gfp_mask);
         return page;
  }
  
@@ -2397,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
         return page;
  }
  
-static void reset_alloc_batches(struct zonelist *zonelist,
-                               enum zone_type high_zoneidx,
-                               struct zone *preferred_zone)
-{
-       struct zoneref *z;
-       struct zone *zone;
-
-       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-               /*
-                * Only reset the batches of zones that were actually
-                * considered in the fairness pass, we don't want to
-                * trash fairness information for zones that are not
-                * actually part of this zonelist's round-robin cycle.
-                */
-               if (!zone_local(preferred_zone, zone))
-                       continue;
-               mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                       high_wmark_pages(zone) - low_wmark_pages(zone) -
-                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-       }
-}
-
  static void wake_all_kswapds(unsigned int order,
                              struct zonelist *zonelist,
                              enum zone_type high_zoneidx,
@@ -2435,7 +2470,7 @@ static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
         int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
  
         /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2444,20 +2479,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
          * The caller may dip into page reserves a bit more if the caller
          * cannot run direct reclaim, or if the caller has realtime scheduling
          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
          */
         alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
-       if (!wait) {
+       if (atomic) {
                 /*
-                * Not worth trying to allocate harder for
-                * __GFP_NOMEMALLOC even if it can't schedule.
+                * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+                * if it can't schedule.
                  */
-               if  (!(gfp_mask & __GFP_NOMEMALLOC))
+               if (!(gfp_mask & __GFP_NOMEMALLOC))
                         alloc_flags |= ALLOC_HARDER;
                 /*
-                * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
-                * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+                * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+                * comment for __cpuset_node_allowed_softwall().
                  */
                 alloc_flags &= ~ALLOC_CPUSET;
         } else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -2603,14 +2638,6 @@ rebalance:
         if (page)
                 goto got_pg;
  
-       /*
-        * It can become very expensive to allocate transparent hugepages at
-        * fault, so use asynchronous memory compaction for THP unless it is
-        * khugepaged trying to collapse.
-        */
-       if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
-               migration_mode = MIGRATE_SYNC_LIGHT;
-
         /*
          * If compaction is deferred for high-order allocations, it is because
          * sync compaction recently failed. In this is the case and the caller
@@ -2621,6 +2648,15 @@ rebalance:
                                                 (gfp_mask & __GFP_NO_KSWAPD))
                 goto nopage;
  
+       /*
+        * It can become very expensive to allocate transparent hugepages at
+        * fault, so use asynchronous memory compaction for THP unless it is
+        * khugepaged trying to collapse.
+        */
+       if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
+                                               (current->flags & PF_KTHREAD))
+               migration_mode = MIGRATE_SYNC_LIGHT;
+
         /* Try direct reclaim and then allocating */
         page = __alloc_pages_direct_reclaim(gfp_mask, order,
                                         zonelist, high_zoneidx,
@@ -2754,28 +2790,11 @@ retry_cpuset:
         if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                 alloc_flags |= ALLOC_CMA;
  #endif
-retry:
         /* First allocation attempt */
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                         zonelist, high_zoneidx, alloc_flags,
                         preferred_zone, classzone_idx, migratetype);
         if (unlikely(!page)) {
-               /*
-                * The first pass makes sure allocations are spread
-                * fairly within the local node.  However, the local
-                * node might have free pages left after the fairness
-                * batches are exhausted, and remote zones haven't
-                * even been considered yet.  Try once more without
-                * fairness, and include remote zones now, before
-                * entering the slowpath and waking kswapd: prefer
-                * spilling to a remote zone over swapping locally.
-                */
-               if (alloc_flags & ALLOC_FAIR) {
-                       reset_alloc_batches(zonelist, high_zoneidx,
-                                           preferred_zone);
-                       alloc_flags &= ~ALLOC_FAIR;
-                       goto retry;
-               }
                 /*
                  * Runtime PM, block IO and its error handling path
                  * can deadlock because I/O on the device might not
@@ -2950,7 +2969,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
   * Note this is not alloc_pages_exact_node() which allocates on a specific node,
   * but is not exact.
   */
-void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
  {
         unsigned order = get_order(size);
         struct page *p = alloc_pages_node(nid, gfp_mask, order);
@@ -2958,7 +2977,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
                 return NULL;
         return make_alloc_exact((unsigned long)page_address(p), order, size);
  }
-EXPORT_SYMBOL(alloc_pages_exact_nid);
  
  /**
   * free_pages_exact - release memory allocated via alloc_pages_exact()
@@ -3040,7 +3058,7 @@ static inline void show_node(struct zone *zone)
  void si_meminfo(struct sysinfo *val)
  {
         val->totalram = totalram_pages;
-       val->sharedram = 0;
+       val->sharedram = global_page_state(NR_SHMEM);
         val->freeram = global_page_state(NR_FREE_PAGES);
         val->bufferram = nr_blockdev_pages();
         val->totalhigh = totalhigh_pages;
@@ -3060,6 +3078,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
         for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
                 managed_pages += pgdat->node_zones[zone_type].managed_pages;
         val->totalram = managed_pages;
+       val->sharedram = node_page_state(nid, NR_SHMEM);
         val->freeram = node_page_state(nid, NR_FREE_PAGES);
  #ifdef CONFIG_HIGHMEM
         val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3241,12 +3260,12 @@ void show_free_areas(unsigned int filter)
                         K(zone_page_state(zone, NR_BOUNCE)),
                         K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
                         K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
-                       zone->pages_scanned,
+                       K(zone_page_state(zone, NR_PAGES_SCANNED)),
                         (!zone_reclaimable(zone) ? "yes" : "no")
                         );
                 printk("lowmem_reserve[]:");
                 for (i = 0; i < MAX_NR_ZONES; i++)
-                       printk(" %lu", zone->lowmem_reserve[i]);
+                       printk(" %ld", zone->lowmem_reserve[i]);
                 printk("\n");
         }
  
@@ -5567,7 +5586,7 @@ static void calculate_totalreserve_pages(void)
         for_each_online_pgdat(pgdat) {
                 for (i = 0; i < MAX_NR_ZONES; i++) {
                         struct zone *zone = pgdat->node_zones + i;
-                       unsigned long max = 0;
+                       long max = 0;
  
                         /* Find valid and maximum lowmem_reserve in the zone */
                         for (j = i; j < MAX_NR_ZONES; j++) {
@@ -6050,11 +6069,13 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
  }
  
  /**
- * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
   * @page: The page within the block of interest
- * @start_bitidx: The first bit of interest to retrieve
- * @end_bitidx: The last bit of interest
- * returns pageblock_bits flags
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
   */
  unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
                                         unsigned long end_bitidx,
@@ -6079,9 +6100,10 @@ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
  /**
   * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
   * @page: The page within the block of interest
- * @start_bitidx: The first bit of interest
- * @end_bitidx: The last bit of interest
   * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
   */
  void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
                                         unsigned long pfn,