else
new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
- trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+ trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
return freed;
}
list_splice(&pages_to_free, page_list);
}
+/*
+ * If a kernel thread (such as nfsd for loop-back mounts) services
+ * a backing device by writing to the page cache, it sets PF_LESS_THROTTLE.
+ * In that case we should only throttle if the backing device it is
+ * writing to is congested. In other cases it is safe to throttle.
+ */
+static int current_may_throttle(void)
+{
+ return !(current->flags & PF_LESS_THROTTLE) ||
+ current->backing_dev_info == NULL ||
+ bdi_write_congested(current->backing_dev_info);
+}
+
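For context, a minimal sketch of the caller side this helper assumes. The function below is hypothetical and not part of the diff; the flag and backing_dev_info handling mirror how write paths of this era advertised the device being written to.

/*
 * Hypothetical kernel-thread write path: it relaxes dirty throttling and
 * advertises the backing device it writes to, which is what
 * current_may_throttle() inspects above. Real code would save and restore
 * current->flags rather than clear the bit unconditionally.
 */
static void example_service_one_request(struct file *file)
{
	current->flags |= PF_LESS_THROTTLE;
	current->backing_dev_info = file->f_mapping->backing_dev_info;

	/* ... dirty and write back page cache pages for the request ... */

	current->backing_dev_info = NULL;
	current->flags &= ~PF_LESS_THROTTLE;
}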
/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
* implies that pages are cycling through the LRU faster than
* they are written so also forcibly stall.
*/
- if (nr_unqueued_dirty == nr_taken || nr_immediate)
+ if ((nr_unqueued_dirty == nr_taken || nr_immediate) &&
+ current_may_throttle())
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
* is congested. Allow kswapd to continue until it starts encountering
* unqueued dirty pages or cycling through the LRU too quickly.
*/
- if (!sc->hibernation_mode && !current_is_kswapd())
+ if (!sc->hibernation_mode && !current_is_kswapd() &&
+ current_may_throttle())
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
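A brief note on the two stall sites gated above: congestion_wait(BLK_RW_ASYNC, HZ/10) always sleeps for up to the timeout, whereas wait_iff_congested() only sleeps when the zone is flagged as congested and a backing device is actually write-congested, and otherwise just yields the CPU. With these hunks, a PF_LESS_THROTTLE writer whose own backing device is uncongested is exempted from both.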
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
+ bool some_scanned;
+ int pass;
/*
* If the zone or memcg is small, nr[l] can be 0. This
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
- unsigned long size;
- unsigned long scan;
+ some_scanned = false;
+ /* Only use force_scan on the second pass. */
+ for (pass = 0; !some_scanned && pass < 2; pass++) {
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long size;
+ unsigned long scan;
- size = get_lru_size(lruvec, lru);
- scan = size >> sc->priority;
+ size = get_lru_size(lruvec, lru);
+ scan = size >> sc->priority;
- if (!scan && force_scan)
- scan = min(size, SWAP_CLUSTER_MAX);
+ if (!scan && pass && force_scan)
+ scan = min(size, SWAP_CLUSTER_MAX);
- switch (scan_balance) {
- case SCAN_EQUAL:
- /* Scan lists relative to size */
- break;
- case SCAN_FRACT:
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
+ /*
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
+ */
+ scan = div64_u64(scan * fraction[file],
+ denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file)
+ scan = 0;
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
+ }
+ nr[lru] = scan;
/*
- * Scan types proportional to swappiness and
- * their relative recent reclaim efficiency.
+ * Skip the second pass and don't force_scan
+ * if we found something to scan.
*/
- scan = div64_u64(scan * fraction[file], denominator);
- break;
- case SCAN_FILE:
- case SCAN_ANON:
- /* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file)
- scan = 0;
- break;
- default:
- /* Look ma, no brain */
- BUG();
+ some_scanned |= !!scan;
}
- nr[lru] = scan;
}
}
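The net effect of the restructuring above is that the SWAP_CLUSTER_MAX floor is applied only when the proportional first pass produced zero for every evictable LRU. A stripped-down sketch of that control flow, with the scan_balance cases elided and hypothetical names, assuming the kernel's min_t() and SWAP_CLUSTER_MAX:

/* Editor's sketch, not part of the diff: two-pass scan target calculation. */
static void example_two_pass_targets(unsigned long size[], unsigned long nr[],
				     int nr_lrus, int priority, bool force_scan)
{
	bool some_scanned = false;
	int pass, lru;

	for (pass = 0; !some_scanned && pass < 2; pass++) {
		for (lru = 0; lru < nr_lrus; lru++) {
			unsigned long scan = size[lru] >> priority;

			/* The forced minimum only applies on the second pass */
			if (!scan && pass && force_scan)
				scan = min_t(unsigned long, size[lru],
					     SWAP_CLUSTER_MAX);

			nr[lru] = scan;
			/* Any non-zero target suppresses the second pass */
			some_scanned |= !!scan;
		}
	}
}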
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
+ if (!populated_zone(zone))
+ continue;
+
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
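To make the watermark test concrete with hypothetical numbers: if the populated zones up to ZONE_NORMAL on a node contribute min watermarks of 128 and 1920 pages, pfmemalloc_reserve is 2048 and wmark_ok stays true only while those zones together hold more than 1024 free pages. Once free pages fall to half the reserve or below, direct reclaimers become candidates for throttling and kswapd is expected to be awake.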
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
+ struct zoneref *z;
struct zone *zone;
- int high_zoneidx = gfp_zone(gfp_mask);
- pg_data_t *pgdat;
+ pg_data_t *pgdat = NULL;
/*
* Kernel threads should not be throttled as they may be indirectly
if (fatal_signal_pending(current))
goto out;
- /* Check if the pfmemalloc reserves are ok */
- first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
- pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ /*
+ * Check if the pfmemalloc reserves are ok by finding the first node
+ * with a usable ZONE_NORMAL or lower zone. The expectation is that
+ * GFP_KERNEL will be required for allocating network buffers when
+ * swapping over the network so ZONE_HIGHMEM is unusable.
+ *
+ * Throttling is based on the first usable node and throttled processes
+ * wait on a queue until kswapd makes progress and wakes them. There
+ * is then an affinity between processes waking up and where reclaim
+ * progress has been made, assuming the process wakes on the same node.
+ * More importantly, processes running on remote nodes will not compete
+ * for remote pfmemalloc reserves and processes on different nodes
+ * should make reasonable progress.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_mask, nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
goto out;
/* Account for the throttling */
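For reference, an abbreviated sketch of the wait/wake pairing the comment above describes, condensed from the surrounding vmscan.c of this era; the real throttle_direct_reclaim() also handles callers without __GFP_FS using a timed wait.

/* Direct reclaimer: sleep on the chosen node's queue until woken */
wait_event_killable(pgdat->pfmemalloc_wait,
		    pfmemalloc_watermark_ok(pgdat));

/* kswapd (balance_pgdat): wake throttled tasks once reserves recover */
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
    pfmemalloc_watermark_ok(pgdat))
	wake_up(&pgdat->pfmemalloc_wait);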
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold lock_memory_hotplug().
+ * hold mem_hotplug_begin/end().
*/
void kswapd_stop(int nid)
{