mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
If swap is backed by network storage such as NBD, there is a risk that a large number of reclaimers can hang the system by consuming all PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune min_free_kbytes in advance which is a bit fragile. This patch throttles direct reclaimers if half the PF_MEMALLOC reserves are in use. If the system is routinely getting throttled the system administrator can increase min_free_kbytes so degradation is smoother but the system will keep running. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: David Miller <davem@davemloft.net> Cc: Neil Brown <neilb@suse.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Christie <michaelc@cs.wisc.edu> Cc: Eric B Munson <emunson@mgebm.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> Cc: Mel Gorman <mgorman@suse.de> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
					parent
					
						
							
								7f338fe454
							
						
					
				
			
			
				commit
				
					
						5515061d22
					
				
			
		
					 3 changed files with 122 additions and 8 deletions
				
			
		| 
						 | 
					@ -705,6 +705,7 @@ typedef struct pglist_data {
 | 
				
			||||||
					     range, including holes */
 | 
										     range, including holes */
 | 
				
			||||||
	int node_id;
 | 
						int node_id;
 | 
				
			||||||
	wait_queue_head_t kswapd_wait;
 | 
						wait_queue_head_t kswapd_wait;
 | 
				
			||||||
 | 
						wait_queue_head_t pfmemalloc_wait;
 | 
				
			||||||
	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
 | 
						struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
 | 
				
			||||||
	int kswapd_max_order;
 | 
						int kswapd_max_order;
 | 
				
			||||||
	enum zone_type classzone_idx;
 | 
						enum zone_type classzone_idx;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4389,6 +4389,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 | 
				
			||||||
	pgdat_resize_init(pgdat);
 | 
						pgdat_resize_init(pgdat);
 | 
				
			||||||
	pgdat->nr_zones = 0;
 | 
						pgdat->nr_zones = 0;
 | 
				
			||||||
	init_waitqueue_head(&pgdat->kswapd_wait);
 | 
						init_waitqueue_head(&pgdat->kswapd_wait);
 | 
				
			||||||
 | 
						init_waitqueue_head(&pgdat->pfmemalloc_wait);
 | 
				
			||||||
	pgdat->kswapd_max_order = 0;
 | 
						pgdat->kswapd_max_order = 0;
 | 
				
			||||||
	pgdat_page_cgroup_init(pgdat);
 | 
						pgdat_page_cgroup_init(pgdat);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										128
									
								
								mm/vmscan.c
									
										
									
									
									
								
							
							
						
						
									
										128
									
								
								mm/vmscan.c
									
										
									
									
									
								
							| 
						 | 
					@ -2112,6 +2112,80 @@ out:
 | 
				
			||||||
	return 0;
 | 
						return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct zone *zone;
 | 
				
			||||||
 | 
						unsigned long pfmemalloc_reserve = 0;
 | 
				
			||||||
 | 
						unsigned long free_pages = 0;
 | 
				
			||||||
 | 
						int i;
 | 
				
			||||||
 | 
						bool wmark_ok;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for (i = 0; i <= ZONE_NORMAL; i++) {
 | 
				
			||||||
 | 
							zone = &pgdat->node_zones[i];
 | 
				
			||||||
 | 
							pfmemalloc_reserve += min_wmark_pages(zone);
 | 
				
			||||||
 | 
							free_pages += zone_page_state(zone, NR_FREE_PAGES);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						wmark_ok = free_pages > pfmemalloc_reserve / 2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* kswapd must be awake if processes are being throttled */
 | 
				
			||||||
 | 
						if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
 | 
				
			||||||
 | 
							pgdat->classzone_idx = min(pgdat->classzone_idx,
 | 
				
			||||||
 | 
											(enum zone_type)ZONE_NORMAL);
 | 
				
			||||||
 | 
							wake_up_interruptible(&pgdat->kswapd_wait);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return wmark_ok;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * Throttle direct reclaimers if backing storage is backed by the network
 | 
				
			||||||
 | 
					 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 | 
				
			||||||
 | 
					 * depleted. kswapd will continue to make progress and wake the processes
 | 
				
			||||||
 | 
					 * when the low watermark is reached
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 | 
				
			||||||
 | 
										nodemask_t *nodemask)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct zone *zone;
 | 
				
			||||||
 | 
						int high_zoneidx = gfp_zone(gfp_mask);
 | 
				
			||||||
 | 
						pg_data_t *pgdat;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * Kernel threads should not be throttled as they may be indirectly
 | 
				
			||||||
 | 
						 * responsible for cleaning pages necessary for reclaim to make forward
 | 
				
			||||||
 | 
						 * progress. kjournald for example may enter direct reclaim while
 | 
				
			||||||
 | 
						 * committing a transaction where throttling it could force other
 | 
				
			||||||
 | 
						 * processes to block on log_wait_commit().
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (current->flags & PF_KTHREAD)
 | 
				
			||||||
 | 
							return;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* Check if the pfmemalloc reserves are ok */
 | 
				
			||||||
 | 
						first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
 | 
				
			||||||
 | 
						pgdat = zone->zone_pgdat;
 | 
				
			||||||
 | 
						if (pfmemalloc_watermark_ok(pgdat))
 | 
				
			||||||
 | 
							return;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * If the caller cannot enter the filesystem, it's possible that it
 | 
				
			||||||
 | 
						 * is due to the caller holding an FS lock or performing a journal
 | 
				
			||||||
 | 
						 * transaction in the case of a filesystem like ext[3|4]. In this case,
 | 
				
			||||||
 | 
						 * it is not safe to block on pfmemalloc_wait as kswapd could be
 | 
				
			||||||
 | 
						 * blocked waiting on the same lock. Instead, throttle for up to a
 | 
				
			||||||
 | 
						 * second before continuing.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (!(gfp_mask & __GFP_FS)) {
 | 
				
			||||||
 | 
							wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
 | 
				
			||||||
 | 
								pfmemalloc_watermark_ok(pgdat), HZ);
 | 
				
			||||||
 | 
							return;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* Throttle until kswapd wakes the process */
 | 
				
			||||||
 | 
						wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
 | 
				
			||||||
 | 
							pfmemalloc_watermark_ok(pgdat));
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 | 
					unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 | 
				
			||||||
				gfp_t gfp_mask, nodemask_t *nodemask)
 | 
									gfp_t gfp_mask, nodemask_t *nodemask)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
| 
						 | 
					@ -2131,6 +2205,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 | 
				
			||||||
		.gfp_mask = sc.gfp_mask,
 | 
							.gfp_mask = sc.gfp_mask,
 | 
				
			||||||
	};
 | 
						};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * Do not enter reclaim if fatal signal is pending. 1 is returned so
 | 
				
			||||||
 | 
						 * that the page allocator does not consider triggering OOM
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (fatal_signal_pending(current))
 | 
				
			||||||
 | 
							return 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	trace_mm_vmscan_direct_reclaim_begin(order,
 | 
						trace_mm_vmscan_direct_reclaim_begin(order,
 | 
				
			||||||
				sc.may_writepage,
 | 
									sc.may_writepage,
 | 
				
			||||||
				gfp_mask);
 | 
									gfp_mask);
 | 
				
			||||||
| 
						 | 
					@ -2275,8 +2358,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 | 
				
			||||||
	return balanced_pages >= (present_pages >> 2);
 | 
						return balanced_pages >= (present_pages >> 2);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* is kswapd sleeping prematurely? */
 | 
					/*
 | 
				
			||||||
static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 | 
					 * Prepare kswapd for sleeping. This verifies that there are no processes
 | 
				
			||||||
 | 
					 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Returns true if kswapd is ready to sleep
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 | 
				
			||||||
					int classzone_idx)
 | 
										int classzone_idx)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	int i;
 | 
						int i;
 | 
				
			||||||
| 
						 | 
					@ -2285,7 +2373,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 | 
						/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 | 
				
			||||||
	if (remaining)
 | 
						if (remaining)
 | 
				
			||||||
		return true;
 | 
							return false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * There is a potential race between when kswapd checks its watermarks
 | 
				
			||||||
 | 
						 * and a process gets throttled. There is also a potential race if
 | 
				
			||||||
 | 
						 * processes get throttled, kswapd wakes, a large process exits thereby
 | 
				
			||||||
 | 
						 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
 | 
				
			||||||
 | 
						 * is going to sleep, no process should be sleeping on pfmemalloc_wait
 | 
				
			||||||
 | 
						 * so wake them now if necessary. If necessary, processes will wake
 | 
				
			||||||
 | 
						 * kswapd and get throttled again
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
 | 
				
			||||||
 | 
							wake_up(&pgdat->pfmemalloc_wait);
 | 
				
			||||||
 | 
							return false;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Check the watermark levels */
 | 
						/* Check the watermark levels */
 | 
				
			||||||
	for (i = 0; i <= classzone_idx; i++) {
 | 
						for (i = 0; i <= classzone_idx; i++) {
 | 
				
			||||||
| 
						 | 
					@ -2318,9 +2420,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 | 
				
			||||||
	 * must be balanced
 | 
						 * must be balanced
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (order)
 | 
						if (order)
 | 
				
			||||||
		return !pgdat_balanced(pgdat, balanced, classzone_idx);
 | 
							return pgdat_balanced(pgdat, balanced, classzone_idx);
 | 
				
			||||||
	else
 | 
						else
 | 
				
			||||||
		return !all_zones_ok;
 | 
							return all_zones_ok;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
| 
						 | 
					@ -2546,6 +2648,16 @@ loop_again:
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							/*
 | 
				
			||||||
 | 
							 * If the low watermark is met there is no need for processes
 | 
				
			||||||
 | 
							 * to be throttled on pfmemalloc_wait as they should now be
 | 
				
			||||||
 | 
							 * able to safely make forward progress. Wake them
 | 
				
			||||||
 | 
							 */
 | 
				
			||||||
 | 
							if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
 | 
				
			||||||
 | 
									pfmemalloc_watermark_ok(pgdat))
 | 
				
			||||||
 | 
								wake_up(&pgdat->pfmemalloc_wait);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 | 
							if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 | 
				
			||||||
			break;		/* kswapd: all done */
 | 
								break;		/* kswapd: all done */
 | 
				
			||||||
		/*
 | 
							/*
 | 
				
			||||||
| 
						 | 
					@ -2647,7 +2759,7 @@ out:
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * Return the order we were reclaiming at so sleeping_prematurely()
 | 
						 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 | 
				
			||||||
	 * makes a decision on the order we were last reclaiming at. However,
 | 
						 * makes a decision on the order we were last reclaiming at. However,
 | 
				
			||||||
	 * if another caller entered the allocator slow path while kswapd
 | 
						 * if another caller entered the allocator slow path while kswapd
 | 
				
			||||||
	 * was awake, order will remain at the higher level
 | 
						 * was awake, order will remain at the higher level
 | 
				
			||||||
| 
						 | 
					@ -2667,7 +2779,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 | 
				
			||||||
	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 | 
						prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Try to sleep for a short interval */
 | 
						/* Try to sleep for a short interval */
 | 
				
			||||||
	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
 | 
						if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 | 
				
			||||||
		remaining = schedule_timeout(HZ/10);
 | 
							remaining = schedule_timeout(HZ/10);
 | 
				
			||||||
		finish_wait(&pgdat->kswapd_wait, &wait);
 | 
							finish_wait(&pgdat->kswapd_wait, &wait);
 | 
				
			||||||
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 | 
							prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 | 
				
			||||||
| 
						 | 
					@ -2677,7 +2789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 | 
				
			||||||
	 * After a short sleep, check if it was a premature sleep. If not, then
 | 
						 * After a short sleep, check if it was a premature sleep. If not, then
 | 
				
			||||||
	 * go fully to sleep until explicitly woken up.
 | 
						 * go fully to sleep until explicitly woken up.
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
 | 
						if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 | 
				
			||||||
		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 | 
							trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/*
 | 
							/*
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue