memcg: coalesce uncharge during unmap/truncate
In a massively parallel environment, res_counter can become a performance
bottleneck.  One strong technique for reducing lock contention is to reduce
the number of calls by coalescing several calls into one.

Considering the charge/uncharge characteristics:

  - charge is done one by one via demand paging.
  - uncharge is done
      - in chunks at munmap, truncate, exit, execve...
      - one by one via vmscan/paging.

So we have a chance to coalesce uncharges and improve scalability at
unmap/truncate time.

This patch coalesces uncharges.  To avoid scattering memcg-specific structures
into functions under /mm, it adds the batch-uncharge information to the task.
The reason for per-task batching is to make use of the caller's context: we do
batched (delayed) uncharge when truncation/unmap occurs, but direct uncharge
when the uncharge is called by memory reclaim (vmscan.c).

The degree of coalescing depends on the caller:
  - at invalidate/truncate...  the pagevec size
  - at unmap.................  ZAP_BLOCK_SIZE
(memory itself is freed in chunks of this size), so we will not coalesce too
much.

On an x86-64 8-CPU server, I tested memcg's overhead at page fault by running
a program which does map/fault/unmap in a loop, one task per CPU via taskset,
and summing the number of page faults over 60 seconds.

[without memcg config]
  40156968  page-faults   # 0.085 M/sec   ( +- 0.046% )
  27.67  cache-miss/faults

[root cgroup]
  36659599  page-faults   # 0.077 M/sec   ( +- 0.247% )
  31.58  miss/faults

[in a child cgroup]
  18444157  page-faults   # 0.039 M/sec   ( +- 0.133% )
  69.96  miss/faults

[child with this patch]
  27133719  page-faults   # 0.057 M/sec   ( +- 0.155% )
  47.16  miss/faults

We can see a fair amount of improvement.  (The root cgroup is not affected by
this patch.)  Another patch, for "charge", will follow this one and improve
the numbers above further.

Changelog (since 2009/10/02):
  - renamed fields of memcg_batch (pages to bytes, memsw to memsw_bytes)
  - some cleanups and comment/description updates.
  - added initialization code to copy_process(). (possible bug fix)

Changelog (old):
  - fixed the !CONFIG_MEM_CGROUP case.
  - rebased onto the latest mmotm + softlimit fix patches.
  - unified the patch for callers.
  - added comments.
  - made ->do_batch a bool.
  - removed css_get() et al.; we don't need it.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
commit 569b846df5 (parent cd9b45b78a)
6 changed files with 123 additions and 6 deletions
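Before the diff, a minimal sketch of the caller pattern the new interface expects, modeled on the truncate/invalidate call sites in the patch.  This is an illustration only; free_one_cache_page() is a hypothetical stand-in for the real per-page work (e.g. dropping a page from the page cache) and is not a function added by this patch.

#include <linux/memcontrol.h>
#include <linux/pagevec.h>

static void free_one_cache_page(struct page *page);	/* hypothetical per-page work */

static void drop_pages_batched(struct pagevec *pvec)
{
	int i;

	mem_cgroup_uncharge_start();	/* nestable: current->memcg_batch.do_batch++ */
	for (i = 0; i < pagevec_count(pvec); i++)
		/*
		 * While do_batch > 0, each page's uncharge only adds PAGE_SIZE
		 * to current->memcg_batch.bytes instead of taking the
		 * res_counter lock.
		 */
		free_one_cache_page(pvec->pages[i]);
	mem_cgroup_uncharge_end();	/* flush: at most one res_counter_uncharge() per counter */
}

Each caller already bounds how many pages it frees between start and end (a pagevec at truncate/invalidate, ZAP_BLOCK_SIZE at unmap), so the batch stays small.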
include/linux/memcontrol.h

@@ -54,6 +54,11 @@ extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
 extern void mem_cgroup_del_lru(struct page *page);
 extern void mem_cgroup_move_lists(struct page *page,
 				  enum lru_list from, enum lru_list to);
+
+/* For coalescing uncharges, to reduce memcg overhead */
+extern void mem_cgroup_uncharge_start(void);
+extern void mem_cgroup_uncharge_end(void);
+
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
 extern int mem_cgroup_shmem_charge_fallback(struct page *page,
@@ -151,6 +156,14 @@ static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
 {
 }
 
+static inline void mem_cgroup_uncharge_start(void)
+{
+}
+
+static inline void mem_cgroup_uncharge_end(void)
+{
+}
+
 static inline void mem_cgroup_uncharge_page(struct page *page)
 {
 }
include/linux/sched.h

@@ -1544,6 +1544,14 @@ struct task_struct {
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
 	unsigned long stack_start;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
+	struct memcg_batch_info {
+		int do_batch;	/* incremented when batch uncharge started */
+		struct mem_cgroup *memcg; /* target memcg of uncharge */
+		unsigned long bytes;		/* uncharged usage */
+		unsigned long memsw_bytes; /* uncharged mem+swap usage */
+	} memcg_batch;
+#endif
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
kernel/fork.c

@@ -1127,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	p->memcg_batch.do_batch = 0;
+	p->memcg_batch.memcg = NULL;
+#endif
 
 	p->bts = NULL;
 
mm/memcontrol.c

@@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 	css_put(&mem->css);
 }
 
+static void
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+{
+	struct memcg_batch_info *batch = NULL;
+	bool uncharge_memsw = true;
+	/* If swapout, usage of swap doesn't decrease */
+	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+		uncharge_memsw = false;
+	/*
+	 * do_batch > 0 when unmapping pages or during inode invalidate/truncate.
+	 * In those cases, all pages freed in a row can be expected to be in
+	 * the same cgroup, so we have a chance to coalesce uncharges.
+	 * But we uncharge one by one if the task was killed by OOM (TIF_MEMDIE),
+	 * because we want to uncharge as soon as possible.
+	 */
+	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
+	batch = &current->memcg_batch;
+	/*
+	 * Usually we do css_get() when we remember a memcg pointer.
+	 * But in this case we keep res->usage until the end of a series of
+	 * uncharges, so it is ok to ignore memcg's refcnt.
+	 */
+	if (!batch->memcg)
+		batch->memcg = mem;
+	/*
+	 * In the typical case, batch->memcg == mem.  This means we can
+	 * merge a series of uncharges into one uncharge of the res_counter.
+	 * If not, we uncharge the res_counter one by one.
+	 */
+	if (batch->memcg != mem)
+		goto direct_uncharge;
+	/* remember the freed charge and uncharge it later */
+	batch->bytes += PAGE_SIZE;
+	if (uncharge_memsw)
+		batch->memsw_bytes += PAGE_SIZE;
+	return;
+direct_uncharge:
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (uncharge_memsw)
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	return;
+}
+
 /*
  * uncharge if !page_mapped(page)
  */
@@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account &&
-				(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
@@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
+/*
+ * Batch_start/batch_end is called from unmap_page_range() and the inode
+ * invalidate/truncate paths.  In those cases pages are freed continuously
+ * and can be expected to be in the same memcg.  Each of those callers
+ * already limits the number of pages freed at once, and this pair may be
+ * called more than once (nested) in a single context.
+ */
+
+void mem_cgroup_uncharge_start(void)
+{
+	current->memcg_batch.do_batch++;
+	/* We can nest. */
+	if (current->memcg_batch.do_batch == 1) {
+		current->memcg_batch.memcg = NULL;
+		current->memcg_batch.bytes = 0;
+		current->memcg_batch.memsw_bytes = 0;
+	}
+}
+
+void mem_cgroup_uncharge_end(void)
+{
+	struct memcg_batch_info *batch = &current->memcg_batch;
+
+	if (!batch->do_batch)
+		return;
+
+	batch->do_batch--;
+	if (batch->do_batch) /* If stacked, do nothing. */
+		return;
+
+	if (!batch->memcg)
+		return;
+	/*
+	 * This "batch->memcg" is valid without any css_get()/css_put(),
+	 * because we hide the charges behind us.
+	 */
+	if (batch->bytes)
+		res_counter_uncharge(&batch->memcg->res, batch->bytes);
+	if (batch->memsw_bytes)
+		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	/* forget this pointer (for sanity check) */
+	batch->memcg = NULL;
+}
+
 #ifdef CONFIG_SWAP
 /*
  * called after __delete_from_swap_cache() and drop "page" account.
mm/memory.c

@@ -956,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 		details = NULL;
 
 	BUG_ON(addr >= end);
+	mem_cgroup_uncharge_start();
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
@@ -968,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 						zap_work, details);
 	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
 	tlb_end_vma(tlb, vma);
+	mem_cgroup_uncharge_end();
 
 	return addr;
 }
mm/truncate.c

@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			pagevec_release(&pvec);
 			break;
 		}
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 	}
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 	pagevec_init(&pvec, 0);
 	while (next <= end &&
 			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 				break;
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 	return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	while (next <= end && !wrapped &&
 		pagevec_lookup(&pvec, mapping, next,
 			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 	return ret;
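A rough sense of the saving at the truncate/invalidate sites above, assuming one pagevec of N pages, all charged to the same memcg, with swap (memsw) accounting enabled (illustrative arithmetic, not a measurement from the changelog):

	before:  N x res_counter_uncharge(&mem->res)  +  N x res_counter_uncharge(&mem->memsw)        = 2N locked counter updates
	after:   1 x res_counter_uncharge(&res, N * PAGE_SIZE)  +  1 x uncharge(&memsw, N * PAGE_SIZE) = 2 locked counter updates

The unmap path is bounded in the same way by ZAP_BLOCK_SIZE, so the batch never grows without limit.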