tmpfs: fix race between umount and swapoff
The use of igrab() in swapoff's shmem_unuse_inode() is just as vulnerable to umount as that in shmem_writepage(). Fix this instance by extending the protection of shmem_swaplist_mutex right across shmem_unuse_inode(): while it's on the list, the inode cannot be evicted (and the filesystem cannot be unmounted) without shmem_evict_inode() taking that mutex to remove it from the list. But since shmem_writepage() might take that mutex, we should avoid making memory allocations or memcg charges while holding it: prepare them at the outer level in shmem_unuse(). When mem_cgroup_cache_charge() was originally placed, we didn't know until that point that the page from swap was actually a shmem page; but nowadays it's noted in the swap_map, so we're safe to charge upfront. For the radix_tree, do as is done in shmem_getpage(): preload upfront, but don't pin to the cpu; so we make a habit of refreshing the node pool, but might dip into GFP_NOWAIT reserves on occasion if subsequently preempted. With the allocation and charge moved out from shmem_unuse_inode(), we can also hold index map and info->lock over from finding the entry. Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
					parent
					
						
							
								b1dea800ac
							
						
					
				
			
			
				commit
				
					
						778dd893ae
					
				
			
		
					 1 changed files with 43 additions and 45 deletions
				
			
		
							
								
								
									
										88
									
								
								mm/shmem.c
									
										
									
									
									
								
							
							
						
						
									
										88
									
								
								mm/shmem.c
									
										
									
									
									
								
							|  | @ -852,7 +852,7 @@ static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_ | ||||||
| 
 | 
 | ||||||
| static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) | static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) | ||||||
| { | { | ||||||
| 	struct inode *inode; | 	struct address_space *mapping; | ||||||
| 	unsigned long idx; | 	unsigned long idx; | ||||||
| 	unsigned long size; | 	unsigned long size; | ||||||
| 	unsigned long limit; | 	unsigned long limit; | ||||||
|  | @ -875,8 +875,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | ||||||
| 	if (size > SHMEM_NR_DIRECT) | 	if (size > SHMEM_NR_DIRECT) | ||||||
| 		size = SHMEM_NR_DIRECT; | 		size = SHMEM_NR_DIRECT; | ||||||
| 	offset = shmem_find_swp(entry, ptr, ptr+size); | 	offset = shmem_find_swp(entry, ptr, ptr+size); | ||||||
| 	if (offset >= 0) | 	if (offset >= 0) { | ||||||
|  | 		shmem_swp_balance_unmap(); | ||||||
| 		goto found; | 		goto found; | ||||||
|  | 	} | ||||||
| 	if (!info->i_indirect) | 	if (!info->i_indirect) | ||||||
| 		goto lost2; | 		goto lost2; | ||||||
| 
 | 
 | ||||||
|  | @ -914,11 +916,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s | ||||||
| 			if (size > ENTRIES_PER_PAGE) | 			if (size > ENTRIES_PER_PAGE) | ||||||
| 				size = ENTRIES_PER_PAGE; | 				size = ENTRIES_PER_PAGE; | ||||||
| 			offset = shmem_find_swp(entry, ptr, ptr+size); | 			offset = shmem_find_swp(entry, ptr, ptr+size); | ||||||
| 			shmem_swp_unmap(ptr); |  | ||||||
| 			if (offset >= 0) { | 			if (offset >= 0) { | ||||||
| 				shmem_dir_unmap(dir); | 				shmem_dir_unmap(dir); | ||||||
| 				goto found; | 				goto found; | ||||||
| 			} | 			} | ||||||
|  | 			shmem_swp_unmap(ptr); | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| lost1: | lost1: | ||||||
|  | @ -928,8 +930,7 @@ lost2: | ||||||
| 	return 0; | 	return 0; | ||||||
| found: | found: | ||||||
| 	idx += offset; | 	idx += offset; | ||||||
| 	inode = igrab(&info->vfs_inode); | 	ptr += offset; | ||||||
| 	spin_unlock(&info->lock); |  | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Move _head_ to start search for next from here. | 	 * Move _head_ to start search for next from here. | ||||||
|  | @ -940,37 +941,18 @@ found: | ||||||
| 	 */ | 	 */ | ||||||
| 	if (shmem_swaplist.next != &info->swaplist) | 	if (shmem_swaplist.next != &info->swaplist) | ||||||
| 		list_move_tail(&shmem_swaplist, &info->swaplist); | 		list_move_tail(&shmem_swaplist, &info->swaplist); | ||||||
| 	mutex_unlock(&shmem_swaplist_mutex); |  | ||||||
| 
 | 
 | ||||||
| 	error = 1; |  | ||||||
| 	if (!inode) |  | ||||||
| 		goto out; |  | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Charge page using GFP_KERNEL while we can wait. | 	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, | ||||||
| 	 * Charged back to the user(not to caller) when swap account is used. | 	 * but also to hold up shmem_evict_inode(): so inode cannot be freed | ||||||
| 	 * add_to_page_cache() will be called with GFP_NOWAIT. | 	 * beneath us (pagelock doesn't help until the page is in pagecache). | ||||||
| 	 */ | 	 */ | ||||||
| 	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 	mapping = info->vfs_inode.i_mapping; | ||||||
| 	if (error) | 	error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); | ||||||
| 		goto out; | 	/* which does mem_cgroup_uncharge_cache_page on error */ | ||||||
| 	error = radix_tree_preload(GFP_KERNEL); |  | ||||||
| 	if (error) { |  | ||||||
| 		mem_cgroup_uncharge_cache_page(page); |  | ||||||
| 		goto out; |  | ||||||
| 	} |  | ||||||
| 	error = 1; |  | ||||||
| 
 |  | ||||||
| 	spin_lock(&info->lock); |  | ||||||
| 	ptr = shmem_swp_entry(info, idx, NULL); |  | ||||||
| 	if (ptr && ptr->val == entry.val) { |  | ||||||
| 		error = add_to_page_cache_locked(page, inode->i_mapping, |  | ||||||
| 						idx, GFP_NOWAIT); |  | ||||||
| 		/* does mem_cgroup_uncharge_cache_page on error */ |  | ||||||
| 	} else	/* we must compensate for our precharge above */ |  | ||||||
| 		mem_cgroup_uncharge_cache_page(page); |  | ||||||
| 
 | 
 | ||||||
| 	if (error == -EEXIST) { | 	if (error == -EEXIST) { | ||||||
| 		struct page *filepage = find_get_page(inode->i_mapping, idx); | 		struct page *filepage = find_get_page(mapping, idx); | ||||||
| 		error = 1; | 		error = 1; | ||||||
| 		if (filepage) { | 		if (filepage) { | ||||||
| 			/*
 | 			/*
 | ||||||
|  | @ -990,14 +972,8 @@ found: | ||||||
| 		swap_free(entry); | 		swap_free(entry); | ||||||
| 		error = 1;	/* not an error, but entry was found */ | 		error = 1;	/* not an error, but entry was found */ | ||||||
| 	} | 	} | ||||||
| 	if (ptr) | 	shmem_swp_unmap(ptr); | ||||||
| 		shmem_swp_unmap(ptr); |  | ||||||
| 	spin_unlock(&info->lock); | 	spin_unlock(&info->lock); | ||||||
| 	radix_tree_preload_end(); |  | ||||||
| out: |  | ||||||
| 	unlock_page(page); |  | ||||||
| 	page_cache_release(page); |  | ||||||
| 	iput(inode);		/* allows for NULL */ |  | ||||||
| 	return error; | 	return error; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -1009,6 +985,26 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | ||||||
| 	struct list_head *p, *next; | 	struct list_head *p, *next; | ||||||
| 	struct shmem_inode_info *info; | 	struct shmem_inode_info *info; | ||||||
| 	int found = 0; | 	int found = 0; | ||||||
|  | 	int error; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Charge page using GFP_KERNEL while we can wait, before taking | ||||||
|  | 	 * the shmem_swaplist_mutex which might hold up shmem_writepage(). | ||||||
|  | 	 * Charged back to the user (not to caller) when swap account is used. | ||||||
|  | 	 * add_to_page_cache() will be called with GFP_NOWAIT. | ||||||
|  | 	 */ | ||||||
|  | 	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | ||||||
|  | 	if (error) | ||||||
|  | 		goto out; | ||||||
|  | 	/*
 | ||||||
|  | 	 * Try to preload while we can wait, to not make a habit of | ||||||
|  | 	 * draining atomic reserves; but don't latch on to this cpu, | ||||||
|  | 	 * it's okay if sometimes we get rescheduled after this. | ||||||
|  | 	 */ | ||||||
|  | 	error = radix_tree_preload(GFP_KERNEL); | ||||||
|  | 	if (error) | ||||||
|  | 		goto uncharge; | ||||||
|  | 	radix_tree_preload_end(); | ||||||
| 
 | 
 | ||||||
| 	mutex_lock(&shmem_swaplist_mutex); | 	mutex_lock(&shmem_swaplist_mutex); | ||||||
| 	list_for_each_safe(p, next, &shmem_swaplist) { | 	list_for_each_safe(p, next, &shmem_swaplist) { | ||||||
|  | @ -1016,17 +1012,19 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | ||||||
| 		found = shmem_unuse_inode(info, entry, page); | 		found = shmem_unuse_inode(info, entry, page); | ||||||
| 		cond_resched(); | 		cond_resched(); | ||||||
| 		if (found) | 		if (found) | ||||||
| 			goto out; | 			break; | ||||||
| 	} | 	} | ||||||
| 	mutex_unlock(&shmem_swaplist_mutex); | 	mutex_unlock(&shmem_swaplist_mutex); | ||||||
| 	/*
 | 
 | ||||||
| 	 * Can some race bring us here?  We've been holding page lock, | uncharge: | ||||||
| 	 * so I think not; but would rather try again later than BUG() | 	if (!found) | ||||||
| 	 */ | 		mem_cgroup_uncharge_cache_page(page); | ||||||
|  | 	if (found < 0) | ||||||
|  | 		error = found; | ||||||
|  | out: | ||||||
| 	unlock_page(page); | 	unlock_page(page); | ||||||
| 	page_cache_release(page); | 	page_cache_release(page); | ||||||
| out: | 	return error; | ||||||
| 	return (found < 0) ? found : 0; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Hugh Dickins
				Hugh Dickins