ext4: fix ext4_flush_completed_IO wait semantics
BUG #1) All places where we call ext4_flush_completed_IO are broken because buffered io and DIO/AIO goes through three stages 1) submitted io, 2) completed io (in i_completed_io_list) conversion pended 3) finished io (conversion done) And by calling ext4_flush_completed_IO we will flush only requests which were in (2) stage, which is wrong because: 1) punch_hole and truncate _must_ wait for all outstanding unwritten io regardless to it's state. 2) fsync and nolock_dio_read should also wait because there is a time window between end_page_writeback() and ext4_add_complete_io() As result integrity fsync is broken in case of buffered write to fallocated region: fsync blkdev_completion ->filemap_write_and_wait_range ->ext4_end_bio ->end_page_writeback <-- filemap_write_and_wait_range return ->ext4_flush_completed_IO sees empty i_completed_io_list but pended conversion still exist ->ext4_add_complete_io BUG #2) Race window becomes wider due to the 'ext4: completed_io locking cleanup V4' patch series This patch make following changes: 1) ext4_flush_completed_io() now first try to flush completed io and when wait for any outstanding unwritten io via ext4_unwritten_wait() 2) Rename function to more appropriate name. 3) Assert that all callers of ext4_flush_unwritten_io should hold i_mutex to prevent endless wait Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Jan Kara <jack@suse.cz>
This commit is contained in:
		
					parent
					
						
							
								041bbb6d36
							
						
					
				
			
			
				commit
				
					
						c278531d39
					
				
			
		
					 6 changed files with 19 additions and 13 deletions
				
			
		|  | @ -1947,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); | |||
| 
 | ||||
| /* fsync.c */ | ||||
| extern int ext4_sync_file(struct file *, loff_t, loff_t, int); | ||||
| extern int ext4_flush_completed_IO(struct inode *); | ||||
| extern int ext4_flush_unwritten_io(struct inode *); | ||||
| 
 | ||||
| /* hash.c */ | ||||
| extern int ext4fs_dirhash(const char *name, int len, struct | ||||
|  | @ -2371,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations; | |||
| extern const struct inode_operations ext4_file_inode_operations; | ||||
| extern const struct file_operations ext4_file_operations; | ||||
| extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); | ||||
| extern void ext4_unwritten_wait(struct inode *inode); | ||||
| 
 | ||||
| /* namei.c */ | ||||
| extern const struct inode_operations ext4_dir_inode_operations; | ||||
|  |  | |||
|  | @ -4268,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode) | |||
| 	 * finish any pending end_io work so we won't run the risk of | ||||
| 	 * converting any truncated blocks to initialized later | ||||
| 	 */ | ||||
| 	ext4_flush_completed_IO(inode); | ||||
| 	ext4_flush_unwritten_io(inode); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * probably first extent we're gonna free will be last in block | ||||
|  | @ -4847,10 +4847,10 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
| 
 | ||||
| 	/* Wait all existing dio workers, newcomers will block on i_mutex */ | ||||
| 	ext4_inode_block_unlocked_dio(inode); | ||||
| 	inode_dio_wait(inode); | ||||
| 	err = ext4_flush_completed_IO(inode); | ||||
| 	err = ext4_flush_unwritten_io(inode); | ||||
| 	if (err) | ||||
| 		goto out_dio; | ||||
| 	inode_dio_wait(inode); | ||||
| 
 | ||||
| 	credits = ext4_writepage_trans_blocks(inode); | ||||
| 	handle = ext4_journal_start(inode, credits); | ||||
|  |  | |||
|  | @ -55,7 +55,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void ext4_unwritten_wait(struct inode *inode) | ||||
| void ext4_unwritten_wait(struct inode *inode) | ||||
| { | ||||
| 	wait_queue_head_t *wq = ext4_ioend_wq(inode); | ||||
| 
 | ||||
|  |  | |||
|  | @ -138,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 	if (inode->i_sb->s_flags & MS_RDONLY) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	ret = ext4_flush_completed_IO(inode); | ||||
| 	ret = ext4_flush_unwritten_io(inode); | ||||
| 	if (ret < 0) | ||||
| 		goto out; | ||||
| 
 | ||||
|  |  | |||
|  | @ -807,9 +807,11 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | |||
| 
 | ||||
| retry: | ||||
| 	if (rw == READ && ext4_should_dioread_nolock(inode)) { | ||||
| 		if (unlikely(!list_empty(&ei->i_completed_io_list))) | ||||
| 			ext4_flush_completed_IO(inode); | ||||
| 
 | ||||
| 		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) { | ||||
| 			mutex_lock(&inode->i_mutex); | ||||
| 			ext4_flush_unwritten_io(inode); | ||||
| 			mutex_unlock(&inode->i_mutex); | ||||
| 		} | ||||
| 		/*
 | ||||
| 		 * Nolock dioread optimization may be dynamically disabled | ||||
| 		 * via ext4_inode_block_unlocked_dio(). Check inode's state | ||||
|  |  | |||
|  | @ -189,8 +189,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode, | |||
| 
 | ||||
| 		list_add_tail(&io->list, &complete); | ||||
| 	} | ||||
| 	/* It is important to update all flags for all end_io in one shot w/o
 | ||||
| 	 * dropping the lock.*/ | ||||
| 	spin_lock_irqsave(&ei->i_completed_io_lock, flags); | ||||
| 	while (!list_empty(&complete)) { | ||||
| 		io = list_entry(complete.next, ext4_io_end_t, list); | ||||
|  | @ -228,9 +226,14 @@ static void ext4_end_io_work(struct work_struct *work) | |||
| 	ext4_do_flush_completed_IO(io->inode, io); | ||||
| } | ||||
| 
 | ||||
| int ext4_flush_completed_IO(struct inode *inode) | ||||
| int ext4_flush_unwritten_io(struct inode *inode) | ||||
| { | ||||
| 	return ext4_do_flush_completed_IO(inode, NULL); | ||||
| 	int ret; | ||||
| 	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && | ||||
| 		     !(inode->i_state & I_FREEING)); | ||||
| 	ret = ext4_do_flush_completed_IO(inode, NULL); | ||||
| 	ext4_unwritten_wait(inode); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Dmitry Monakhov
				Dmitry Monakhov