xfs: merge fsync and O_SYNC handling
The guarantees for O_SYNC are exactly the same as the ones we need to
make for an fsync call (and given that Linux O_SYNC is O_DSYNC the
equivalent is fdadatasync, but we treat both the same in XFS), except
with a range data writeout.  Jan Kara has started unifying these two
path for filesystems using the generic helpers, and I've started to
look at XFS.
The actual transaction commited by xfs_fsync and xfs_write_sync_logforce
has a different transaction number, but actually is exactly the same.
We'll only use the fsync transaction going forward.  One major difference
is that xfs_write_sync_logforce never issues a cache flush unless we
commit a transaction causing that as a side-effect, which is an obvious
bug in the O_SYNC handling.  Second all the locking and i_update_size
vs i_update_core changes from 978b723712
never made it to xfs_write_sync_logforce, so we add them back.
To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait
call is moved up to xfs_file_fsync, so that we don't wait on the whole
file after we already waited for our portion in xfs_write.
We'll also use a plain call to filemap_write_and_wait_range instead
of the previous sync_page_rang which did it in two steps including
an half-hearted inode write out that doesn't help us.
Once we're done with this also remove the now useless i_update_size
tracking.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
	
	
This commit is contained in:
		
					parent
					
						
							
								bd16956599
							
						
					
				
			
			
				commit
				
					
						13e6d5cdde
					
				
			
		
					 10 changed files with 23 additions and 112 deletions
				
			
		|  | @ -216,7 +216,6 @@ xfs_setfilesize( | |||
| 	if (ip->i_d.di_size < isize) { | ||||
| 		ip->i_d.di_size = isize; | ||||
| 		ip->i_update_core = 1; | ||||
| 		ip->i_update_size = 1; | ||||
| 		xfs_mark_inode_dirty_sync(ip); | ||||
| 	} | ||||
| 
 | ||||
|  |  | |||
|  | @ -172,12 +172,21 @@ xfs_file_release( | |||
|  */ | ||||
| STATIC int | ||||
| xfs_file_fsync( | ||||
| 	struct file	*filp, | ||||
| 	struct dentry	*dentry, | ||||
| 	int		datasync) | ||||
| 	struct file		*file, | ||||
| 	struct dentry		*dentry, | ||||
| 	int			datasync) | ||||
| { | ||||
| 	xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); | ||||
| 	return -xfs_fsync(XFS_I(dentry->d_inode)); | ||||
| 	struct inode		*inode = dentry->d_inode; | ||||
| 	struct xfs_inode	*ip = XFS_I(inode); | ||||
| 	int			error; | ||||
| 
 | ||||
| 	/* capture size updates in I/O completion before writing the inode. */ | ||||
| 	error = filemap_fdatawait(inode->i_mapping); | ||||
| 	if (error) | ||||
| 		return error; | ||||
| 
 | ||||
| 	xfs_iflags_clear(ip, XFS_ITRUNCATED); | ||||
| 	return -xfs_fsync(ip); | ||||
| } | ||||
| 
 | ||||
| STATIC int | ||||
|  |  | |||
|  | @ -812,18 +812,21 @@ write_retry: | |||
| 
 | ||||
| 	/* Handle various SYNC-type writes */ | ||||
| 	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { | ||||
| 		loff_t end = pos + ret - 1; | ||||
| 		int error2; | ||||
| 
 | ||||
| 		xfs_iunlock(xip, iolock); | ||||
| 		if (need_i_mutex) | ||||
| 			mutex_unlock(&inode->i_mutex); | ||||
| 		error2 = sync_page_range(inode, mapping, pos, ret); | ||||
| 
 | ||||
| 		error2 = filemap_write_and_wait_range(mapping, pos, end); | ||||
| 		if (!error) | ||||
| 			error = error2; | ||||
| 		if (need_i_mutex) | ||||
| 			mutex_lock(&inode->i_mutex); | ||||
| 		xfs_ilock(xip, iolock); | ||||
| 		error2 = xfs_write_sync_logforce(mp, xip); | ||||
| 
 | ||||
| 		error2 = xfs_fsync(xip); | ||||
| 		if (!error) | ||||
| 			error = error2; | ||||
| 	} | ||||
|  |  | |||
|  | @ -82,7 +82,6 @@ xfs_inode_alloc( | |||
| 	memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); | ||||
| 	ip->i_flags = 0; | ||||
| 	ip->i_update_core = 0; | ||||
| 	ip->i_update_size = 0; | ||||
| 	ip->i_delayed_blks = 0; | ||||
| 	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); | ||||
| 	ip->i_size = 0; | ||||
|  |  | |||
|  | @ -261,7 +261,6 @@ typedef struct xfs_inode { | |||
| 	/* Miscellaneous state. */ | ||||
| 	unsigned short		i_flags;	/* see defined flags below */ | ||||
| 	unsigned char		i_update_core;	/* timestamps/size is dirty */ | ||||
| 	unsigned char		i_update_size;	/* di_size field is dirty */ | ||||
| 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */ | ||||
| 
 | ||||
| 	xfs_icdinode_t		i_d;		/* most of ondisk inode */ | ||||
|  |  | |||
|  | @ -262,14 +262,6 @@ xfs_inode_item_format( | |||
| 		SYNCHRONIZE(); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We don't have to worry about re-ordering here because | ||||
| 	 * the update_size field is protected by the inode lock | ||||
| 	 * and we have that held in exclusive mode. | ||||
| 	 */ | ||||
| 	if (ip->i_update_size) | ||||
| 		ip->i_update_size = 0; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Make sure to get the latest atime from the Linux inode. | ||||
| 	 */ | ||||
|  |  | |||
|  | @ -87,90 +87,6 @@ xfs_write_clear_setuid( | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Handle logging requirements of various synchronous types of write. | ||||
|  */ | ||||
| int | ||||
| xfs_write_sync_logforce( | ||||
| 	xfs_mount_t	*mp, | ||||
| 	xfs_inode_t	*ip) | ||||
| { | ||||
| 	int		error = 0; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we're treating this as O_DSYNC and we have not updated the | ||||
| 	 * size, force the log. | ||||
| 	 */ | ||||
| 	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && | ||||
| 	    !(ip->i_update_size)) { | ||||
| 		xfs_inode_log_item_t	*iip = ip->i_itemp; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If an allocation transaction occurred | ||||
| 		 * without extending the size, then we have to force | ||||
| 		 * the log up the proper point to ensure that the | ||||
| 		 * allocation is permanent.  We can't count on | ||||
| 		 * the fact that buffered writes lock out direct I/O | ||||
| 		 * writes - the direct I/O write could have extended | ||||
| 		 * the size nontransactionally, then finished before | ||||
| 		 * we started.  xfs_write_file will think that the file | ||||
| 		 * didn't grow but the update isn't safe unless the | ||||
| 		 * size change is logged. | ||||
| 		 * | ||||
| 		 * Force the log if we've committed a transaction | ||||
| 		 * against the inode or if someone else has and | ||||
| 		 * the commit record hasn't gone to disk (e.g. | ||||
| 		 * the inode is pinned).  This guarantees that | ||||
| 		 * all changes affecting the inode are permanent | ||||
| 		 * when we return. | ||||
| 		 */ | ||||
| 		if (iip && iip->ili_last_lsn) { | ||||
| 			error = _xfs_log_force(mp, iip->ili_last_lsn, | ||||
| 					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); | ||||
| 		} else if (xfs_ipincount(ip) > 0) { | ||||
| 			error = _xfs_log_force(mp, (xfs_lsn_t)0, | ||||
| 					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); | ||||
| 		} | ||||
| 
 | ||||
| 	} else { | ||||
| 		xfs_trans_t	*tp; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * O_SYNC or O_DSYNC _with_ a size update are handled | ||||
| 		 * the same way. | ||||
| 		 * | ||||
| 		 * If the write was synchronous then we need to make | ||||
| 		 * sure that the inode modification time is permanent. | ||||
| 		 * We'll have updated the timestamp above, so here | ||||
| 		 * we use a synchronous transaction to log the inode. | ||||
| 		 * It's not fast, but it's necessary. | ||||
| 		 * | ||||
| 		 * If this a dsync write and the size got changed | ||||
| 		 * non-transactionally, then we need to ensure that | ||||
| 		 * the size change gets logged in a synchronous | ||||
| 		 * transaction. | ||||
| 		 */ | ||||
| 		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); | ||||
| 		if ((error = xfs_trans_reserve(tp, 0, | ||||
| 						XFS_SWRITE_LOG_RES(mp), | ||||
| 						0, 0, 0))) { | ||||
| 			/* Transaction reserve failed */ | ||||
| 			xfs_trans_cancel(tp, 0); | ||||
| 		} else { | ||||
| 			/* Transaction reserve successful */ | ||||
| 			xfs_ilock(ip, XFS_ILOCK_EXCL); | ||||
| 			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | ||||
| 			xfs_trans_ihold(tp, ip); | ||||
| 			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||||
| 			xfs_trans_set_sync(tp); | ||||
| 			error = xfs_trans_commit(tp, 0); | ||||
| 			xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return error; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Force a shutdown of the filesystem instantly while keeping | ||||
|  * the filesystem consistent. We don't do an unmount here; just shutdown | ||||
|  |  | |||
|  | @ -68,7 +68,6 @@ xfs_get_extsz_hint( | |||
|  * Prototypes for functions in xfs_rw.c. | ||||
|  */ | ||||
| extern int xfs_write_clear_setuid(struct xfs_inode *ip); | ||||
| extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip); | ||||
| extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); | ||||
| extern int xfs_bioerror(struct xfs_buf *bp); | ||||
| extern int xfs_bioerror_relse(struct xfs_buf *bp); | ||||
|  |  | |||
|  | @ -68,7 +68,7 @@ typedef struct xfs_trans_header { | |||
| #define XFS_TRANS_GROWFS		14 | ||||
| #define XFS_TRANS_STRAT_WRITE		15 | ||||
| #define XFS_TRANS_DIOSTRAT		16 | ||||
| #define	XFS_TRANS_WRITE_SYNC		17 | ||||
| /* 17 was XFS_TRANS_WRITE_SYNC */ | ||||
| #define	XFS_TRANS_WRITEID		18 | ||||
| #define	XFS_TRANS_ADDAFORK		19 | ||||
| #define	XFS_TRANS_ATTRINVAL		20 | ||||
|  |  | |||
|  | @ -611,7 +611,7 @@ xfs_fsync( | |||
| 	xfs_inode_t	*ip) | ||||
| { | ||||
| 	xfs_trans_t	*tp; | ||||
| 	int		error; | ||||
| 	int		error = 0; | ||||
| 	int		log_flushed = 0, changed = 1; | ||||
| 
 | ||||
| 	xfs_itrace_entry(ip); | ||||
|  | @ -619,14 +619,9 @@ xfs_fsync( | |||
| 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | ||||
| 		return XFS_ERROR(EIO); | ||||
| 
 | ||||
| 	/* capture size updates in I/O completion before writing the inode. */ | ||||
| 	error = xfs_wait_on_pages(ip, 0, -1); | ||||
| 	if (error) | ||||
| 		return XFS_ERROR(error); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We always need to make sure that the required inode state is safe on | ||||
| 	 * disk.  The vnode might be clean but we still might need to force the | ||||
| 	 * disk.  The inode might be clean but we still might need to force the | ||||
| 	 * log because of committed transactions that haven't hit the disk yet. | ||||
| 	 * Likewise, there could be unflushed non-transactional changes to the | ||||
| 	 * inode core that have to go to disk and this requires us to issue | ||||
|  | @ -638,7 +633,7 @@ xfs_fsync( | |||
| 	 */ | ||||
| 	xfs_ilock(ip, XFS_ILOCK_SHARED); | ||||
| 
 | ||||
| 	if (!(ip->i_update_size || ip->i_update_core)) { | ||||
| 	if (!ip->i_update_core) { | ||||
| 		/*
 | ||||
| 		 * Timestamps/size haven't changed since last inode flush or | ||||
| 		 * inode transaction commit.  That means either nothing got | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Christoph Hellwig
				Christoph Hellwig