Merge branch 'xfs-mmap-lock' into for-next
commit 88e8fda99a

8 changed files with 216 additions and 113 deletions
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1599,13 +1599,6 @@ xfs_swap_extent_flush(
 	/* Verify O_DIRECT for ftmp */
 	if (VFS_I(ip)->i_mapping->nrpages)
 		return -EINVAL;
-
-	/*
-	 * Don't try to swap extents on mmap()d files because we can't lock
-	 * out races against page faults safely.
-	 */
-	if (mapping_mapped(VFS_I(ip)->i_mapping))
-		return -EBUSY;
 	return 0;
 }
 
@@ -1633,13 +1626,14 @@ xfs_swap_extents(
 	}
 
 	/*
-	 * Lock up the inodes against other IO and truncate to begin with.
-	 * Then we can ensure the inodes are flushed and have no page cache
-	 * safely. Once we have done this we can take the ilocks and do the rest
-	 * of the checks.
+	 * Lock the inodes against other IO, page faults and truncate to
+	 * begin with.  Then we can ensure the inodes are flushed and have no
+	 * page cache safely. Once we have done this we can take the ilocks and
+	 * do the rest of the checks.
 	 */
-	lock_flags = XFS_IOLOCK_EXCL;
+	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
 
 	/* Verify that both files have the same format */
 	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
@@ -1666,8 +1660,16 @@ xfs_swap_extents(
 		xfs_trans_cancel(tp, 0);
 		goto out_unlock;
 	}
+
+	/*
+	 * Lock and join the inodes to the transaction so that transaction
+	 * commit or cancel will unlock the inodes from this point onwards.
+	 */
+	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+	lock_flags |= XFS_ILOCK_EXCL;
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ijoin(tp, tip, lock_flags);
 
 	/* Verify all data are being swapped */
 	if (sxp->sx_offset != 0 ||
@@ -1720,9 +1722,6 @@ xfs_swap_extents(
 			goto out_trans_cancel;
 	}
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	xfs_trans_ijoin(tp, tip, lock_flags);
-
 	/*
 	 * Before we've swapped the forks, lets set the owners of the forks
 	 * appropriately. We have to do this as we are demand paging the btree
@@ -1856,5 +1855,5 @@ out_unlock:
 
 out_trans_cancel:
 	xfs_trans_cancel(tp, 0);
-	goto out_unlock;
+	goto out;
 }
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -847,6 +847,9 @@ xfs_file_fallocate(
 	if (error)
 		goto out_unlock;
 
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	iolock |= XFS_MMAPLOCK_EXCL;
+
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
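This pattern recurs throughout the series: take XFS_MMAPLOCK_EXCL after the IO lock, then OR it into the iolock flags so the single unlock call in the common exit path drops every lock taken so far. Below is a minimal userspace sketch of that accumulate-then-unlock-once idea; all of the names in it are illustrative, not the kernel's API.

    /* Accumulate lock flags as locks are taken; one "unlock" call at the
     * exit path covers everything. Illustrative userspace sketch only. */
    #include <stdio.h>

    #define IOLOCK_EXCL   (1 << 0)
    #define MMAPLOCK_EXCL (1 << 1)

    static void ilock(unsigned int flags)   { printf("lock   0x%x\n", flags); }
    static void iunlock(unsigned int flags) { printf("unlock 0x%x\n", flags); }

    int main(void)
    {
        unsigned int iolock = IOLOCK_EXCL;

        ilock(iolock);           /* IO lock first, per the lock order */
        ilock(MMAPLOCK_EXCL);    /* then the mmap lock */
        iolock |= MMAPLOCK_EXCL; /* remember it for the exit path */

        /* ... extent manipulation would go here ... */

        iunlock(iolock);         /* one call covers both flags */
        return 0;
    }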
@@ -996,20 +999,6 @@ xfs_file_mmap(
 	return 0;
 }
 
-/*
- * mmap()d file has taken write protection fault and is being made
- * writable. We can set the page state up correctly for a writable
- * page, which means we can do correct delalloc accounting (ENOSPC
- * checking!) and unwritten extent mapping.
- */
-STATIC int
-xfs_vm_page_mkwrite(
-	struct vm_area_struct	*vma,
-	struct vm_fault		*vmf)
-{
-	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1385,6 +1374,55 @@ xfs_file_llseek(
 	}
 }
 
+/*
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
+ *
+ * mmap_sem (MM)
+ *   i_mmap_lock (XFS - truncate serialisation)
+ *     page_lock (MM)
+ *       i_lock (XFS - extent map serialisation)
+ */
+STATIC int
+xfs_filemap_fault(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_fault(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = filemap_fault(vma, vmf);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
+ */
+STATIC int
+xfs_filemap_page_mkwrite(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_page_mkwrite(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
 const struct file_operations xfs_file_operations = {
 	.llseek		= xfs_file_llseek,
 	.read		= new_sync_read,
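Both new handlers wrap their generic counterpart (filemap_fault() and block_page_mkwrite()) in XFS_MMAPLOCK_SHARED, so concurrent faults proceed against each other but serialise against anyone holding the lock exclusively for truncate or hole punch. A userspace analogue of that wrapper shape, using a pthread rwlock in place of the kernel's mrlock (names illustrative):

    /* Userspace analogue of the fault-wrapper pattern: a shared (read)
     * lock held across the generic handler keeps faults out while
     * truncate-like operations hold the same lock exclusively. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t mmaplock = PTHREAD_RWLOCK_INITIALIZER;

    static int generic_fault(void)
    {
        /* stands in for filemap_fault() / block_page_mkwrite() */
        return 0;
    }

    static int filemap_fault_wrapped(void)
    {
        int error;

        pthread_rwlock_rdlock(&mmaplock);  /* like XFS_MMAPLOCK_SHARED */
        error = generic_fault();
        pthread_rwlock_unlock(&mmaplock);

        return error;
    }

    int main(void)
    {
        printf("fault -> %d\n", filemap_fault_wrapped());
        return 0;
    }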
@@ -1417,7 +1455,7 @@ const struct file_operations xfs_dir_file_operations = {
 };
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
-	.fault		= filemap_fault,
+	.fault		= xfs_filemap_fault,
 	.map_pages	= filemap_map_pages,
-	.page_mkwrite	= xfs_vm_page_mkwrite,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
 };
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared(
 }
 
 /*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock.  This routine
- * allows either or both of the locks to be obtained.
+ * The xfs inode contains 3 multi-reader locks: the i_iolock, the i_mmap_lock
+ * and the i_lock.  This routine allows various combinations of the locks to
+ * be obtained.
  *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
+ * The 3 locks should always be ordered so that the IO lock is obtained first,
+ * the mmap lock second and the ilock last in order to prevent deadlock.
  *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- *       to be locked.  It can be:
- *		XFS_IOLOCK_SHARED,
- *		XFS_IOLOCK_EXCL,
- *		XFS_ILOCK_SHARED,
- *		XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ * Basic locking order:
+ *
+ * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ *
+ * mmap_sem locking order:
+ *
+ * i_iolock -> page lock -> mmap_sem
+ * mmap_sem -> i_mmap_lock -> page_lock
+ *
+ * The difference in mmap_sem locking order means that we cannot hold the
+ * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * in get_user_pages() to map the user pages into the kernel address space for
+ * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
+ * page faults already hold the mmap_sem.
+ *
+ * Hence to serialise fully against both syscall and mmap based IO, we need to
+ * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
+ * taken in places where we need to invalidate the page cache in a race
+ * free manner (e.g. truncate, hole punch and other extent manipulation
+ * functions).
  */
 void
 xfs_ilock(
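The rewritten comment pins down a strict acquisition order: iolock, then mmaplock, then ilock. A userspace analogue of ordered acquisition over three rwlocks follows; the names are illustrative (the kernel uses mrlocks and lockdep, not pthreads), and it is a sketch of the ordering discipline only:

    /* Ordered acquisition sketch: iolock before mmaplock before ilock,
     * mirroring the comment above. Illustrative userspace analogue. */
    #include <pthread.h>

    static pthread_rwlock_t i_iolock   = PTHREAD_RWLOCK_INITIALIZER;
    static pthread_rwlock_t i_mmaplock = PTHREAD_RWLOCK_INITIALIZER;
    static pthread_rwlock_t i_lock     = PTHREAD_RWLOCK_INITIALIZER;

    enum {
        IOLOCK_EXCL   = 1 << 0,
        MMAPLOCK_EXCL = 1 << 1,
        ILOCK_EXCL    = 1 << 2,
    };

    static void ilock(unsigned int flags)
    {
        /* always take the locks in the documented order */
        if (flags & IOLOCK_EXCL)
            pthread_rwlock_wrlock(&i_iolock);
        if (flags & MMAPLOCK_EXCL)
            pthread_rwlock_wrlock(&i_mmaplock);
        if (flags & ILOCK_EXCL)
            pthread_rwlock_wrlock(&i_lock);
    }

    static void iunlock(unsigned int flags)
    {
        /* unlock order does not affect deadlock avoidance */
        if (flags & ILOCK_EXCL)
            pthread_rwlock_unlock(&i_lock);
        if (flags & MMAPLOCK_EXCL)
            pthread_rwlock_unlock(&i_mmaplock);
        if (flags & IOLOCK_EXCL)
            pthread_rwlock_unlock(&i_iolock);
    }

    int main(void)
    {
        ilock(IOLOCK_EXCL | MMAPLOCK_EXCL);
        iunlock(IOLOCK_EXCL | MMAPLOCK_EXCL);
        return 0;
    }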
@@ -150,6 +160,8 @@ xfs_ilock(
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -159,6 +171,11 @@ xfs_ilock(
 	else if (lock_flags & XFS_IOLOCK_SHARED)
 		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 	else if (lock_flags & XFS_ILOCK_SHARED)
@@ -191,6 +208,8 @@ xfs_ilock_nowait(
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -202,21 +221,35 @@ xfs_ilock_nowait(
 		if (!mrtryaccess(&ip->i_iolock))
 			goto out;
 	}
+
+	if (lock_flags & XFS_MMAPLOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_mmaplock))
+			goto out_undo_iolock;
+	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_mmaplock))
+			goto out_undo_iolock;
+	}
+
 	if (lock_flags & XFS_ILOCK_EXCL) {
 		if (!mrtryupdate(&ip->i_lock))
-			goto out_undo_iolock;
+			goto out_undo_mmaplock;
 	} else if (lock_flags & XFS_ILOCK_SHARED) {
 		if (!mrtryaccess(&ip->i_lock))
-			goto out_undo_iolock;
+			goto out_undo_mmaplock;
 	}
 	return 1;
 
- out_undo_iolock:
+out_undo_mmaplock:
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrunlock_excl(&ip->i_mmaplock);
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mrunlock_shared(&ip->i_mmaplock);
+out_undo_iolock:
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		mrunlock_excl(&ip->i_iolock);
 	else if (lock_flags & XFS_IOLOCK_SHARED)
 		mrunlock_shared(&ip->i_iolock);
- out:
+out:
 	return 0;
 }
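The trylock variant unwinds in reverse acquisition order: a failed ilock trylock now lands on out_undo_mmaplock, which falls through to out_undo_iolock, so each failure path releases exactly the locks already taken. The same staged-unwind shape in a userspace sketch (pthread_rwlock_trywrlock() returns non-zero when the lock is busy; names illustrative):

    /* Trylock with staged unwind: each failed trylock releases only the
     * locks acquired so far, mirroring the label ordering above. */
    #include <pthread.h>

    static pthread_rwlock_t i_iolock   = PTHREAD_RWLOCK_INITIALIZER;
    static pthread_rwlock_t i_mmaplock = PTHREAD_RWLOCK_INITIALIZER;
    static pthread_rwlock_t i_lock     = PTHREAD_RWLOCK_INITIALIZER;

    static int ilock_nowait(void)
    {
        if (pthread_rwlock_trywrlock(&i_iolock))
            goto out;
        if (pthread_rwlock_trywrlock(&i_mmaplock))
            goto out_undo_iolock;
        if (pthread_rwlock_trywrlock(&i_lock))
            goto out_undo_mmaplock;
        return 1;                        /* got all three */

    out_undo_mmaplock:
        pthread_rwlock_unlock(&i_mmaplock);
    out_undo_iolock:
        pthread_rwlock_unlock(&i_iolock);
    out:
        return 0;
    }

    int main(void)
    {
        if (!ilock_nowait())
            return 1;
        pthread_rwlock_unlock(&i_lock);
        pthread_rwlock_unlock(&i_mmaplock);
        pthread_rwlock_unlock(&i_iolock);
        return 0;
    }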
@@ -244,6 +277,8 @@ xfs_iunlock(
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -254,6 +289,11 @@ xfs_iunlock(
 	else if (lock_flags & XFS_IOLOCK_SHARED)
 		mrunlock_shared(&ip->i_iolock);
 
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrunlock_excl(&ip->i_mmaplock);
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mrunlock_shared(&ip->i_mmaplock);
+
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrunlock_excl(&ip->i_lock);
 	else if (lock_flags & XFS_ILOCK_SHARED)
@@ -271,11 +311,14 @@ xfs_ilock_demote(
 	xfs_inode_t		*ip,
 	uint			lock_flags)
 {
-	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
+	ASSERT((lock_flags &
+		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrdemote(&ip->i_lock);
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrdemote(&ip->i_mmaplock);
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		mrdemote(&ip->i_iolock);
 
@@ -294,6 +337,12 @@ xfs_isilocked(
 		return rwsem_is_locked(&ip->i_lock.mr_lock);
 	}
 
+	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
+		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
+			return !!ip->i_mmaplock.mr_writer;
+		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+	}
+
 	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
 		if (!(lock_flags & XFS_IOLOCK_SHARED))
 			return !!ip->i_iolock.mr_writer;
@@ -314,14 +363,27 @@ int xfs_lock_delays;
 #endif
 
 /*
- * Bump the subclass so xfs_lock_inodes() acquires each lock with
- * a different value
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
+ * value. This shouldn't be called for page fault locking, but we also need to
+ * ensure we don't overrun the number of lockdep subclasses for the iolock or
+ * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
 */
 static inline int
 xfs_lock_inumorder(int lock_mode, int subclass)
 {
-	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+		ASSERT(subclass + XFS_LOCK_INUMORDER <
+			(1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
 		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+	}
+
+	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
+		ASSERT(subclass + XFS_LOCK_INUMORDER <
+			(1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
+		lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
+							XFS_MMAPLOCK_SHIFT;
+	}
+
 	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
 		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
 
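The new ASSERTs bound the per-inode subclass so that, once shifted, it cannot spill into the next lock's lockdep bit-field: with the shifts defined later in xfs_inode.h, the iolock field spans bits 16-19 and the mmaplock field bits 20-23. A runnable check of that packing arithmetic follows; XFS_LOCK_INUMORDER is given an assumed value of 4 here purely for illustration (the real definition lives in xfs_inode.h):

    /* Runnable check of the subclass packing the asserts above enforce:
     * a subclass only fits if it stays below 1 << (next_shift - shift). */
    #include <assert.h>
    #include <stdio.h>

    #define XFS_LOCK_INUMORDER  4   /* assumed value, for illustration */
    #define XFS_IOLOCK_SHIFT    16
    #define XFS_MMAPLOCK_SHIFT  20
    #define XFS_ILOCK_SHIFT     24

    int main(void)
    {
        int subclass = 1;           /* e.g. the second inode of a pair */
        unsigned int lock_mode = 0;

        assert(subclass + XFS_LOCK_INUMORDER <
                (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
        lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;

        assert(subclass + XFS_LOCK_INUMORDER <
                (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
        lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_MMAPLOCK_SHIFT;

        printf("packed lock_mode = 0x%x\n", lock_mode); /* prints 0x550000 */
        return 0;
    }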
@@ -440,10 +502,10 @@ again:
 }
 
 /*
- * xfs_lock_two_inodes() can only be used to lock one type of lock
- * at a time - the iolock or the ilock, but not both at once. If
- * we lock both at once, lockdep will report false positives saying
- * we have violated locking orders.
+ * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
 */
 void
 xfs_lock_two_inodes(
@@ -455,8 +517,12 @@ xfs_lock_two_inodes(
 	int			attempts = 0;
 	xfs_log_item_t		*lp;
 
-	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+		ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+	} else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
+		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+
 	ASSERT(ip0->i_ino != ip1->i_ino);
 
 	if (ip0->i_ino > ip1->i_ino) {
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -56,6 +56,7 @@ typedef struct xfs_inode {
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
 	mrlock_t		i_lock;		/* inode lock */
 	mrlock_t		i_iolock;	/* inode IO lock */
+	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 	/* Miscellaneous state. */
@@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 #define	XFS_IOLOCK_SHARED	(1<<1)
 #define	XFS_ILOCK_EXCL		(1<<2)
 #define	XFS_ILOCK_SHARED	(1<<3)
+#define	XFS_MMAPLOCK_EXCL	(1<<4)
+#define	XFS_MMAPLOCK_SHARED	(1<<5)
 
 #define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
-				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
+				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
+				| XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
 
 #define XFS_LOCK_FLAGS \
 	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
 	{ XFS_IOLOCK_SHARED,	"IOLOCK_SHARED" }, \
 	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
-	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }
+	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
+	{ XFS_MMAPLOCK_EXCL,	"MMAPLOCK_EXCL" }, \
+	{ XFS_MMAPLOCK_SHARED,	"MMAPLOCK_SHARED" }
 
 
 /*
@@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 #define XFS_IOLOCK_SHIFT	16
 #define	XFS_IOLOCK_PARENT	(XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
 
+#define XFS_MMAPLOCK_SHIFT	20
+
 #define XFS_ILOCK_SHIFT		24
 #define	XFS_ILOCK_PARENT	(XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
 #define	XFS_ILOCK_RTBITMAP	(XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
 #define	XFS_ILOCK_RTSUM		(XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
 
-#define XFS_IOLOCK_DEP_MASK	0x00ff0000
+#define XFS_IOLOCK_DEP_MASK	0x000f0000
+#define XFS_MMAPLOCK_DEP_MASK	0x00f00000
 #define XFS_ILOCK_DEP_MASK	0xff000000
-#define XFS_LOCK_DEP_MASK	(XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK)
+#define XFS_LOCK_DEP_MASK	(XFS_IOLOCK_DEP_MASK | \
+				 XFS_MMAPLOCK_DEP_MASK | \
+				 XFS_ILOCK_DEP_MASK)
 
-#define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
-#define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
+#define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) \
+					>> XFS_IOLOCK_SHIFT)
+#define XFS_MMAPLOCK_DEP(flags)	(((flags) & XFS_MMAPLOCK_DEP_MASK) \
+					>> XFS_MMAPLOCK_SHIFT)
+#define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) \
+					>> XFS_ILOCK_SHIFT)
 
 /*
  * For multiple groups support: if S_ISGID bit is set in the parent
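The reshuffled masks give each lock a non-overlapping slice of the lockdep dependency bits: 4 bits each for the iolock and the new mmaplock, 8 for the ilock. A quick runnable check that the masks are disjoint and that the DEP() extraction round-trips a packed subclass:

    /* Sanity check of the dep-mask carve-up shown above. */
    #include <assert.h>

    #define XFS_IOLOCK_SHIFT	16
    #define XFS_MMAPLOCK_SHIFT	20
    #define XFS_ILOCK_SHIFT	24

    #define XFS_IOLOCK_DEP_MASK	0x000f0000
    #define XFS_MMAPLOCK_DEP_MASK	0x00f00000
    #define XFS_ILOCK_DEP_MASK	0xff000000

    #define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \
                                        >> XFS_MMAPLOCK_SHIFT)

    int main(void)
    {
        /* no bit belongs to two masks */
        assert((XFS_IOLOCK_DEP_MASK & XFS_MMAPLOCK_DEP_MASK) == 0);
        assert((XFS_MMAPLOCK_DEP_MASK & XFS_ILOCK_DEP_MASK) == 0);
        assert((XFS_IOLOCK_DEP_MASK & XFS_ILOCK_DEP_MASK) == 0);

        /* a subclass packed into the mmaplock field extracts intact */
        unsigned int flags = 5u << XFS_MMAPLOCK_SHIFT;
        assert(XFS_MMAPLOCK_DEP(flags) == 5);
        return 0;
    }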
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -643,6 +643,9 @@ xfs_ioc_space(
 	if (error)
 		goto out_unlock;
 
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	iolock |= XFS_MMAPLOCK_EXCL;
+
 	switch (bf->l_whence) {
 	case 0: /*SEEK_SET*/
 		break;
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -771,6 +771,7 @@ xfs_setattr_size(
 		return error;
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
 	ASSERT(S_ISREG(ip->i_d.di_mode));
 	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 		ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -834,55 +835,27 @@ xfs_setattr_size(
 	inode_dio_wait(inode);
 
 	/*
-	 * Do all the page cache truncate work outside the transaction context
-	 * as the "lock" order is page lock->log space reservation.  i.e.
-	 * locking pages inside the transaction can ABBA deadlock with
-	 * writeback. We have to do the VFS inode size update before we truncate
-	 * the pagecache, however, to avoid racing with page faults beyond the
-	 * new EOF they are not serialised against truncate operations except by
-	 * page locks and size updates.
+	 * We've already locked out new page faults, so now we can safely remove
+	 * pages from the page cache knowing they won't get refaulted until we
+	 * drop the XFS_MMAP_EXCL lock after the extent manipulations are
+	 * complete. The truncate_setsize() call also cleans partial EOF page
+	 * PTEs on extending truncates and hence ensures sub-page block size
+	 * filesystems are correctly handled, too.
 	 *
-	 * Hence we are in a situation where a truncate can fail with ENOMEM
-	 * from xfs_trans_reserve(), but having already truncated the in-memory
-	 * version of the file (i.e. made user visible changes). There's not
-	 * much we can do about this, except to hope that the caller sees ENOMEM
-	 * and retries the truncate operation.
+	 * We have to do all the page cache truncate work outside the
+	 * transaction context as the "lock" order is page lock->log space
+	 * reservation as defined by extent allocation in the writeback path.
+	 * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+	 * having already truncated the in-memory version of the file (i.e. made
+	 * user visible changes). There's not much we can do about this, except
+	 * to hope that the caller sees ENOMEM and retries the truncate
+	 * operation.
 	 */
 	error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
 	if (error)
 		return error;
 	truncate_setsize(inode, newsize);
 
-	/*
-	 * The "we can't serialise against page faults" pain gets worse.
-	 *
-	 * If the file is mapped then we have to clean the page at the old EOF
-	 * when extending the file. Extending the file can expose changes the
-	 * underlying page mapping (e.g. from beyond EOF to a hole or
-	 * unwritten), and so on the next attempt to write to that page we need
-	 * to remap it for write. i.e. we need .page_mkwrite() to be called.
-	 * Hence we need to clean the page to clean the pte and so a new write
-	 * fault will be triggered appropriately.
-	 *
-	 * If we do it before we change the inode size, then we can race with a
-	 * page fault that maps the page with exactly the same problem. If we do
-	 * it after we change the file size, then a new page fault can come in
-	 * and allocate space before we've run the rest of the truncate
-	 * transaction. That's kinda grotesque, but it's better than have data
-	 * over a hole, and so that's the lesser evil that has been chosen here.
-	 *
-	 * The real solution, however, is to have some mechanism for locking out
-	 * page faults while a truncate is in progress.
-	 */
-	if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
-		error = filemap_write_and_wait_range(
-				VFS_I(ip)->i_mapping,
-				round_down(oldsize, PAGE_CACHE_SIZE),
-				round_up(oldsize, PAGE_CACHE_SIZE) - 1);
-		if (error)
-			return error;
-	}
-
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 	if (error)
@@ -981,8 +954,12 @@ xfs_vn_setattr(
 
 		xfs_ilock(ip, iolock);
 		error = xfs_break_layouts(dentry->d_inode, &iolock);
-		if (!error)
+		if (!error) {
+			xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+			iolock |= XFS_MMAPLOCK_EXCL;
+
 			error = xfs_setattr_size(ip, iattr);
+		}
 		xfs_iunlock(ip, iolock);
 	} else {
 		error = xfs_setattr_nonsize(ip, iattr, 0);
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -966,6 +966,8 @@ xfs_fs_inode_init_once(
 	atomic_set(&ip->i_pincount, 0);
 	spin_lock_init(&ip->i_flags_lock);
 
+	mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+		     "xfsino", ip->i_ino);
 	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
 		     "xfsino", ip->i_ino);
 }
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -685,6 +685,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
 
+DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+
 DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
 	TP_ARGS(ip, caller_ip),