ceph: Fix i_size update race
ceph_aio_write() has an optimization that marks cap CEPH_CAP_FILE_WR
dirty before data is copied to page cache and inode size is updated.
If ceph_check_caps() flushes the dirty cap before the inode size is
updated, the MDS can miss the new inode size. The fix is to move
ceph_{get,put}_cap_refs() into ceph_write_{begin,end}() and call
__ceph_mark_dirty_caps() after inode size is updated.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
	
	
This commit is contained in:
		
					parent
					
						
							
								4d1d0534f5
							
						
					
				
			
			
				commit
				
					
						22cddde104
					
				
			
		
					 2 changed files with 78 additions and 48 deletions
				
			
		|  | @ -1078,23 +1078,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, | ||||||
| 			    struct page **pagep, void **fsdata) | 			    struct page **pagep, void **fsdata) | ||||||
| { | { | ||||||
| 	struct inode *inode = file->f_dentry->d_inode; | 	struct inode *inode = file->f_dentry->d_inode; | ||||||
|  | 	struct ceph_inode_info *ci = ceph_inode(inode); | ||||||
|  | 	struct ceph_file_info *fi = file->private_data; | ||||||
| 	struct page *page; | 	struct page *page; | ||||||
| 	pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 	pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||||||
| 	int r; | 	int r, want, got = 0; | ||||||
|  | 
 | ||||||
|  | 	if (fi->fmode & CEPH_FILE_MODE_LAZY) | ||||||
|  | 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | ||||||
|  | 	else | ||||||
|  | 		want = CEPH_CAP_FILE_BUFFER; | ||||||
|  | 
 | ||||||
|  | 	dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", | ||||||
|  | 	     inode, ceph_vinop(inode), pos, len, inode->i_size); | ||||||
|  | 	r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); | ||||||
|  | 	if (r < 0) | ||||||
|  | 		return r; | ||||||
|  | 	dout("write_begin %p %llx.%llx %llu~%u  got cap refs on %s\n", | ||||||
|  | 	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); | ||||||
|  | 	if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { | ||||||
|  | 		ceph_put_cap_refs(ci, got); | ||||||
|  | 		return -EAGAIN; | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	do { | 	do { | ||||||
| 		/* get a page */ | 		/* get a page */ | ||||||
| 		page = grab_cache_page_write_begin(mapping, index, 0); | 		page = grab_cache_page_write_begin(mapping, index, 0); | ||||||
| 		if (!page) | 		if (!page) { | ||||||
| 			return -ENOMEM; | 			r = -ENOMEM; | ||||||
| 		*pagep = page; | 			break; | ||||||
|  | 		} | ||||||
| 
 | 
 | ||||||
| 		dout("write_begin file %p inode %p page %p %d~%d\n", file, | 		dout("write_begin file %p inode %p page %p %d~%d\n", file, | ||||||
| 		     inode, page, (int)pos, (int)len); | 		     inode, page, (int)pos, (int)len); | ||||||
| 
 | 
 | ||||||
| 		r = ceph_update_writeable_page(file, pos, len, page); | 		r = ceph_update_writeable_page(file, pos, len, page); | ||||||
|  | 		if (r) | ||||||
|  | 			page_cache_release(page); | ||||||
| 	} while (r == -EAGAIN); | 	} while (r == -EAGAIN); | ||||||
| 
 | 
 | ||||||
|  | 	if (r) { | ||||||
|  | 		ceph_put_cap_refs(ci, got); | ||||||
|  | 	} else { | ||||||
|  | 		*pagep = page; | ||||||
|  | 		*(int *)fsdata = got; | ||||||
|  | 	} | ||||||
| 	return r; | 	return r; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -1108,10 +1136,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, | ||||||
| 			  struct page *page, void *fsdata) | 			  struct page *page, void *fsdata) | ||||||
| { | { | ||||||
| 	struct inode *inode = file->f_dentry->d_inode; | 	struct inode *inode = file->f_dentry->d_inode; | ||||||
|  | 	struct ceph_inode_info *ci = ceph_inode(inode); | ||||||
| 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | ||||||
| 	struct ceph_mds_client *mdsc = fsc->mdsc; | 	struct ceph_mds_client *mdsc = fsc->mdsc; | ||||||
| 	unsigned from = pos & (PAGE_CACHE_SIZE - 1); | 	unsigned from = pos & (PAGE_CACHE_SIZE - 1); | ||||||
| 	int check_cap = 0; | 	int check_cap = 0; | ||||||
|  | 	int got = (unsigned long)fsdata; | ||||||
| 
 | 
 | ||||||
| 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, | 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, | ||||||
| 	     inode, page, (int)pos, (int)copied, (int)len); | 	     inode, page, (int)pos, (int)copied, (int)len); | ||||||
|  | @ -1134,6 +1164,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, | ||||||
| 	up_read(&mdsc->snap_rwsem); | 	up_read(&mdsc->snap_rwsem); | ||||||
| 	page_cache_release(page); | 	page_cache_release(page); | ||||||
| 
 | 
 | ||||||
|  | 	if (copied > 0) { | ||||||
|  | 		int dirty; | ||||||
|  | 		spin_lock(&ci->i_ceph_lock); | ||||||
|  | 		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | ||||||
|  | 		spin_unlock(&ci->i_ceph_lock); | ||||||
|  | 		if (dirty) | ||||||
|  | 			__mark_inode_dirty(inode, dirty); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	dout("write_end %p %llx.%llx %llu~%u  dropping cap refs on %s\n", | ||||||
|  | 	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); | ||||||
|  | 	ceph_put_cap_refs(ci, got); | ||||||
|  | 
 | ||||||
| 	if (check_cap) | 	if (check_cap) | ||||||
| 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); | 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, | ||||||
| 	struct ceph_osd_client *osdc = | 	struct ceph_osd_client *osdc = | ||||||
| 		&ceph_sb_to_client(inode->i_sb)->client->osdc; | 		&ceph_sb_to_client(inode->i_sb)->client->osdc; | ||||||
| 	loff_t endoff = pos + iov->iov_len; | 	loff_t endoff = pos + iov->iov_len; | ||||||
| 	int want, got = 0; | 	int got = 0; | ||||||
| 	int ret, err; | 	int ret, err, written; | ||||||
| 
 | 
 | ||||||
| 	if (ceph_snap(inode) != CEPH_NOSNAP) | 	if (ceph_snap(inode) != CEPH_NOSNAP) | ||||||
| 		return -EROFS; | 		return -EROFS; | ||||||
| 
 | 
 | ||||||
| retry_snap: | retry_snap: | ||||||
|  | 	written = 0; | ||||||
| 	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) | 	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) | ||||||
| 		return -ENOSPC; | 		return -ENOSPC; | ||||||
| 	__ceph_do_pending_vmtruncate(inode); | 	__ceph_do_pending_vmtruncate(inode); | ||||||
| 	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", |  | ||||||
| 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, |  | ||||||
| 	     inode->i_size); |  | ||||||
| 	if (fi->fmode & CEPH_FILE_MODE_LAZY) |  | ||||||
| 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; |  | ||||||
| 	else |  | ||||||
| 		want = CEPH_CAP_FILE_BUFFER; |  | ||||||
| 	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); |  | ||||||
| 	if (ret < 0) |  | ||||||
| 		goto out_put; |  | ||||||
| 
 |  | ||||||
| 	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n", |  | ||||||
| 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, |  | ||||||
| 	     ceph_cap_string(got)); |  | ||||||
| 
 |  | ||||||
| 	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || |  | ||||||
| 	    (iocb->ki_filp->f_flags & O_DIRECT) || |  | ||||||
| 	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) || |  | ||||||
| 	    (fi->flags & CEPH_F_SYNC)) { |  | ||||||
| 		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, |  | ||||||
| 			&iocb->ki_pos); |  | ||||||
| 	} else { |  | ||||||
| 		/*
 |  | ||||||
| 		 * buffered write; drop Fw early to avoid slow |  | ||||||
| 		 * revocation if we get stuck on balance_dirty_pages |  | ||||||
| 		 */ |  | ||||||
| 		int dirty; |  | ||||||
| 
 |  | ||||||
| 		spin_lock(&ci->i_ceph_lock); |  | ||||||
| 		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); |  | ||||||
| 		spin_unlock(&ci->i_ceph_lock); |  | ||||||
| 		ceph_put_cap_refs(ci, got); |  | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * try to do a buffered write.  if we don't have sufficient | ||||||
|  | 	 * caps, we'll get -EAGAIN from generic_file_aio_write, or a | ||||||
|  | 	 * short write if we only get caps for some pages. | ||||||
|  | 	 */ | ||||||
|  | 	if (!(iocb->ki_filp->f_flags & O_DIRECT) && | ||||||
|  | 	    !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && | ||||||
|  | 	    !(fi->flags & CEPH_F_SYNC)) { | ||||||
| 		ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | 		ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | ||||||
|  | 		if (ret >= 0) | ||||||
|  | 			written = ret; | ||||||
|  | 
 | ||||||
| 		if ((ret >= 0 || ret == -EIOCBQUEUED) && | 		if ((ret >= 0 || ret == -EIOCBQUEUED) && | ||||||
| 		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) | 		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) | ||||||
| 		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { | 		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { | ||||||
| 			err = vfs_fsync_range(file, pos, pos + ret - 1, 1); | 			err = vfs_fsync_range(file, pos, pos + written - 1, 1); | ||||||
| 			if (err < 0) | 			if (err < 0) | ||||||
| 				ret = err; | 				ret = err; | ||||||
| 		} | 		} | ||||||
| 
 | 		if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) | ||||||
| 		if (dirty) | 			goto out; | ||||||
| 			__mark_inode_dirty(inode, dirty); |  | ||||||
| 		goto out; |  | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", | ||||||
|  | 	     inode, ceph_vinop(inode), pos + written, | ||||||
|  | 	     (unsigned)iov->iov_len - written, inode->i_size); | ||||||
|  | 	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); | ||||||
|  | 	if (ret < 0) | ||||||
|  | 		goto out; | ||||||
|  | 
 | ||||||
|  | 	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n", | ||||||
|  | 	     inode, ceph_vinop(inode), pos + written, | ||||||
|  | 	     (unsigned)iov->iov_len - written, ceph_cap_string(got)); | ||||||
|  | 	ret = ceph_sync_write(file, iov->iov_base + written, | ||||||
|  | 			      iov->iov_len - written, &iocb->ki_pos); | ||||||
| 	if (ret >= 0) { | 	if (ret >= 0) { | ||||||
| 		int dirty; | 		int dirty; | ||||||
| 		spin_lock(&ci->i_ceph_lock); | 		spin_lock(&ci->i_ceph_lock); | ||||||
|  | @ -777,13 +767,10 @@ retry_snap: | ||||||
| 		if (dirty) | 		if (dirty) | ||||||
| 			__mark_inode_dirty(inode, dirty); | 			__mark_inode_dirty(inode, dirty); | ||||||
| 	} | 	} | ||||||
| 
 |  | ||||||
| out_put: |  | ||||||
| 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n", | 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n", | ||||||
| 	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, | 	     inode, ceph_vinop(inode), pos + written, | ||||||
| 	     ceph_cap_string(got)); | 	     (unsigned)iov->iov_len - written, ceph_cap_string(got)); | ||||||
| 	ceph_put_cap_refs(ci, got); | 	ceph_put_cap_refs(ci, got); | ||||||
| 
 |  | ||||||
| out: | out: | ||||||
| 	if (ret == -EOLDSNAPC) { | 	if (ret == -EOLDSNAPC) { | ||||||
| 		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", | 		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Sage Weil
				Sage Weil