 b659ef0277
			
		
	
	
	b659ef0277
	
	
	
		
			
			Commit 3a8b36f378 ("Btrfs: fix data loss in the fast fsync path") added a performance regression that causes an unnecessary sync of the log trees (fs/subvol and root log trees) when 2 consecutive fsyncs are done against a file, without any writes or any metadata updates to the inode in between them, and if a transaction is committed before the second fsync is called.

			Huang Ying reported this to lkml (https://lkml.org/lkml/2015/3/18/99) after a sysbench test that measured a -62% decrease of file io requests per second for that test's workload.

			The test is:

			  echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
			  echo performance > /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor
			  echo performance > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor
			  echo performance > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor
			  mkfs -t btrfs /dev/sda2
			  mount -t btrfs /dev/sda2 /fs/sda2
			  cd /fs/sda2
			  for ((i = 0; i < 1024; i++)); do fallocate -l 67108864 testfile.$i; done
			  sysbench --test=fileio --max-requests=0 --num-threads=4 --max-time=600 \
			    --file-test-mode=rndwr --file-total-size=68719476736 --file-io-mode=sync \
			    --file-num=1024 run

			A test on a kvm guest, running a debug kernel, gave me the following results:

			  Without 3a8b36f378: 16.01 reqs/sec
			  With 3a8b36f378: 3.39 reqs/sec
			  With 3a8b36f378 and this patch: 16.04 reqs/sec

			Reported-by: Huang Ying <ying.huang@intel.com>
			Tested-by: Huang, Ying <ying.huang@intel.com>
			Signed-off-by: Filipe Manana <fdmanana@suse.com>
			Signed-off-by: Chris Mason <clm@fb.com>
		
			
				
	
	
		
			212 lines
		
	
	
	
		
			6.8 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			212 lines
		
	
	
	
		
			6.8 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * Copyright (C) 2007 Oracle.  All rights reserved.
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU General Public
 | |
|  * License v2 as published by the Free Software Foundation.
 | |
|  *
 | |
|  * This program is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  * General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU General Public
 | |
|  * License along with this program; if not, write to the
 | |
|  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 | |
|  * Boston, MA 02111-1307, USA.
 | |
|  */
 | |
| 
 | |
| #ifndef __BTRFS_ORDERED_DATA__
 | |
| #define __BTRFS_ORDERED_DATA__
 | |
| 
 | |
| /* one of these per inode */
 | |
| struct btrfs_ordered_inode_tree {
 | |
| 	spinlock_t lock;
 | |
| 	struct rb_root tree;
 | |
| 	struct rb_node *last;
 | |
| };
 | |
| 
 | |
| struct btrfs_ordered_sum {
 | |
| 	/* bytenr is the start of this extent on disk */
 | |
| 	u64 bytenr;
 | |
| 
 | |
| 	/*
 | |
| 	 * this is the length in bytes covered by the sums array below.
 | |
| 	 */
 | |
| 	int len;
 | |
| 	struct list_head list;
 | |
| 	/* last field is a variable length array of csums */
 | |
| 	u32 sums[];
 | |
| };
 | |
| 
 | |
/*
 * Bits for the flags field of struct btrfs_ordered_extent.
 *
 * BTRFS_ORDERED_IO_DONE
 *	Set once all of the blocks are written.  Used to make sure
 *	metadata is inserted into the tree only once per extent.
 *
 * BTRFS_ORDERED_COMPLETE
 *	Set when the extent is removed from the rbtree, just before any
 *	waiters are woken.  Indicates that the IO is done and any metadata
 *	is inserted into the tree.
 */

/* set when all the pages are written */
#define BTRFS_ORDERED_IO_DONE 0

/* set when removed from the tree */
#define BTRFS_ORDERED_COMPLETE 1

/* set when we want to write in place */
#define BTRFS_ORDERED_NOCOW 2

/* writing a zlib compressed extent */
#define BTRFS_ORDERED_COMPRESSED 3

/* set when writing to prealloced extent */
#define BTRFS_ORDERED_PREALLOC 4

/* set when we're doing DIO with this extent */
#define BTRFS_ORDERED_DIRECT 5

/* we had an io error when writing this out */
#define BTRFS_ORDERED_IOERR 6

/*
 * indicates whether this ordered extent has done its due diligence in
 * updating the isize
 */
#define BTRFS_ORDERED_UPDATED_ISIZE 7

/* we've logged the csums on this ordered extent */
#define BTRFS_ORDERED_LOGGED_CSUM 8

/* set when we have to truncate an extent */
#define BTRFS_ORDERED_TRUNCATED 9

/* set when we've waited on this ordered extent in the logging code */
#define BTRFS_ORDERED_LOGGED 10
 | |
| struct btrfs_ordered_extent {
 | |
| 	/* logical offset in the file */
 | |
| 	u64 file_offset;
 | |
| 
 | |
| 	/* disk byte number */
 | |
| 	u64 start;
 | |
| 
 | |
| 	/* ram length of the extent in bytes */
 | |
| 	u64 len;
 | |
| 
 | |
| 	/* extent length on disk */
 | |
| 	u64 disk_len;
 | |
| 
 | |
| 	/* number of bytes that still need writing */
 | |
| 	u64 bytes_left;
 | |
| 
 | |
| 	/*
 | |
| 	 * the end of the ordered extent which is behind it but
 | |
| 	 * didn't update disk_i_size. Please see the comment of
 | |
| 	 * btrfs_ordered_update_i_size();
 | |
| 	 */
 | |
| 	u64 outstanding_isize;
 | |
| 
 | |
| 	/*
 | |
| 	 * If we get truncated we need to adjust the file extent we enter for
 | |
| 	 * this ordered extent so that we do not expose stale data.
 | |
| 	 */
 | |
| 	u64 truncated_len;
 | |
| 
 | |
| 	/* flags (described above) */
 | |
| 	unsigned long flags;
 | |
| 
 | |
| 	/* compression algorithm */
 | |
| 	int compress_type;
 | |
| 
 | |
| 	/* reference count */
 | |
| 	atomic_t refs;
 | |
| 
 | |
| 	/* the inode we belong to */
 | |
| 	struct inode *inode;
 | |
| 
 | |
| 	/* list of checksums for insertion when the extent io is done */
 | |
| 	struct list_head list;
 | |
| 
 | |
| 	/* If we need to wait on this to be done */
 | |
| 	struct list_head log_list;
 | |
| 
 | |
| 	/* If the transaction needs to wait on this ordered extent */
 | |
| 	struct list_head trans_list;
 | |
| 
 | |
| 	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
 | |
| 	wait_queue_head_t wait;
 | |
| 
 | |
| 	/* our friendly rbtree entry */
 | |
| 	struct rb_node rb_node;
 | |
| 
 | |
| 	/* a per root list of all the pending ordered extents */
 | |
| 	struct list_head root_extent_list;
 | |
| 
 | |
| 	struct btrfs_work work;
 | |
| 
 | |
| 	struct completion completion;
 | |
| 	struct btrfs_work flush_work;
 | |
| 	struct list_head work_list;
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * calculates the total size you need to allocate for an ordered sum
 | |
|  * structure spanning 'bytes' in the file
 | |
|  */
 | |
| static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
 | |
| 					 unsigned long bytes)
 | |
| {
 | |
| 	int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
 | |
| 	return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
 | |
| }
 | |
| 
 | |
| static inline void
 | |
| btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 | |
| {
 | |
| 	spin_lock_init(&t->lock);
 | |
| 	t->tree = RB_ROOT;
 | |
| 	t->last = NULL;
 | |
| }
 | |
| 
 | |
| void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
 | |
| void btrfs_remove_ordered_extent(struct inode *inode,
 | |
| 				struct btrfs_ordered_extent *entry);
 | |
| int btrfs_dec_test_ordered_pending(struct inode *inode,
 | |
| 				   struct btrfs_ordered_extent **cached,
 | |
| 				   u64 file_offset, u64 io_size, int uptodate);
 | |
| int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 | |
| 				   struct btrfs_ordered_extent **cached,
 | |
| 				   u64 *file_offset, u64 io_size,
 | |
| 				   int uptodate);
 | |
| int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 | |
| 			     u64 start, u64 len, u64 disk_len, int type);
 | |
| int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 | |
| 				 u64 start, u64 len, u64 disk_len, int type);
 | |
| int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
 | |
| 				      u64 start, u64 len, u64 disk_len,
 | |
| 				      int type, int compress_type);
 | |
| void btrfs_add_ordered_sum(struct inode *inode,
 | |
| 			   struct btrfs_ordered_extent *entry,
 | |
| 			   struct btrfs_ordered_sum *sum);
 | |
| struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 | |
| 							 u64 file_offset);
 | |
| void btrfs_start_ordered_extent(struct inode *inode,
 | |
| 				struct btrfs_ordered_extent *entry, int wait);
 | |
| int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 | |
| struct btrfs_ordered_extent *
 | |
| btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 | |
| struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 | |
| 							u64 file_offset,
 | |
| 							u64 len);
 | |
| bool btrfs_have_ordered_extents_in_range(struct inode *inode,
 | |
| 					 u64 file_offset,
 | |
| 					 u64 len);
 | |
| int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 | |
| 				struct btrfs_ordered_extent *ordered);
 | |
| int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 | |
| 			   u32 *sum, int len);
 | |
| int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
 | |
| void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
 | |
| void btrfs_get_logged_extents(struct inode *inode,
 | |
| 			      struct list_head *logged_list,
 | |
| 			      const loff_t start,
 | |
| 			      const loff_t end);
 | |
| void btrfs_put_logged_extents(struct list_head *logged_list);
 | |
| void btrfs_submit_logged_extents(struct list_head *logged_list,
 | |
| 				 struct btrfs_root *log);
 | |
| void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 | |
| 			       struct btrfs_root *log, u64 transid);
 | |
| void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 | |
| int __init ordered_data_init(void);
 | |
| void ordered_data_exit(void);
 | |
| #endif
 |