We (Linux Kernel Performance project) found a regression introduced
by commit:
  f7fec032aa ext4: track all extent status in extent status tree
The commit causes about 20% performance decrease in fio random write
test. Profiler shows that rb_next() uses a lot of CPU time. The call
stack is:
  rb_next
  ext4_es_find_delayed_extent
  ext4_map_blocks
  _ext4_get_block
  ext4_get_block_write
  __blockdev_direct_IO
  ext4_direct_IO
  generic_file_direct_write
  __generic_file_aio_write
  ext4_file_write
  aio_rw_vect_retry
  aio_run_iocb
  do_io_submit
  sys_io_submit
  system_call_fastpath
  io_submit
  td_io_getevents
  io_u_queued_complete
  thread_main
  main
  __libc_start_main
The cause is that ext4_es_find_delayed_extent() doesn't have an
upper bound; it keeps searching until a delayed extent is found.
When there are a lot of non-delayed entries in the extent status
tree, ext4_es_find_delayed_extent() may use a lot of CPU time.
Reported-by: LKP project <lkp@linux.intel.com>
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
		
	
			
		
			
				
	
	
		
			127 lines
		
	
	
	
		
			3.4 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			127 lines
		
	
	
	
		
			3.4 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 *  fs/ext4/extents_status.h
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 */
 | 
						|
 | 
						|
/* Include guard; closed by the #endif at the end of this header. */
#ifndef _EXT4_EXTENTS_STATUS_H
#define _EXT4_EXTENTS_STATUS_H
 | 
						|
 | 
						|
/*
 * Turn on ES_DEBUG__ to get lots of info about extent status operations.
 *
 * es_debug(fmt, ...) forwards its format string and arguments to printk()
 * when ES_DEBUG__ is defined, and to no_printk() otherwise.
 */
#ifdef ES_DEBUG__
#define es_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
#else
#define es_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
#endif
 | 
						|
 | 
						|
/*
 * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
 * checked with old map_block's result.
 *
 * NOTE(review): the macro defined below is ES_AGGRESSIVE_TEST__ (two
 * trailing underscores), not ES_AGGRESSIVE_TEST.  Presumably this leaves
 * the check switched off by default -- confirm against the users of the
 * macro, which are not visible in this header.
 */
#define ES_AGGRESSIVE_TEST__
 | 
						|
 | 
						|
/*
 * These flags live in the high bits of extent_status.es_pblk
 *
 * Each status occupies one bit, from bit 63 (WRITTEN) down to bit 60
 * (HOLE).  EXTENT_STATUS_FLAGS is the union of all four bits and is used
 * as the mask that separates the status from the block number.
 */
#define EXTENT_STATUS_WRITTEN	(1ULL << 63)
#define EXTENT_STATUS_UNWRITTEN (1ULL << 62)
#define EXTENT_STATUS_DELAYED	(1ULL << 61)
#define EXTENT_STATUS_HOLE	(1ULL << 60)

#define EXTENT_STATUS_FLAGS	(EXTENT_STATUS_WRITTEN | \
				 EXTENT_STATUS_UNWRITTEN | \
				 EXTENT_STATUS_DELAYED | \
				 EXTENT_STATUS_HOLE)
 | 
						|
 | 
						|
struct ext4_extent;
 | 
						|
 | 
						|
struct extent_status {
 | 
						|
	struct rb_node rb_node;
 | 
						|
	ext4_lblk_t es_lblk;	/* first logical block extent covers */
 | 
						|
	ext4_lblk_t es_len;	/* length of extent in block */
 | 
						|
	ext4_fsblk_t es_pblk;	/* first physical block */
 | 
						|
};
 | 
						|
 | 
						|
struct ext4_es_tree {
 | 
						|
	struct rb_root root;
 | 
						|
	struct extent_status *cache_es;	/* recently accessed extent */
 | 
						|
};
 | 
						|
 | 
						|
/*
 * Set-up and tear-down entry points; the definitions live outside this
 * header.
 * NOTE(review): ext4_init_es()/ext4_exit_es() are presumably the
 * subsystem-wide init/exit pair and ext4_es_init_tree() the per-tree
 * initializer -- confirm against the definitions.
 */
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
 | 
						|
 | 
						|
/*
 * Per-inode extent status operations; the definitions live outside this
 * header.  lblk/len describe a range of logical blocks, pblk the first
 * physical block, and status takes EXTENT_STATUS_* bits.
 *
 * NOTE(review): ext4_es_find_delayed_extent_range() takes an explicit
 * 'end' block and returns its result through 'es'; presumably the search
 * is bounded to [lblk, end] -- confirm against the definition.
 */
extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
				 ext4_lblk_t len, ext4_fsblk_t pblk,
				 unsigned long long status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
				 ext4_lblk_t len);
extern void ext4_es_find_delayed_extent_range(struct inode *inode,
					ext4_lblk_t lblk, ext4_lblk_t end,
					struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
				 struct extent_status *es);
extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);
 | 
						|
 | 
						|
static inline int ext4_es_is_written(struct extent_status *es)
 | 
						|
{
 | 
						|
	return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0;
 | 
						|
}
 | 
						|
 | 
						|
static inline int ext4_es_is_unwritten(struct extent_status *es)
 | 
						|
{
 | 
						|
	return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0;
 | 
						|
}
 | 
						|
 | 
						|
static inline int ext4_es_is_delayed(struct extent_status *es)
 | 
						|
{
 | 
						|
	return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0;
 | 
						|
}
 | 
						|
 | 
						|
static inline int ext4_es_is_hole(struct extent_status *es)
 | 
						|
{
 | 
						|
	return (es->es_pblk & EXTENT_STATUS_HOLE) != 0;
 | 
						|
}
 | 
						|
 | 
						|
static inline ext4_fsblk_t ext4_es_status(struct extent_status *es)
 | 
						|
{
 | 
						|
	return (es->es_pblk & EXTENT_STATUS_FLAGS);
 | 
						|
}
 | 
						|
 | 
						|
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
 | 
						|
{
 | 
						|
	return (es->es_pblk & ~EXTENT_STATUS_FLAGS);
 | 
						|
}
 | 
						|
 | 
						|
static inline void ext4_es_store_pblock(struct extent_status *es,
 | 
						|
					ext4_fsblk_t pb)
 | 
						|
{
 | 
						|
	ext4_fsblk_t block;
 | 
						|
 | 
						|
	block = (pb & ~EXTENT_STATUS_FLAGS) |
 | 
						|
		(es->es_pblk & EXTENT_STATUS_FLAGS);
 | 
						|
	es->es_pblk = block;
 | 
						|
}
 | 
						|
 | 
						|
static inline void ext4_es_store_status(struct extent_status *es,
 | 
						|
					unsigned long long status)
 | 
						|
{
 | 
						|
	ext4_fsblk_t block;
 | 
						|
 | 
						|
	block = (status & EXTENT_STATUS_FLAGS) |
 | 
						|
		(es->es_pblk & ~EXTENT_STATUS_FLAGS);
 | 
						|
	es->es_pblk = block;
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Shrinker registration (per super block) and LRU membership (per inode)
 * entry points; the definitions live outside this header.
 */
extern void ext4_es_register_shrinker(struct super_block *sb);
extern void ext4_es_unregister_shrinker(struct super_block *sb);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);
 | 
						|
 | 
						|
#endif /* _EXT4_EXTENTS_STATUS_H */
 |