 3942c07ccf
			
		
	
	
	3942c07ccf
	
	
	
		
			
			This series reworks our current object cache shrinking infrastructure in two main ways: * Noticing that a lot of users copy and paste their own version of LRU lists for objects, we put some effort in providing a generic version. It is modeled after the filesystem users: dentries, inodes, and xfs (for various tasks), but we expect that other users could benefit in the near future with little or no modification. Let us know if you have any issues. * The underlying list_lru being proposed automatically and transparently keeps the elements in per-node lists, and is able to manipulate the node lists individually. Given this infrastructure, we are able to modify the up-to-now hammer called shrink_slab to proceed with node-reclaim instead of always searching memory from all over like it has been doing. Per-node lru lists are also expected to lead to less contention in the lru locks on multi-node scans, since we are now no longer fighting for a global lock. The locks usually disappear from the profilers with this change. Although we have no official benchmarks for this version - be our guest to independently evaluate this - earlier versions of this series were performance tested (details at http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no visible performance regressions while yielding a better qualitative behavior in NUMA machines. With this infrastructure in place, we can use the list_lru entry point to provide memcg isolation and per-memcg targeted reclaim. Historically, those two pieces of work have been posted together. This version presents only the infrastructure work, deferring the memcg work for a later time, so we can focus on getting this part tested. 
You can see more about the history of such work at http://lwn.net/Articles/552769/ Dave Chinner (18): dcache: convert dentry_stat.nr_unused to per-cpu counters dentry: move to per-sb LRU locks dcache: remove dentries from LRU before putting on dispose list mm: new shrinker API shrinker: convert superblock shrinkers to new API list: add a new LRU list type inode: convert inode lru list to generic lru list code. dcache: convert to use new lru list infrastructure list_lru: per-node list infrastructure shrinker: add node awareness fs: convert inode and dentry shrinking to be node aware xfs: convert buftarg LRU to generic code xfs: rework buffer dispose list tracking xfs: convert dquot cache lru to list_lru fs: convert fs shrinkers to new scan/count API drivers: convert shrinkers to new count/scan API shrinker: convert remaining shrinkers to count/scan API shrinker: Kill old ->shrink API. Glauber Costa (7): fs: bump inode and dentry counters to long super: fix calculation of shrinkable objects for small numbers list_lru: per-node API vmscan: per-node deferred work i915: bail out earlier when shrinker cannot acquire mutex hugepage: convert huge zero page shrinker to new shrinker API list_lru: dynamically adjust node arrays This patch: There are situations in very large machines in which we can have a large quantity of dirty inodes, unused dentries, etc. This is particularly true when unmounting a filesystem, since every live object will eventually be discarded. Dave Chinner reported a problem with this while experimenting with the shrinker revamp patchset. So we believe it is time for a change. This patch just converts the int counters to longs. Machines where it matters should have a big long anyway. 
Signed-off-by: Glauber Costa <glommer@openvz.org> Cc: Dave Chinner <dchinner@redhat.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> Cc: Arve Hjønnevåg <arve@android.com> Cc: Carlos Maiolino <cmaiolino@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Chuck Lever <chuck.lever@oracle.com> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Dave Chinner <dchinner@redhat.com> Cc: David Rientjes <rientjes@google.com> Cc: Gleb Natapov <gleb@redhat.com> Cc: Greg Thelen <gthelen@google.com> Cc: J. Bruce Fields <bfields@redhat.com> Cc: Jan Kara <jack@suse.cz> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Stultz <john.stultz@linaro.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Kent Overstreet <koverstreet@google.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Thomas Hellstrom <thellstrom@vmware.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
		
			
				
	
	
		
			204 lines
		
	
	
	
		
			7.9 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			204 lines
		
	
	
	
		
			7.9 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| #ifndef _UAPI_LINUX_FS_H
 | |
| #define _UAPI_LINUX_FS_H
 | |
| 
 | |
| /*
 | |
|  * This file has definitions for some important file table
 | |
|  * structures etc.
 | |
|  */
 | |
| 
 | |
| #include <linux/limits.h>
 | |
| #include <linux/ioctl.h>
 | |
| #include <linux/types.h>
 | |
| 
 | |
| /*
 | |
|  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
 | |
|  * the file limit at runtime and only root can increase the per-process
 | |
|  * nr_file rlimit, so it's safe to set up a ridiculously high absolute
 | |
|  * upper limit on files-per-process.
 | |
|  *
 | |
|  * Some programs (notably those using select()) may have to be
 | |
|  * recompiled to take full advantage of the new limits.
 | |
|  */
 | |
| 
 | |
| /* Fixed constants first: */
 | |
| /* Drop any NR_OPEN definition that may already be in effect (e.g. from an earlier include). */
 | |
| #undef NR_OPEN
 | |
| #define INR_OPEN_CUR 1024	/* Initial setting for nfile rlimits */
 | |
| #define INR_OPEN_MAX 4096	/* Hard limit for nfile rlimits */
 | |
| 
 | |
| /* BLOCK_SIZE is 2^BLOCK_SIZE_BITS, i.e. 1024 with the value below. */
 | |
| #define BLOCK_SIZE_BITS 10
 | |
| #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
 | |
| 
 | |
| /* Seek origin selectors; SEEK_MAX aliases the highest value defined here. */
 | |
| #define SEEK_SET	0	/* seek relative to beginning of file */
 | |
| #define SEEK_CUR	1	/* seek relative to current file position */
 | |
| #define SEEK_END	2	/* seek relative to end of file */
 | |
| #define SEEK_DATA	3	/* seek to the next data */
 | |
| #define SEEK_HOLE	4	/* seek to the next hole */
 | |
| #define SEEK_MAX	SEEK_HOLE
 | |
| 
 | |
| /*
 | |
|  * Argument structure of the FITRIM ioctl defined later in this header.
 | |
|  * NOTE(review): the field meanings below are inferred from the names only;
 | |
|  * confirm them against the FITRIM documentation.
 | |
|  */
 | |
| struct fstrim_range {
 | |
| 	__u64 start;	/* presumably where the range to trim begins */
 | |
| 	__u64 len;	/* presumably the length of the range to trim */
 | |
| 	__u64 minlen;	/* presumably the smallest extent worth trimming */
 | |
| };
 | |
| 
 | |
| /* And dynamically-tunable limits and defaults: */
 | |
| /*
 | |
|  * File-table statistics: two read-only counters and one tunable limit.
 | |
|  * NOTE(review): presumably nr_files counts allocated file structures,
 | |
|  * nr_free_files the unused ones and max_files the system-wide cap;
 | |
|  * confirm against the code that fills this structure in.
 | |
|  */
 | |
| struct files_stat_struct {
 | |
| 	unsigned long nr_files;		/* read only */
 | |
| 	unsigned long nr_free_files;	/* read only */
 | |
| 	unsigned long max_files;		/* tunable */
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * Inode statistics, laid out as seven longs in total (two counters plus
 | |
|  * five words of padding).  The counters are long so that very large
 | |
|  * object counts fit on big machines.
 | |
|  */
 | |
| struct inodes_stat_t {
 | |
| 	long nr_inodes;		/* presumably the number of inodes in existence -- confirm */
 | |
| 	long nr_unused;		/* presumably the number of those currently unused -- confirm */
 | |
| 	long dummy[5];		/* padding for sysctl ABI compatibility */
 | |
| };
 | |
| 
 | |
| 
 | |
| /* NOTE(review): presumably the default for files_stat_struct.max_files -- confirm. */
 | |
| #define NR_FILE  8192	/* this can well be larger on a larger system */
 | |
| 
 | |
| 
 | |
| /*
 | |
|  * These are the fs-independent mount-flags: up to 32 flags are supported
 | |
|  */
 | |
| /* Each flag below is a distinct single bit, except MS_SILENT (see below). */
 | |
| #define MS_RDONLY	 1	/* Mount read-only */
 | |
| #define MS_NOSUID	 2	/* Ignore suid and sgid bits */
 | |
| #define MS_NODEV	 4	/* Disallow access to device special files */
 | |
| #define MS_NOEXEC	 8	/* Disallow program execution */
 | |
| #define MS_SYNCHRONOUS	16	/* Writes are synced at once */
 | |
| #define MS_REMOUNT	32	/* Alter flags of a mounted FS */
 | |
| #define MS_MANDLOCK	64	/* Allow mandatory locks on an FS */
 | |
| #define MS_DIRSYNC	128	/* Directory modifications are synchronous */
 | |
| /* Values 256 and 512 are not assigned in this list. */
 | |
| #define MS_NOATIME	1024	/* Do not update access times. */
 | |
| #define MS_NODIRATIME	2048	/* Do not update directory access times */
 | |
| #define MS_BIND		4096
 | |
| #define MS_MOVE		8192
 | |
| #define MS_REC		16384
 | |
| /* MS_VERBOSE and MS_SILENT below share the value 32768. */
 | |
| #define MS_VERBOSE	32768	/* War is peace. Verbosity is silence.
 | |
| 				   MS_VERBOSE is deprecated. */
 | |
| #define MS_SILENT	32768
 | |
| #define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
 | |
| #define MS_UNBINDABLE	(1<<17)	/* change to unbindable */
 | |
| #define MS_PRIVATE	(1<<18)	/* change to private */
 | |
| #define MS_SLAVE	(1<<19)	/* change to slave */
 | |
| #define MS_SHARED	(1<<20)	/* change to shared */
 | |
| #define MS_RELATIME	(1<<21)	/* Update atime relative to mtime/ctime. */
 | |
| #define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
 | |
| #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
 | |
| #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
 | |
| 
 | |
| /* These sb flags are internal to the kernel */
 | |
| /*
 | |
|  * NOTE(review): (1<<31) shifts a signed int constant into the sign bit when
 | |
|  * int is 32 bits wide.  The value is part of this exported header, so
 | |
|  * confirm the impact on users before changing how it is spelled.
 | |
|  */
 | |
| #define MS_NOSEC	(1<<28)
 | |
| #define MS_BORN		(1<<29)
 | |
| #define MS_ACTIVE	(1<<30)
 | |
| #define MS_NOUSER	(1<<31)
 | |
| 
 | |
| /*
 | |
|  * Superblock flags that can be altered by MS_REMOUNT
 | |
|  */
 | |
| #define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION)
 | |
| 
 | |
| /*
 | |
|  * Old magic mount flag and mask
 | |
|  */
 | |
| /* MS_MGC_MSK selects the upper 16 bits, which is where MS_MGC_VAL lives. */
 | |
| #define MS_MGC_VAL 0xC0ED0000
 | |
| #define MS_MGC_MSK 0xffff0000
 | |
| 
 | |
| /* the read-only stuff doesn't really belong here, but any other place is
 | |
|    probably as bad and I don't want to create yet another include file. */
 | |
| 
 | |
| /* Block-device ioctls: all use type byte 0x12 with consecutive command numbers. */
 | |
| #define BLKROSET   _IO(0x12,93)	/* set device read-only (0 = read-write) */
 | |
| #define BLKROGET   _IO(0x12,94)	/* get read-only status (0 = read_write) */
 | |
| #define BLKRRPART  _IO(0x12,95)	/* re-read partition table */
 | |
| #define BLKGETSIZE _IO(0x12,96)	/* return device size /512 (long *arg) */
 | |
| #define BLKFLSBUF  _IO(0x12,97)	/* flush buffer cache */
 | |
| #define BLKRASET   _IO(0x12,98)	/* set read ahead for block device */
 | |
| #define BLKRAGET   _IO(0x12,99)	/* get current read ahead setting */
 | |
| #define BLKFRASET  _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
 | |
| #define BLKFRAGET  _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
 | |
| #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
 | |
| #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
 | |
| #define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
 | |
| /* The #if 0 region below is compiled out; it only records that 105-107 are taken. */
 | |
| #if 0
 | |
| #define BLKPG      _IO(0x12,105)/* See blkpg.h */
 | |
| 
 | |
| /* Some people are morons.  Do not use sizeof! */
 | |
| 
 | |
| #define BLKELVGET  _IOR(0x12,106,size_t)/* elevator get */
 | |
| #define BLKELVSET  _IOW(0x12,107,size_t)/* elevator set */
 | |
| /* This was here just to show that the number is taken -
 | |
|    probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
 | |
| #endif
 | |
| /* A jump here: 108-111 have been used for various private purposes. */
 | |
| #define BLKBSZGET  _IOR(0x12,112,size_t)
 | |
| #define BLKBSZSET  _IOW(0x12,113,size_t)
 | |
| /*
 | |
|  * NOTE(review): BLKGETSIZE64 encodes size_t as its argument type although the
 | |
|  * comment says the argument is a u64 pointer.  The encoded number is ABI, so
 | |
|  * do not "fix" the type without confirming the consequences.
 | |
|  */
 | |
| #define BLKGETSIZE64 _IOR(0x12,114,size_t)	/* return device size in bytes (u64 *arg) */
 | |
| #define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
 | |
| #define BLKTRACESTART _IO(0x12,116)
 | |
| #define BLKTRACESTOP _IO(0x12,117)
 | |
| #define BLKTRACETEARDOWN _IO(0x12,118)
 | |
| #define BLKDISCARD _IO(0x12,119)
 | |
| #define BLKIOMIN _IO(0x12,120)
 | |
| #define BLKIOOPT _IO(0x12,121)
 | |
| #define BLKALIGNOFF _IO(0x12,122)
 | |
| #define BLKPBSZGET _IO(0x12,123)
 | |
| #define BLKDISCARDZEROES _IO(0x12,124)
 | |
| #define BLKSECDISCARD _IO(0x12,125)
 | |
| #define BLKROTATIONAL _IO(0x12,126)
 | |
| #define BLKZEROOUT _IO(0x12,127)
 | |
| 
 | |
| #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 | |
| #define FIBMAP	   _IO(0x00,1)	/* bmap access */
 | |
| #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
 | |
| #define FIFREEZE	_IOWR('X', 119, int)	/* Freeze */
 | |
| #define FITHAW		_IOWR('X', 120, int)	/* Thaw */
 | |
| #define FITRIM		_IOWR('X', 121, struct fstrim_range)	/* Trim */
 | |
| 
 | |
| /*
 | |
|  * The FS_IOC32_* requests further down reuse the type byte and command
 | |
|  * number of their FS_IOC_* counterparts but encode int instead of long as
 | |
|  * the argument type.
 | |
|  */
 | |
| #define	FS_IOC_GETFLAGS			_IOR('f', 1, long)
 | |
| #define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
 | |
| #define	FS_IOC_GETVERSION		_IOR('v', 1, long)
 | |
| #define	FS_IOC_SETVERSION		_IOW('v', 2, long)
 | |
| #define FS_IOC_FIEMAP			_IOWR('f', 11, struct fiemap)
 | |
| #define FS_IOC32_GETFLAGS		_IOR('f', 1, int)
 | |
| #define FS_IOC32_SETFLAGS		_IOW('f', 2, int)
 | |
| #define FS_IOC32_GETVERSION		_IOR('v', 1, int)
 | |
| #define FS_IOC32_SETVERSION		_IOW('v', 2, int)
 | |
| 
 | |
| /*
 | |
|  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
 | |
|  */
 | |
| #define	FS_SECRM_FL			0x00000001 /* Secure deletion */
 | |
| #define	FS_UNRM_FL			0x00000002 /* Undelete */
 | |
| #define	FS_COMPR_FL			0x00000004 /* Compress file */
 | |
| #define FS_SYNC_FL			0x00000008 /* Synchronous updates */
 | |
| #define FS_IMMUTABLE_FL			0x00000010 /* Immutable file */
 | |
| #define FS_APPEND_FL			0x00000020 /* writes to file may only append */
 | |
| #define FS_NODUMP_FL			0x00000040 /* do not dump file */
 | |
| #define FS_NOATIME_FL			0x00000080 /* do not update atime */
 | |
| /* Reserved for compression usage... */
 | |
| #define FS_DIRTY_FL			0x00000100
 | |
| #define FS_COMPRBLK_FL			0x00000200 /* One or more compressed clusters */
 | |
| #define FS_NOCOMP_FL			0x00000400 /* Don't compress */
 | |
| #define FS_ECOMPR_FL			0x00000800 /* Compression error */
 | |
| /* End compression flags --- maybe not all used */
 | |
| /* FS_BTREE_FL and FS_INDEX_FL below share the same bit (0x00001000). */
 | |
| #define FS_BTREE_FL			0x00001000 /* btree format dir */
 | |
| #define FS_INDEX_FL			0x00001000 /* hash-indexed directory */
 | |
| #define FS_IMAGIC_FL			0x00002000 /* AFS directory */
 | |
| #define FS_JOURNAL_DATA_FL		0x00004000 /* Reserved for ext3 */
 | |
| #define FS_NOTAIL_FL			0x00008000 /* file tail should not be merged */
 | |
| #define FS_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
 | |
| #define FS_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 | |
| #define FS_EXTENT_FL			0x00080000 /* Extents */
 | |
| #define FS_DIRECTIO_FL			0x00100000 /* Use direct i/o */
 | |
| #define FS_NOCOW_FL			0x00800000 /* Do not cow file */
 | |
| #define FS_RESERVED_FL			0x80000000 /* reserved for ext2 lib */
 | |
| 
 | |
| /* Bit masks over the flag values above. */
 | |
| #define FS_FL_USER_VISIBLE		0x0003DFFF /* User visible flags */
 | |
| #define FS_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
 | |
| 
 | |
| 
 | |
| /*
 | |
|  * Single-bit flags (1, 2, 4), so they can be OR-ed together.
 | |
|  * NOTE(review): presumably the flags argument of sync_file_range() -- confirm.
 | |
|  */
 | |
| #define SYNC_FILE_RANGE_WAIT_BEFORE	1
 | |
| #define SYNC_FILE_RANGE_WRITE		2
 | |
| #define SYNC_FILE_RANGE_WAIT_AFTER	4
 | |
| 
 | |
| #endif /* _UAPI_LINUX_FS_H */
 |