cgroup: Merge branch 'memcg_event' into for-3.14
Merge v3.12 based patch series to move cgroup_event implementation to memcg into for-3.14. The following two commits cause a conflict in kernel/cgroup.c

  2ff2a7d03b ("cgroup: kill css_id")
  79bd9814e5 ("cgroup, memcg: move cgroup_event implementation to memcg")

Each patch removes a struct definition from kernel/cgroup.c. As the two are adjacent, they cause a context conflict. Easily resolved by removing both structs.

Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
		
				commit
				
					
						edab95103d
					
				
			
		
					 7 changed files with 335 additions and 360 deletions
				
			
		|  | @ -24,7 +24,6 @@ CONTENTS: | |||
|   2.1 Basic Usage | ||||
|   2.2 Attaching processes | ||||
|   2.3 Mounting hierarchies by name | ||||
|   2.4 Notification API | ||||
| 3. Kernel API | ||||
|   3.1 Overview | ||||
|   3.2 Synchronization | ||||
|  | @ -472,25 +471,6 @@ you give a subsystem a name. | |||
| The name of the subsystem appears as part of the hierarchy description | ||||
| in /proc/mounts and /proc/<pid>/cgroups. | ||||
| 
 | ||||
| 2.4 Notification API | ||||
| -------------------- | ||||
| 
 | ||||
| There is mechanism which allows to get notifications about changing | ||||
| status of a cgroup. | ||||
| 
 | ||||
| To register a new notification handler you need to: | ||||
|  - create a file descriptor for event notification using eventfd(2); | ||||
|  - open a control file to be monitored (e.g. memory.usage_in_bytes); | ||||
|  - write "<event_fd> <control_fd> <args>" to cgroup.event_control. | ||||
|    Interpretation of args is defined by control file implementation; | ||||
| 
 | ||||
| eventfd will be woken up by control file implementation or when the | ||||
| cgroup is removed. | ||||
| 
 | ||||
| To unregister a notification handler just close eventfd. | ||||
| 
 | ||||
| NOTE: Support of notifications should be implemented for the control | ||||
| file. See documentation for the subsystem. | ||||
| 
 | ||||
| 3. Kernel API | ||||
| ============= | ||||
|  |  | |||
|  | @ -29,7 +29,6 @@ struct cgroup_subsys; | |||
| struct inode; | ||||
| struct cgroup; | ||||
| struct css_id; | ||||
| struct eventfd_ctx; | ||||
| 
 | ||||
| extern int cgroup_init_early(void); | ||||
| extern int cgroup_init(void); | ||||
|  | @ -239,10 +238,6 @@ struct cgroup { | |||
| 	struct rcu_head rcu_head; | ||||
| 	struct work_struct destroy_work; | ||||
| 
 | ||||
| 	/* List of events which userspace want to receive */ | ||||
| 	struct list_head event_list; | ||||
| 	spinlock_t event_list_lock; | ||||
| 
 | ||||
| 	/* directory xattrs */ | ||||
| 	struct simple_xattrs xattrs; | ||||
| }; | ||||
|  | @ -506,25 +501,6 @@ struct cftype { | |||
| 	int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); | ||||
| 
 | ||||
| 	int (*release)(struct inode *inode, struct file *file); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * register_event() callback will be used to add new userspace | ||||
| 	 * waiter for changes related to the cftype. Implement it if | ||||
| 	 * you want to provide this functionality. Use eventfd_signal() | ||||
| 	 * on eventfd to send notification to userspace. | ||||
| 	 */ | ||||
| 	int (*register_event)(struct cgroup_subsys_state *css, | ||||
| 			      struct cftype *cft, struct eventfd_ctx *eventfd, | ||||
| 			      const char *args); | ||||
| 	/*
 | ||||
| 	 * unregister_event() callback will be called when userspace | ||||
| 	 * closes the eventfd or on cgroup removing. | ||||
| 	 * This callback must be implemented, if you want provide | ||||
| 	 * notification functionality. | ||||
| 	 */ | ||||
| 	void (*unregister_event)(struct cgroup_subsys_state *css, | ||||
| 				 struct cftype *cft, | ||||
| 				 struct eventfd_ctx *eventfd); | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  |  | |||
|  | @ -7,6 +7,7 @@ | |||
| #include <linux/gfp.h> | ||||
| #include <linux/types.h> | ||||
| #include <linux/cgroup.h> | ||||
| #include <linux/eventfd.h> | ||||
| 
 | ||||
| struct vmpressure { | ||||
| 	unsigned long scanned; | ||||
|  | @ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr); | |||
| extern void vmpressure_cleanup(struct vmpressure *vmpr); | ||||
| extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); | ||||
| extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); | ||||
| extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); | ||||
| extern int vmpressure_register_event(struct cgroup_subsys_state *css, | ||||
| 				     struct cftype *cft, | ||||
| extern int vmpressure_register_event(struct mem_cgroup *memcg, | ||||
| 				     struct eventfd_ctx *eventfd, | ||||
| 				     const char *args); | ||||
| extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, | ||||
| 					struct cftype *cft, | ||||
| extern void vmpressure_unregister_event(struct mem_cgroup *memcg, | ||||
| 					struct eventfd_ctx *eventfd); | ||||
| #else | ||||
| static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, | ||||
|  |  | |||
|  | @ -848,7 +848,6 @@ config NUMA_BALANCING | |||
| 
 | ||||
| menuconfig CGROUPS | ||||
| 	boolean "Control Group support" | ||||
| 	depends on EVENTFD | ||||
| 	help | ||||
| 	  This option adds support for grouping sets of processes together, for | ||||
| 	  use with process control subsystems such as Cpusets, CFS, memory | ||||
|  | @ -915,6 +914,7 @@ config MEMCG | |||
| 	bool "Memory Resource Controller for Control Groups" | ||||
| 	depends on RESOURCE_COUNTERS | ||||
| 	select MM_OWNER | ||||
| 	select EVENTFD | ||||
| 	help | ||||
| 	  Provides a memory resource controller that manages both anonymous | ||||
| 	  memory and page cache. (See Documentation/cgroups/memory.txt) | ||||
|  | @ -1154,7 +1154,6 @@ config UIDGID_STRICT_TYPE_CHECKS | |||
| 
 | ||||
| config SCHED_AUTOGROUP | ||||
| 	bool "Automatic process group scheduling" | ||||
| 	select EVENTFD | ||||
| 	select CGROUPS | ||||
| 	select CGROUP_SCHED | ||||
| 	select FAIR_GROUP_SCHED | ||||
|  |  | |||
							
								
								
									
										259
									
								
								kernel/cgroup.c
									
										
									
									
									
								
							
							
						
						
									
										259
									
								
								kernel/cgroup.c
									
										
									
									
									
								
							|  | @ -56,11 +56,8 @@ | |||
| #include <linux/pid_namespace.h> | ||||
| #include <linux/idr.h> | ||||
| #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | ||||
| #include <linux/eventfd.h> | ||||
| #include <linux/poll.h> | ||||
| #include <linux/flex_array.h> /* used in cgroup_attach_task */ | ||||
| #include <linux/kthread.h> | ||||
| #include <linux/file.h> | ||||
| 
 | ||||
| #include <linux/atomic.h> | ||||
| 
 | ||||
|  | @ -132,36 +129,6 @@ struct cfent { | |||
| 	struct simple_xattrs		xattrs; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * cgroup_event represents events which userspace want to receive. | ||||
|  */ | ||||
| struct cgroup_event { | ||||
| 	/*
 | ||||
| 	 * css which the event belongs to. | ||||
| 	 */ | ||||
| 	struct cgroup_subsys_state *css; | ||||
| 	/*
 | ||||
| 	 * Control file which the event associated. | ||||
| 	 */ | ||||
| 	struct cftype *cft; | ||||
| 	/*
 | ||||
| 	 * eventfd to signal userspace about the event. | ||||
| 	 */ | ||||
| 	struct eventfd_ctx *eventfd; | ||||
| 	/*
 | ||||
| 	 * Each of these stored in a list by the cgroup. | ||||
| 	 */ | ||||
| 	struct list_head list; | ||||
| 	/*
 | ||||
| 	 * All fields below needed to unregister event when | ||||
| 	 * userspace closes eventfd. | ||||
| 	 */ | ||||
| 	poll_table pt; | ||||
| 	wait_queue_head_t *wqh; | ||||
| 	wait_queue_t wait; | ||||
| 	struct work_struct remove; | ||||
| }; | ||||
| 
 | ||||
| /* The list of hierarchy roots */ | ||||
| 
 | ||||
| static LIST_HEAD(cgroup_roots); | ||||
|  | @ -1351,8 +1318,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
| 	INIT_LIST_HEAD(&cgrp->pidlists); | ||||
| 	mutex_init(&cgrp->pidlist_mutex); | ||||
| 	cgrp->dummy_css.cgroup = cgrp; | ||||
| 	INIT_LIST_HEAD(&cgrp->event_list); | ||||
| 	spin_lock_init(&cgrp->event_list_lock); | ||||
| 	simple_xattrs_init(&cgrp->xattrs); | ||||
| } | ||||
| 
 | ||||
|  | @ -2626,16 +2591,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { | |||
| 	.removexattr = cgroup_removexattr, | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * Check if a file is a control file | ||||
|  */ | ||||
| static inline struct cftype *__file_cft(struct file *file) | ||||
| { | ||||
| 	if (file_inode(file)->i_fop != &cgroup_file_operations) | ||||
| 		return ERR_PTR(-EINVAL); | ||||
| 	return __d_cft(file->f_dentry); | ||||
| } | ||||
| 
 | ||||
| static int cgroup_create_file(struct dentry *dentry, umode_t mode, | ||||
| 				struct super_block *sb) | ||||
| { | ||||
|  | @ -3915,202 +3870,6 @@ static void cgroup_dput(struct cgroup *cgrp) | |||
| 	deactivate_super(sb); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Unregister event and free resources. | ||||
|  * | ||||
|  * Gets called from workqueue. | ||||
|  */ | ||||
| static void cgroup_event_remove(struct work_struct *work) | ||||
| { | ||||
| 	struct cgroup_event *event = container_of(work, struct cgroup_event, | ||||
| 			remove); | ||||
| 	struct cgroup_subsys_state *css = event->css; | ||||
| 
 | ||||
| 	remove_wait_queue(event->wqh, &event->wait); | ||||
| 
 | ||||
| 	event->cft->unregister_event(css, event->cft, event->eventfd); | ||||
| 
 | ||||
| 	/* Notify userspace the event is going away. */ | ||||
| 	eventfd_signal(event->eventfd, 1); | ||||
| 
 | ||||
| 	eventfd_ctx_put(event->eventfd); | ||||
| 	kfree(event); | ||||
| 	css_put(css); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Gets called on POLLHUP on eventfd when user closes it. | ||||
|  * | ||||
|  * Called with wqh->lock held and interrupts disabled. | ||||
|  */ | ||||
| static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | ||||
| 		int sync, void *key) | ||||
| { | ||||
| 	struct cgroup_event *event = container_of(wait, | ||||
| 			struct cgroup_event, wait); | ||||
| 	struct cgroup *cgrp = event->css->cgroup; | ||||
| 	unsigned long flags = (unsigned long)key; | ||||
| 
 | ||||
| 	if (flags & POLLHUP) { | ||||
| 		/*
 | ||||
| 		 * If the event has been detached at cgroup removal, we | ||||
| 		 * can simply return knowing the other side will cleanup | ||||
| 		 * for us. | ||||
| 		 * | ||||
| 		 * We can't race against event freeing since the other | ||||
| 		 * side will require wqh->lock via remove_wait_queue(), | ||||
| 		 * which we hold. | ||||
| 		 */ | ||||
| 		spin_lock(&cgrp->event_list_lock); | ||||
| 		if (!list_empty(&event->list)) { | ||||
| 			list_del_init(&event->list); | ||||
| 			/*
 | ||||
| 			 * We are in atomic context, but cgroup_event_remove() | ||||
| 			 * may sleep, so we have to call it in workqueue. | ||||
| 			 */ | ||||
| 			schedule_work(&event->remove); | ||||
| 		} | ||||
| 		spin_unlock(&cgrp->event_list_lock); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void cgroup_event_ptable_queue_proc(struct file *file, | ||||
| 		wait_queue_head_t *wqh, poll_table *pt) | ||||
| { | ||||
| 	struct cgroup_event *event = container_of(pt, | ||||
| 			struct cgroup_event, pt); | ||||
| 
 | ||||
| 	event->wqh = wqh; | ||||
| 	add_wait_queue(wqh, &event->wait); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Parse input and register new cgroup event handler. | ||||
|  * | ||||
|  * Input must be in format '<event_fd> <control_fd> <args>'. | ||||
|  * Interpretation of args is defined by control file implementation. | ||||
|  */ | ||||
| static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, | ||||
| 				      struct cftype *cft, const char *buffer) | ||||
| { | ||||
| 	struct cgroup *cgrp = dummy_css->cgroup; | ||||
| 	struct cgroup_event *event; | ||||
| 	struct cgroup_subsys_state *cfile_css; | ||||
| 	unsigned int efd, cfd; | ||||
| 	struct fd efile; | ||||
| 	struct fd cfile; | ||||
| 	char *endp; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	efd = simple_strtoul(buffer, &endp, 10); | ||||
| 	if (*endp != ' ') | ||||
| 		return -EINVAL; | ||||
| 	buffer = endp + 1; | ||||
| 
 | ||||
| 	cfd = simple_strtoul(buffer, &endp, 10); | ||||
| 	if ((*endp != ' ') && (*endp != '\0')) | ||||
| 		return -EINVAL; | ||||
| 	buffer = endp + 1; | ||||
| 
 | ||||
| 	event = kzalloc(sizeof(*event), GFP_KERNEL); | ||||
| 	if (!event) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	INIT_LIST_HEAD(&event->list); | ||||
| 	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | ||||
| 	init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | ||||
| 	INIT_WORK(&event->remove, cgroup_event_remove); | ||||
| 
 | ||||
| 	efile = fdget(efd); | ||||
| 	if (!efile.file) { | ||||
| 		ret = -EBADF; | ||||
| 		goto out_kfree; | ||||
| 	} | ||||
| 
 | ||||
| 	event->eventfd = eventfd_ctx_fileget(efile.file); | ||||
| 	if (IS_ERR(event->eventfd)) { | ||||
| 		ret = PTR_ERR(event->eventfd); | ||||
| 		goto out_put_efile; | ||||
| 	} | ||||
| 
 | ||||
| 	cfile = fdget(cfd); | ||||
| 	if (!cfile.file) { | ||||
| 		ret = -EBADF; | ||||
| 		goto out_put_eventfd; | ||||
| 	} | ||||
| 
 | ||||
| 	/* the process need read permission on control file */ | ||||
| 	/* AV: shouldn't we check that it's been opened for read instead? */ | ||||
| 	ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||||
| 	if (ret < 0) | ||||
| 		goto out_put_cfile; | ||||
| 
 | ||||
| 	event->cft = __file_cft(cfile.file); | ||||
| 	if (IS_ERR(event->cft)) { | ||||
| 		ret = PTR_ERR(event->cft); | ||||
| 		goto out_put_cfile; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!event->cft->ss) { | ||||
| 		ret = -EBADF; | ||||
| 		goto out_put_cfile; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Determine the css of @cfile, verify it belongs to the same | ||||
| 	 * cgroup as cgroup.event_control, and associate @event with it. | ||||
| 	 * Remaining events are automatically removed on cgroup destruction | ||||
| 	 * but the removal is asynchronous, so take an extra ref. | ||||
| 	 */ | ||||
| 	rcu_read_lock(); | ||||
| 
 | ||||
| 	ret = -EINVAL; | ||||
| 	event->css = cgroup_css(cgrp, event->cft->ss); | ||||
| 	cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); | ||||
| 	if (event->css && event->css == cfile_css && css_tryget(event->css)) | ||||
| 		ret = 0; | ||||
| 
 | ||||
| 	rcu_read_unlock(); | ||||
| 	if (ret) | ||||
| 		goto out_put_cfile; | ||||
| 
 | ||||
| 	if (!event->cft->register_event || !event->cft->unregister_event) { | ||||
| 		ret = -EINVAL; | ||||
| 		goto out_put_css; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = event->cft->register_event(event->css, event->cft, | ||||
| 			event->eventfd, buffer); | ||||
| 	if (ret) | ||||
| 		goto out_put_css; | ||||
| 
 | ||||
| 	efile.file->f_op->poll(efile.file, &event->pt); | ||||
| 
 | ||||
| 	spin_lock(&cgrp->event_list_lock); | ||||
| 	list_add(&event->list, &cgrp->event_list); | ||||
| 	spin_unlock(&cgrp->event_list_lock); | ||||
| 
 | ||||
| 	fdput(cfile); | ||||
| 	fdput(efile); | ||||
| 
 | ||||
| 	return 0; | ||||
| 
 | ||||
| out_put_css: | ||||
| 	css_put(event->css); | ||||
| out_put_cfile: | ||||
| 	fdput(cfile); | ||||
| out_put_eventfd: | ||||
| 	eventfd_ctx_put(event->eventfd); | ||||
| out_put_efile: | ||||
| 	fdput(efile); | ||||
| out_kfree: | ||||
| 	kfree(event); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, | ||||
| 				      struct cftype *cft) | ||||
| { | ||||
|  | @ -4135,11 +3894,6 @@ static struct cftype cgroup_base_files[] = { | |||
| 		.release = cgroup_pidlist_release, | ||||
| 		.mode = S_IRUGO | S_IWUSR, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "cgroup.event_control", | ||||
| 		.write_string = cgroup_write_event_control, | ||||
| 		.mode = S_IWUGO, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "cgroup.clone_children", | ||||
| 		.flags = CFTYPE_INSANE, | ||||
|  | @ -4610,7 +4364,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex) | ||||
| { | ||||
| 	struct dentry *d = cgrp->dentry; | ||||
| 	struct cgroup_event *event, *tmp; | ||||
| 	struct cgroup_subsys *ss; | ||||
| 	struct cgroup *child; | ||||
| 	bool empty; | ||||
|  | @ -4685,18 +4438,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 	dget(d); | ||||
| 	cgroup_d_remove_dir(d); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Unregister events and notify userspace. | ||||
| 	 * Notify userspace about cgroup removing only after rmdir of cgroup | ||||
| 	 * directory to avoid race between userspace and kernelspace. | ||||
| 	 */ | ||||
| 	spin_lock(&cgrp->event_list_lock); | ||||
| 	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | ||||
| 		list_del_init(&event->list); | ||||
| 		schedule_work(&event->remove); | ||||
| 	} | ||||
| 	spin_unlock(&cgrp->event_list_lock); | ||||
| 
 | ||||
| 	return 0; | ||||
| }; | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										355
									
								
								mm/memcontrol.c
									
										
									
									
									
								
							
							
						
						
									
										355
									
								
								mm/memcontrol.c
									
										
									
									
									
								
							|  | @ -45,6 +45,7 @@ | |||
| #include <linux/swapops.h> | ||||
| #include <linux/spinlock.h> | ||||
| #include <linux/eventfd.h> | ||||
| #include <linux/poll.h> | ||||
| #include <linux/sort.h> | ||||
| #include <linux/fs.h> | ||||
| #include <linux/seq_file.h> | ||||
|  | @ -55,6 +56,7 @@ | |||
| #include <linux/cpu.h> | ||||
| #include <linux/oom.h> | ||||
| #include <linux/lockdep.h> | ||||
| #include <linux/file.h> | ||||
| #include "internal.h" | ||||
| #include <net/sock.h> | ||||
| #include <net/ip.h> | ||||
|  | @ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list { | |||
| 	struct eventfd_ctx *eventfd; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * cgroup_event represents events which userspace want to receive. | ||||
|  */ | ||||
| struct mem_cgroup_event { | ||||
| 	/*
 | ||||
| 	 * memcg which the event belongs to. | ||||
| 	 */ | ||||
| 	struct mem_cgroup *memcg; | ||||
| 	/*
 | ||||
| 	 * eventfd to signal userspace about the event. | ||||
| 	 */ | ||||
| 	struct eventfd_ctx *eventfd; | ||||
| 	/*
 | ||||
| 	 * Each of these stored in a list by the cgroup. | ||||
| 	 */ | ||||
| 	struct list_head list; | ||||
| 	/*
 | ||||
| 	 * register_event() callback will be used to add new userspace | ||||
| 	 * waiter for changes related to this event.  Use eventfd_signal() | ||||
| 	 * on eventfd to send notification to userspace. | ||||
| 	 */ | ||||
| 	int (*register_event)(struct mem_cgroup *memcg, | ||||
| 			      struct eventfd_ctx *eventfd, const char *args); | ||||
| 	/*
 | ||||
| 	 * unregister_event() callback will be called when userspace closes | ||||
| 	 * the eventfd or on cgroup removing.  This callback must be set, | ||||
| 	 * if you want provide notification functionality. | ||||
| 	 */ | ||||
| 	void (*unregister_event)(struct mem_cgroup *memcg, | ||||
| 				 struct eventfd_ctx *eventfd); | ||||
| 	/*
 | ||||
| 	 * All fields below needed to unregister event when | ||||
| 	 * userspace closes eventfd. | ||||
| 	 */ | ||||
| 	poll_table pt; | ||||
| 	wait_queue_head_t *wqh; | ||||
| 	wait_queue_t wait; | ||||
| 	struct work_struct remove; | ||||
| }; | ||||
| 
 | ||||
| static void mem_cgroup_threshold(struct mem_cgroup *memcg); | ||||
| static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | ||||
| 
 | ||||
|  | @ -331,6 +373,10 @@ struct mem_cgroup { | |||
| 	atomic_t	numainfo_updating; | ||||
| #endif | ||||
| 
 | ||||
| 	/* List of events which userspace want to receive */ | ||||
| 	struct list_head event_list; | ||||
| 	spinlock_t event_list_lock; | ||||
| 
 | ||||
| 	struct mem_cgroup_per_node *nodeinfo[0]; | ||||
| 	/* WARNING: nodeinfo must be the last member here */ | ||||
| }; | ||||
|  | @ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) | |||
| 	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; | ||||
| } | ||||
| 
 | ||||
| struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) | ||||
| { | ||||
| 	return &mem_cgroup_from_css(css)->vmpressure; | ||||
| } | ||||
| 
 | ||||
| static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | ||||
| { | ||||
| 	return (memcg == root_mem_cgroup); | ||||
|  | @ -5648,13 +5689,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) | |||
| 		mem_cgroup_oom_notify_cb(iter); | ||||
| } | ||||
| 
 | ||||
| static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, | ||||
| 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||||
| static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | ||||
| 	struct eventfd_ctx *eventfd, const char *args, enum res_type type) | ||||
| { | ||||
| 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||||
| 	struct mem_cgroup_thresholds *thresholds; | ||||
| 	struct mem_cgroup_threshold_ary *new; | ||||
| 	enum res_type type = MEMFILE_TYPE(cft->private); | ||||
| 	u64 threshold, usage; | ||||
| 	int i, size, ret; | ||||
| 
 | ||||
|  | @ -5731,13 +5770,23 @@ unlock: | |||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, | ||||
| 	struct cftype *cft, struct eventfd_ctx *eventfd) | ||||
| static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | ||||
| 	struct eventfd_ctx *eventfd, const char *args) | ||||
| { | ||||
| 	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); | ||||
| } | ||||
| 
 | ||||
| static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, | ||||
| 	struct eventfd_ctx *eventfd, const char *args) | ||||
| { | ||||
| 	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); | ||||
| } | ||||
| 
 | ||||
| static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||||
| 	struct eventfd_ctx *eventfd, enum res_type type) | ||||
| { | ||||
| 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||||
| 	struct mem_cgroup_thresholds *thresholds; | ||||
| 	struct mem_cgroup_threshold_ary *new; | ||||
| 	enum res_type type = MEMFILE_TYPE(cft->private); | ||||
| 	u64 usage; | ||||
| 	int i, j, size; | ||||
| 
 | ||||
|  | @ -5810,14 +5859,23 @@ unlock: | |||
| 	mutex_unlock(&memcg->thresholds_lock); | ||||
| } | ||||
| 
 | ||||
| static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | ||||
| 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||||
| static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||||
| 	struct eventfd_ctx *eventfd) | ||||
| { | ||||
| 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||||
| 	struct mem_cgroup_eventfd_list *event; | ||||
| 	enum res_type type = MEMFILE_TYPE(cft->private); | ||||
| 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); | ||||
| } | ||||
| 
 | ||||
| static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | ||||
| 	struct eventfd_ctx *eventfd) | ||||
| { | ||||
| 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); | ||||
| } | ||||
| 
 | ||||
| static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, | ||||
| 	struct eventfd_ctx *eventfd, const char *args) | ||||
| { | ||||
| 	struct mem_cgroup_eventfd_list *event; | ||||
| 
 | ||||
| 	BUG_ON(type != _OOM_TYPE); | ||||
| 	event = kmalloc(sizeof(*event),	GFP_KERNEL); | ||||
| 	if (!event) | ||||
| 		return -ENOMEM; | ||||
|  | @ -5835,14 +5893,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, | ||||
| 	struct cftype *cft, struct eventfd_ctx *eventfd) | ||||
| static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, | ||||
| 	struct eventfd_ctx *eventfd) | ||||
| { | ||||
| 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||||
| 	struct mem_cgroup_eventfd_list *ev, *tmp; | ||||
| 	enum res_type type = MEMFILE_TYPE(cft->private); | ||||
| 
 | ||||
| 	BUG_ON(type != _OOM_TYPE); | ||||
| 
 | ||||
| 	spin_lock(&memcg_oom_lock); | ||||
| 
 | ||||
|  | @ -5959,13 +6013,233 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | |||
| } | ||||
| #endif | ||||
| 
 | ||||
| /*
 | ||||
|  * DO NOT USE IN NEW FILES. | ||||
|  * | ||||
|  * "cgroup.event_control" implementation. | ||||
|  * | ||||
|  * This is way over-engineered.  It tries to support fully configurable | ||||
|  * events for each user.  Such level of flexibility is completely | ||||
|  * unnecessary especially in the light of the planned unified hierarchy. | ||||
|  * | ||||
|  * Please deprecate this and replace with something simpler if at all | ||||
|  * possible. | ||||
|  */ | ||||
| 
 | ||||
| /*
 | ||||
|  * Unregister event and free resources. | ||||
|  * | ||||
|  * Gets called from workqueue. | ||||
|  */ | ||||
| static void memcg_event_remove(struct work_struct *work) | ||||
| { | ||||
| 	struct mem_cgroup_event *event = | ||||
| 		container_of(work, struct mem_cgroup_event, remove); | ||||
| 	struct mem_cgroup *memcg = event->memcg; | ||||
| 
 | ||||
| 	remove_wait_queue(event->wqh, &event->wait); | ||||
| 
 | ||||
| 	event->unregister_event(memcg, event->eventfd); | ||||
| 
 | ||||
| 	/* Notify userspace the event is going away. */ | ||||
| 	eventfd_signal(event->eventfd, 1); | ||||
| 
 | ||||
| 	eventfd_ctx_put(event->eventfd); | ||||
| 	kfree(event); | ||||
| 	css_put(&memcg->css); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Gets called on POLLHUP on eventfd when user closes it. | ||||
|  * | ||||
|  * Called with wqh->lock held and interrupts disabled. | ||||
|  */ | ||||
| static int memcg_event_wake(wait_queue_t *wait, unsigned mode, | ||||
| 			    int sync, void *key) | ||||
| { | ||||
| 	struct mem_cgroup_event *event = | ||||
| 		container_of(wait, struct mem_cgroup_event, wait); | ||||
| 	struct mem_cgroup *memcg = event->memcg; | ||||
| 	unsigned long flags = (unsigned long)key; | ||||
| 
 | ||||
| 	if (flags & POLLHUP) { | ||||
| 		/*
 | ||||
| 		 * If the event has been detached at cgroup removal, we | ||||
| 		 * can simply return knowing the other side will cleanup | ||||
| 		 * for us. | ||||
| 		 * | ||||
| 		 * We can't race against event freeing since the other | ||||
| 		 * side will require wqh->lock via remove_wait_queue(), | ||||
| 		 * which we hold. | ||||
| 		 */ | ||||
| 		spin_lock(&memcg->event_list_lock); | ||||
| 		if (!list_empty(&event->list)) { | ||||
| 			list_del_init(&event->list); | ||||
| 			/*
 | ||||
| 			 * We are in atomic context, but cgroup_event_remove() | ||||
| 			 * may sleep, so we have to call it in workqueue. | ||||
| 			 */ | ||||
| 			schedule_work(&event->remove); | ||||
| 		} | ||||
| 		spin_unlock(&memcg->event_list_lock); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void memcg_event_ptable_queue_proc(struct file *file, | ||||
| 		wait_queue_head_t *wqh, poll_table *pt) | ||||
| { | ||||
| 	struct mem_cgroup_event *event = | ||||
| 		container_of(pt, struct mem_cgroup_event, pt); | ||||
| 
 | ||||
| 	event->wqh = wqh; | ||||
| 	add_wait_queue(wqh, &event->wait); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * DO NOT USE IN NEW FILES. | ||||
|  * | ||||
|  * Parse input and register new cgroup event handler. | ||||
|  * | ||||
|  * Input must be in format '<event_fd> <control_fd> <args>'. | ||||
|  * Interpretation of args is defined by control file implementation. | ||||
|  */ | ||||
| static int memcg_write_event_control(struct cgroup_subsys_state *css, | ||||
| 				     struct cftype *cft, const char *buffer) | ||||
| { | ||||
| 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||||
| 	struct mem_cgroup_event *event; | ||||
| 	struct cgroup_subsys_state *cfile_css; | ||||
| 	unsigned int efd, cfd; | ||||
| 	struct fd efile; | ||||
| 	struct fd cfile; | ||||
| 	const char *name; | ||||
| 	char *endp; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	efd = simple_strtoul(buffer, &endp, 10); | ||||
| 	if (*endp != ' ') | ||||
| 		return -EINVAL; | ||||
| 	buffer = endp + 1; | ||||
| 
 | ||||
| 	cfd = simple_strtoul(buffer, &endp, 10); | ||||
| 	if ((*endp != ' ') && (*endp != '\0')) | ||||
| 		return -EINVAL; | ||||
| 	buffer = endp + 1; | ||||
| 
 | ||||
| 	event = kzalloc(sizeof(*event), GFP_KERNEL); | ||||
| 	if (!event) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	event->memcg = memcg; | ||||
| 	INIT_LIST_HEAD(&event->list); | ||||
| 	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); | ||||
| 	init_waitqueue_func_entry(&event->wait, memcg_event_wake); | ||||
| 	INIT_WORK(&event->remove, memcg_event_remove); | ||||
| 
 | ||||
| 	efile = fdget(efd); | ||||
| 	if (!efile.file) { | ||||
| 		ret = -EBADF; | ||||
| 		goto out_kfree; | ||||
| 	} | ||||
| 
 | ||||
| 	event->eventfd = eventfd_ctx_fileget(efile.file); | ||||
| 	if (IS_ERR(event->eventfd)) { | ||||
| 		ret = PTR_ERR(event->eventfd); | ||||
| 		goto out_put_efile; | ||||
| 	} | ||||
| 
 | ||||
| 	cfile = fdget(cfd); | ||||
| 	if (!cfile.file) { | ||||
| 		ret = -EBADF; | ||||
| 		goto out_put_eventfd; | ||||
| 	} | ||||
| 
 | ||||
| 	/* the process need read permission on control file */ | ||||
| 	/* AV: shouldn't we check that it's been opened for read instead? */ | ||||
| 	ret = inode_permission(file_inode(cfile.file), MAY_READ); | ||||
| 	if (ret < 0) | ||||
| 		goto out_put_cfile; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Determine the event callbacks and set them in @event.  This used | ||||
| 	 * to be done via struct cftype but cgroup core no longer knows | ||||
| 	 * about these events.  The following is crude but the whole thing | ||||
| 	 * is for compatibility anyway. | ||||
| 	 * | ||||
| 	 * DO NOT ADD NEW FILES. | ||||
| 	 */ | ||||
| 	name = cfile.file->f_dentry->d_name.name; | ||||
| 
 | ||||
| 	if (!strcmp(name, "memory.usage_in_bytes")) { | ||||
| 		event->register_event = mem_cgroup_usage_register_event; | ||||
| 		event->unregister_event = mem_cgroup_usage_unregister_event; | ||||
| 	} else if (!strcmp(name, "memory.oom_control")) { | ||||
| 		event->register_event = mem_cgroup_oom_register_event; | ||||
| 		event->unregister_event = mem_cgroup_oom_unregister_event; | ||||
| 	} else if (!strcmp(name, "memory.pressure_level")) { | ||||
| 		event->register_event = vmpressure_register_event; | ||||
| 		event->unregister_event = vmpressure_unregister_event; | ||||
| 	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { | ||||
| 		event->register_event = memsw_cgroup_usage_register_event; | ||||
| 		event->unregister_event = memsw_cgroup_usage_unregister_event; | ||||
| 	} else { | ||||
| 		ret = -EINVAL; | ||||
| 		goto out_put_cfile; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Verify @cfile should belong to @css.  Also, remaining events are | ||||
| 	 * automatically removed on cgroup destruction but the removal is | ||||
| 	 * asynchronous, so take an extra ref on @css. | ||||
| 	 */ | ||||
| 	rcu_read_lock(); | ||||
| 
 | ||||
| 	ret = -EINVAL; | ||||
| 	cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, | ||||
| 				 &mem_cgroup_subsys); | ||||
| 	if (cfile_css == css && css_tryget(css)) | ||||
| 		ret = 0; | ||||
| 
 | ||||
| 	rcu_read_unlock(); | ||||
| 	if (ret) | ||||
| 		goto out_put_cfile; | ||||
| 
 | ||||
| 	ret = event->register_event(memcg, event->eventfd, buffer); | ||||
| 	if (ret) | ||||
| 		goto out_put_css; | ||||
| 
 | ||||
| 	efile.file->f_op->poll(efile.file, &event->pt); | ||||
| 
 | ||||
| 	spin_lock(&memcg->event_list_lock); | ||||
| 	list_add(&event->list, &memcg->event_list); | ||||
| 	spin_unlock(&memcg->event_list_lock); | ||||
| 
 | ||||
| 	fdput(cfile); | ||||
| 	fdput(efile); | ||||
| 
 | ||||
| 	return 0; | ||||
| 
 | ||||
| out_put_css: | ||||
| 	css_put(css); | ||||
| out_put_cfile: | ||||
| 	fdput(cfile); | ||||
| out_put_eventfd: | ||||
| 	eventfd_ctx_put(event->eventfd); | ||||
| out_put_efile: | ||||
| 	fdput(efile); | ||||
| out_kfree: | ||||
| 	kfree(event); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static struct cftype mem_cgroup_files[] = { | ||||
| 	{ | ||||
| 		.name = "usage_in_bytes", | ||||
| 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | ||||
| 		.read = mem_cgroup_read, | ||||
| 		.register_event = mem_cgroup_usage_register_event, | ||||
| 		.unregister_event = mem_cgroup_usage_unregister_event, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "max_usage_in_bytes", | ||||
|  | @ -6005,6 +6279,12 @@ static struct cftype mem_cgroup_files[] = { | |||
| 		.write_u64 = mem_cgroup_hierarchy_write, | ||||
| 		.read_u64 = mem_cgroup_hierarchy_read, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "cgroup.event_control",		/* XXX: for compat */ | ||||
| 		.write_string = memcg_write_event_control, | ||||
| 		.flags = CFTYPE_NO_PREFIX, | ||||
| 		.mode = S_IWUGO, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "swappiness", | ||||
| 		.read_u64 = mem_cgroup_swappiness_read, | ||||
|  | @ -6019,14 +6299,10 @@ static struct cftype mem_cgroup_files[] = { | |||
| 		.name = "oom_control", | ||||
| 		.read_map = mem_cgroup_oom_control_read, | ||||
| 		.write_u64 = mem_cgroup_oom_control_write, | ||||
| 		.register_event = mem_cgroup_oom_register_event, | ||||
| 		.unregister_event = mem_cgroup_oom_unregister_event, | ||||
| 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "pressure_level", | ||||
| 		.register_event = vmpressure_register_event, | ||||
| 		.unregister_event = vmpressure_unregister_event, | ||||
| 	}, | ||||
| #ifdef CONFIG_NUMA | ||||
| 	{ | ||||
|  | @ -6074,8 +6350,6 @@ static struct cftype memsw_cgroup_files[] = { | |||
| 		.name = "memsw.usage_in_bytes", | ||||
| 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||||
| 		.read = mem_cgroup_read, | ||||
| 		.register_event = mem_cgroup_usage_register_event, | ||||
| 		.unregister_event = mem_cgroup_usage_unregister_event, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.name = "memsw.max_usage_in_bytes", | ||||
|  | @ -6265,6 +6539,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 	mutex_init(&memcg->thresholds_lock); | ||||
| 	spin_lock_init(&memcg->move_lock); | ||||
| 	vmpressure_init(&memcg->vmpressure); | ||||
| 	INIT_LIST_HEAD(&memcg->event_list); | ||||
| 	spin_lock_init(&memcg->event_list_lock); | ||||
| 
 | ||||
| 	return &memcg->css; | ||||
| 
 | ||||
|  | @ -6340,6 +6616,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) | |||
| static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | ||||
| { | ||||
| 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | ||||
| 	struct mem_cgroup_event *event, *tmp; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Unregister events and notify userspace. | ||||
| 	 * Notify userspace about cgroup removal only after rmdir of the cgroup | ||||
| 	 * directory to avoid a race between userspace and kernelspace. | ||||
| 	 */ | ||||
| 	spin_lock(&memcg->event_list_lock); | ||||
| 	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { | ||||
| 		list_del_init(&event->list); | ||||
| 		schedule_work(&event->remove); | ||||
| 	} | ||||
| 	spin_unlock(&memcg->event_list_lock); | ||||
| 
 | ||||
| 	kmem_cgroup_css_offline(memcg); | ||||
| 
 | ||||
|  |  | |||
|  | @ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
| 
 | ||||
| /**
 | ||||
|  * vmpressure_register_event() - Bind vmpressure notifications to an eventfd | ||||
|  * @css:	css that is interested in vmpressure notifications | ||||
|  * @cft:	cgroup control files handle | ||||
|  * @memcg:	memcg that is interested in vmpressure notifications | ||||
|  * @eventfd:	eventfd context to link notifications with | ||||
|  * @args:	event arguments (used to set up a pressure level threshold) | ||||
|  * | ||||
|  | @ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
|  * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or | ||||
|  * "critical"). | ||||
|  * | ||||
|  * This function should not be used directly, just pass it to (struct | ||||
|  * cftype).register_event, and then cgroup core will handle everything by | ||||
|  * itself. | ||||
|  * To be used as memcg event method. | ||||
|  */ | ||||
| int vmpressure_register_event(struct cgroup_subsys_state *css, | ||||
| 			      struct cftype *cft, struct eventfd_ctx *eventfd, | ||||
| 			      const char *args) | ||||
| int vmpressure_register_event(struct mem_cgroup *memcg, | ||||
| 			      struct eventfd_ctx *eventfd, const char *args) | ||||
| { | ||||
| 	struct vmpressure *vmpr = css_to_vmpressure(css); | ||||
| 	struct vmpressure *vmpr = memcg_to_vmpressure(memcg); | ||||
| 	struct vmpressure_event *ev; | ||||
| 	int level; | ||||
| 
 | ||||
|  | @ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, | |||
| 
 | ||||
| /**
 | ||||
|  * vmpressure_unregister_event() - Unbind eventfd from vmpressure | ||||
|  * @css:	css handle | ||||
|  * @cft:	cgroup control files handle | ||||
|  * @memcg:	memcg handle | ||||
|  * @eventfd:	eventfd context that was used to link vmpressure with the @memcg | ||||
|  * | ||||
|  * This function does internal manipulations to detach the @eventfd from | ||||
|  * the vmpressure notifications, and then frees internal resources | ||||
|  * associated with the @eventfd (but the @eventfd itself is not freed). | ||||
|  * | ||||
|  * This function should not be used directly, just pass it to (struct | ||||
|  * cftype).unregister_event, and then cgroup core will handle everything | ||||
|  * by itself. | ||||
|  * To be used as memcg event method. | ||||
|  */ | ||||
| void vmpressure_unregister_event(struct cgroup_subsys_state *css, | ||||
| 				 struct cftype *cft, | ||||
| void vmpressure_unregister_event(struct mem_cgroup *memcg, | ||||
| 				 struct eventfd_ctx *eventfd) | ||||
| { | ||||
| 	struct vmpressure *vmpr = css_to_vmpressure(css); | ||||
| 	struct vmpressure *vmpr = memcg_to_vmpressure(memcg); | ||||
| 	struct vmpressure_event *ev; | ||||
| 
 | ||||
| 	mutex_lock(&vmpr->events_lock); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Tejun Heo
				Tejun Heo