cgroup: fix cgroup_path() vs rename() race
rename() will change dentry->d_name. The result of this race can
be worse than seeing partially rewritten name, but we might access
a stale pointer because rename() will re-allocate memory to hold
a longer name.
As accessing dentry->name must be protected by dentry->d_lock or
parent inode's i_mutex, while on the other hand cgroup-path() can
be called with some irq-safe spinlocks held, we can't generate
cgroup path using dentry->d_name.
Alternatively we make a copy of dentry->d_name and save it in
cgrp->name when a cgroup is created, and update cgrp->name at
rename().
v5: use flexible array instead of zero-size array.
v4: - allocate root_cgroup_name and all root_cgroup->name points to it.
    - add cgroup_name() wrapper.
v3: use kfree_rcu() instead of synchronize_rcu() in user-visible path.
v2: make cgrp->name RCU safe.
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
	
	
This commit is contained in:
		
					parent
					
						
							
								6dbe51c251
							
						
					
				
			
			
				commit
				
					
						65dff759d2
					
				
			
		
					 3 changed files with 100 additions and 32 deletions
				
			
		|  | @ -247,9 +247,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) | ||||||
| { | { | ||||||
| 	int ret; | 	int ret; | ||||||
| 
 | 
 | ||||||
| 	rcu_read_lock(); |  | ||||||
| 	ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); | 	ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); | ||||||
| 	rcu_read_unlock(); |  | ||||||
| 	if (ret) | 	if (ret) | ||||||
| 		strncpy(buf, "<unavailable>", buflen); | 		strncpy(buf, "<unavailable>", buflen); | ||||||
| 	return ret; | 	return ret; | ||||||
|  |  | ||||||
|  | @ -150,6 +150,11 @@ enum { | ||||||
| 	CGRP_CPUSET_CLONE_CHILDREN, | 	CGRP_CPUSET_CLONE_CHILDREN, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | struct cgroup_name { | ||||||
|  | 	struct rcu_head rcu_head; | ||||||
|  | 	char name[]; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| struct cgroup { | struct cgroup { | ||||||
| 	unsigned long flags;		/* "unsigned long" so bitops work */ | 	unsigned long flags;		/* "unsigned long" so bitops work */ | ||||||
| 
 | 
 | ||||||
|  | @ -172,6 +177,19 @@ struct cgroup { | ||||||
| 	struct cgroup *parent;		/* my parent */ | 	struct cgroup *parent;		/* my parent */ | ||||||
| 	struct dentry *dentry;		/* cgroup fs entry, RCU protected */ | 	struct dentry *dentry;		/* cgroup fs entry, RCU protected */ | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * This is a copy of dentry->d_name, and it's needed because | ||||||
|  | 	 * we can't use dentry->d_name in cgroup_path(). | ||||||
|  | 	 * | ||||||
|  | 	 * You must acquire rcu_read_lock() to access cgrp->name, and | ||||||
|  | 	 * the only place that can change it is rename(), which is | ||||||
|  | 	 * protected by parent dir's i_mutex. | ||||||
|  | 	 * | ||||||
|  | 	 * Normally you should use cgroup_name() wrapper rather than | ||||||
|  | 	 * access it directly. | ||||||
|  | 	 */ | ||||||
|  | 	struct cgroup_name __rcu *name; | ||||||
|  | 
 | ||||||
| 	/* Private pointers for each registered subsystem */ | 	/* Private pointers for each registered subsystem */ | ||||||
| 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; | 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; | ||||||
| 
 | 
 | ||||||
|  | @ -404,6 +422,12 @@ struct cgroup_scanner { | ||||||
| 	void *data; | 	void *data; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | /* Caller should hold rcu_read_lock() */ | ||||||
|  | static inline const char *cgroup_name(const struct cgroup *cgrp) | ||||||
|  | { | ||||||
|  | 	return rcu_dereference(cgrp->name)->name; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | ||||||
| int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										106
									
								
								kernel/cgroup.c
									
										
									
									
									
								
							
							
						
						
									
										106
									
								
								kernel/cgroup.c
									
										
									
									
									
								
							|  | @ -238,6 +238,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); | ||||||
| /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ | /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ | ||||||
| #define dummytop (&rootnode.top_cgroup) | #define dummytop (&rootnode.top_cgroup) | ||||||
| 
 | 
 | ||||||
|  | static struct cgroup_name root_cgroup_name = { .name = "/" }; | ||||||
|  | 
 | ||||||
| /* This flag indicates whether tasks in the fork and exit paths should
 | /* This flag indicates whether tasks in the fork and exit paths should
 | ||||||
|  * check for fork/exit handlers to call. This avoids us having to do |  * check for fork/exit handlers to call. This avoids us having to do | ||||||
|  * extra work in the fork/exit path if none of the subsystems need to |  * extra work in the fork/exit path if none of the subsystems need to | ||||||
|  | @ -859,6 +861,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | ||||||
| 	return inode; | 	return inode; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) | ||||||
|  | { | ||||||
|  | 	struct cgroup_name *name; | ||||||
|  | 
 | ||||||
|  | 	name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); | ||||||
|  | 	if (!name) | ||||||
|  | 		return NULL; | ||||||
|  | 	strcpy(name->name, dentry->d_name.name); | ||||||
|  | 	return name; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static void cgroup_free_fn(struct work_struct *work) | static void cgroup_free_fn(struct work_struct *work) | ||||||
| { | { | ||||||
| 	struct cgroup *cgrp = container_of(work, struct cgroup, free_work); | 	struct cgroup *cgrp = container_of(work, struct cgroup, free_work); | ||||||
|  | @ -889,6 +902,7 @@ static void cgroup_free_fn(struct work_struct *work) | ||||||
| 	simple_xattrs_free(&cgrp->xattrs); | 	simple_xattrs_free(&cgrp->xattrs); | ||||||
| 
 | 
 | ||||||
| 	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | 	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||||||
|  | 	kfree(rcu_dereference_raw(cgrp->name)); | ||||||
| 	kfree(cgrp); | 	kfree(cgrp); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -1421,6 +1435,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) | ||||||
| 	INIT_LIST_HEAD(&root->allcg_list); | 	INIT_LIST_HEAD(&root->allcg_list); | ||||||
| 	root->number_of_cgroups = 1; | 	root->number_of_cgroups = 1; | ||||||
| 	cgrp->root = root; | 	cgrp->root = root; | ||||||
|  | 	cgrp->name = &root_cgroup_name; | ||||||
| 	cgrp->top_cgroup = cgrp; | 	cgrp->top_cgroup = cgrp; | ||||||
| 	init_cgroup_housekeeping(cgrp); | 	init_cgroup_housekeeping(cgrp); | ||||||
| 	list_add_tail(&cgrp->allcg_node, &root->allcg_list); | 	list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||||||
|  | @ -1769,49 +1784,45 @@ static struct kobject *cgroup_kobj; | ||||||
|  * @buf: the buffer to write the path into |  * @buf: the buffer to write the path into | ||||||
|  * @buflen: the length of the buffer |  * @buflen: the length of the buffer | ||||||
|  * |  * | ||||||
|  * Called with cgroup_mutex held or else with an RCU-protected cgroup |  * Writes path of cgroup into buf.  Returns 0 on success, -errno on error. | ||||||
|  * reference.  Writes path of cgroup into buf.  Returns 0 on success, |  * | ||||||
|  * -errno on error. |  * We can't generate cgroup path using dentry->d_name, as accessing | ||||||
|  |  * dentry->name must be protected by irq-unsafe dentry->d_lock or parent | ||||||
|  |  * inode's i_mutex, while on the other hand cgroup_path() can be called | ||||||
|  |  * with some irq-safe spinlocks held. | ||||||
|  */ |  */ | ||||||
| int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | ||||||
| { | { | ||||||
| 	struct dentry *dentry = cgrp->dentry; | 	int ret = -ENAMETOOLONG; | ||||||
| 	char *start; | 	char *start; | ||||||
| 
 | 
 | ||||||
| 	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), |  | ||||||
| 			   "cgroup_path() called without proper locking"); |  | ||||||
| 
 |  | ||||||
| 	if (cgrp == dummytop) { |  | ||||||
| 		/*
 |  | ||||||
| 		 * Inactive subsystems have no dentry for their root |  | ||||||
| 		 * cgroup |  | ||||||
| 		 */ |  | ||||||
| 		strcpy(buf, "/"); |  | ||||||
| 		return 0; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	start = buf + buflen - 1; | 	start = buf + buflen - 1; | ||||||
| 
 |  | ||||||
| 	*start = '\0'; | 	*start = '\0'; | ||||||
| 	for (;;) { |  | ||||||
| 		int len = dentry->d_name.len; |  | ||||||
| 
 | 
 | ||||||
|  | 	rcu_read_lock(); | ||||||
|  | 	while (cgrp) { | ||||||
|  | 		const char *name = cgroup_name(cgrp); | ||||||
|  | 		int len; | ||||||
|  | 
 | ||||||
|  | 		len = strlen(name); | ||||||
| 		if ((start -= len) < buf) | 		if ((start -= len) < buf) | ||||||
| 			return -ENAMETOOLONG; | 			goto out; | ||||||
| 		memcpy(start, dentry->d_name.name, len); | 		memcpy(start, name, len); | ||||||
| 		cgrp = cgrp->parent; | 
 | ||||||
| 		if (!cgrp) | 		if (!cgrp->parent) | ||||||
| 			break; | 			break; | ||||||
| 
 | 
 | ||||||
| 		dentry = cgrp->dentry; |  | ||||||
| 		if (!cgrp->parent) |  | ||||||
| 			continue; |  | ||||||
| 		if (--start < buf) | 		if (--start < buf) | ||||||
| 			return -ENAMETOOLONG; | 			goto out; | ||||||
| 		*start = '/'; | 		*start = '/'; | ||||||
|  | 
 | ||||||
|  | 		cgrp = cgrp->parent; | ||||||
| 	} | 	} | ||||||
|  | 	ret = 0; | ||||||
| 	memmove(buf, start, buf + buflen - start); | 	memmove(buf, start, buf + buflen - start); | ||||||
| 	return 0; | out: | ||||||
|  | 	rcu_read_unlock(); | ||||||
|  | 	return ret; | ||||||
| } | } | ||||||
| EXPORT_SYMBOL_GPL(cgroup_path); | EXPORT_SYMBOL_GPL(cgroup_path); | ||||||
| 
 | 
 | ||||||
|  | @ -2537,13 +2548,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file) | ||||||
| static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | ||||||
| 			    struct inode *new_dir, struct dentry *new_dentry) | 			    struct inode *new_dir, struct dentry *new_dentry) | ||||||
| { | { | ||||||
|  | 	int ret; | ||||||
|  | 	struct cgroup_name *name, *old_name; | ||||||
|  | 	struct cgroup *cgrp; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * It's convinient to use parent dir's i_mutex to protected | ||||||
|  | 	 * cgrp->name. | ||||||
|  | 	 */ | ||||||
|  | 	lockdep_assert_held(&old_dir->i_mutex); | ||||||
|  | 
 | ||||||
| 	if (!S_ISDIR(old_dentry->d_inode->i_mode)) | 	if (!S_ISDIR(old_dentry->d_inode->i_mode)) | ||||||
| 		return -ENOTDIR; | 		return -ENOTDIR; | ||||||
| 	if (new_dentry->d_inode) | 	if (new_dentry->d_inode) | ||||||
| 		return -EEXIST; | 		return -EEXIST; | ||||||
| 	if (old_dir != new_dir) | 	if (old_dir != new_dir) | ||||||
| 		return -EIO; | 		return -EIO; | ||||||
| 	return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | 
 | ||||||
|  | 	cgrp = __d_cgrp(old_dentry); | ||||||
|  | 
 | ||||||
|  | 	name = cgroup_alloc_name(new_dentry); | ||||||
|  | 	if (!name) | ||||||
|  | 		return -ENOMEM; | ||||||
|  | 
 | ||||||
|  | 	ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); | ||||||
|  | 	if (ret) { | ||||||
|  | 		kfree(name); | ||||||
|  | 		return ret; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	old_name = cgrp->name; | ||||||
|  | 	rcu_assign_pointer(cgrp->name, name); | ||||||
|  | 
 | ||||||
|  | 	kfree_rcu(old_name, rcu_head); | ||||||
|  | 	return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static struct simple_xattrs *__d_xattrs(struct dentry *dentry) | static struct simple_xattrs *__d_xattrs(struct dentry *dentry) | ||||||
|  | @ -4158,6 +4196,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | ||||||
| 			     umode_t mode) | 			     umode_t mode) | ||||||
| { | { | ||||||
| 	struct cgroup *cgrp; | 	struct cgroup *cgrp; | ||||||
|  | 	struct cgroup_name *name; | ||||||
| 	struct cgroupfs_root *root = parent->root; | 	struct cgroupfs_root *root = parent->root; | ||||||
| 	int err = 0; | 	int err = 0; | ||||||
| 	struct cgroup_subsys *ss; | 	struct cgroup_subsys *ss; | ||||||
|  | @ -4168,9 +4207,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | ||||||
| 	if (!cgrp) | 	if (!cgrp) | ||||||
| 		return -ENOMEM; | 		return -ENOMEM; | ||||||
| 
 | 
 | ||||||
|  | 	name = cgroup_alloc_name(dentry); | ||||||
|  | 	if (!name) | ||||||
|  | 		goto err_free_cgrp; | ||||||
|  | 	rcu_assign_pointer(cgrp->name, name); | ||||||
|  | 
 | ||||||
| 	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | 	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | ||||||
| 	if (cgrp->id < 0) | 	if (cgrp->id < 0) | ||||||
| 		goto err_free_cgrp; | 		goto err_free_name; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Only live parents can have children.  Note that the liveliness | 	 * Only live parents can have children.  Note that the liveliness | ||||||
|  | @ -4276,6 +4320,8 @@ err_free_all: | ||||||
| 	deactivate_super(sb); | 	deactivate_super(sb); | ||||||
| err_free_id: | err_free_id: | ||||||
| 	ida_simple_remove(&root->cgroup_ida, cgrp->id); | 	ida_simple_remove(&root->cgroup_ida, cgrp->id); | ||||||
|  | err_free_name: | ||||||
|  | 	kfree(rcu_dereference_raw(cgrp->name)); | ||||||
| err_free_cgrp: | err_free_cgrp: | ||||||
| 	kfree(cgrp); | 	kfree(cgrp); | ||||||
| 	return err; | 	return err; | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Li Zefan
				Li Zefan