cgroup_freezer: replace freezer->lock with freezer_mutex
After96d365e0b8("cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem"), css task iterators requires sleepable context as it may block on css_set_rwsem. I missed that cgroup_freezer was iterating tasks under IRQ-safe spinlock freezer->lock. This leads to errors like the following on freezer state reads and transitions. BUG: sleeping function called from invalid context at /work /os/work/kernel/locking/rwsem.c:20 in_atomic(): 0, irqs_disabled(): 0, pid: 462, name: bash 5 locks held by bash/462: #0: (sb_writers#7){.+.+.+}, at: [<ffffffff811f0843>] vfs_write+0x1a3/0x1c0 #1: (&of->mutex){+.+.+.}, at: [<ffffffff8126d78b>] kernfs_fop_write+0xbb/0x170 #2: (s_active#70){.+.+.+}, at: [<ffffffff8126d793>] kernfs_fop_write+0xc3/0x170 #3: (freezer_mutex){+.+...}, at: [<ffffffff81135981>] freezer_write+0x61/0x1e0 #4: (rcu_read_lock){......}, at: [<ffffffff81135973>] freezer_write+0x53/0x1e0 Preemption disabled at:[<ffffffff81104404>] console_unlock+0x1e4/0x460 CPU: 3 PID: 462 Comm: bash Not tainted 3.15.0-rc1-work+ #10 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 ffff88000916a6d0 ffff88000e0a3da0 ffffffff81cf8c96 0000000000000000 ffff88000e0a3dc8 ffffffff810cf4f2 ffffffff82388040 ffff880013aaf740 0000000000000002 ffff88000e0a3de8 ffffffff81d05974 0000000000000246 Call Trace: [<ffffffff81cf8c96>] dump_stack+0x4e/0x7a [<ffffffff810cf4f2>] __might_sleep+0x162/0x260 [<ffffffff81d05974>] down_read+0x24/0x60 [<ffffffff81133e87>] css_task_iter_start+0x27/0x70 [<ffffffff8113584d>] freezer_apply_state+0x5d/0x130 [<ffffffff81135a16>] freezer_write+0xf6/0x1e0 [<ffffffff8112eb88>] cgroup_file_write+0xd8/0x230 [<ffffffff8126d7b7>] kernfs_fop_write+0xe7/0x170 [<ffffffff811f0756>] vfs_write+0xb6/0x1c0 [<ffffffff811f121d>] SyS_write+0x4d/0xc0 [<ffffffff81d08292>] system_call_fastpath+0x16/0x1b freezer->lock used to be used in hot paths but that time is long gone and there's no reason for the lock to be IRQ-safe spinlock or even per-cgroup. In fact, given the fact that a cgroup may contain large number of tasks, it's not a good idea to iterate over them while holding IRQ-safe spinlock. Let's simplify locking by replacing per-cgroup freezer->lock with global freezer_mutex. This also makes the comments explaining the intricacies of policy inheritance and the locking around it as the states are protected by a common mutex. The conversion is mostly straight-forward. The followings are worth mentioning. * freezer_css_online() no longer needs double locking. * freezer_attach() now performs propagation simply while holding freezer_mutex. update_if_frozen() race no longer exists and the comment is removed. * freezer_fork() now tests whether the task is in root cgroup using the new task_css_is_root() without doing rcu_read_lock/unlock(). If not, it grabs freezer_mutex and performs the operation. * freezer_read() and freezer_change_state() grab freezer_mutex across the whole operation and pin the css while iterating so that each descendant processing happens in sleepable context. Fixes:96d365e0b8("cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem") Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com>
This commit is contained in:
		
					parent
					
						
							
								5024ae29cd
							
						
					
				
			
			
				commit
				
					
						e5ced8ebb1
					
				
			
		
					 1 changed files with 46 additions and 66 deletions
				
			
		|  | @ -21,6 +21,7 @@ | |||
| #include <linux/uaccess.h> | ||||
| #include <linux/freezer.h> | ||||
| #include <linux/seq_file.h> | ||||
| #include <linux/mutex.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * A cgroup is freezing if any FREEZING flags are set.  FREEZING_SELF is | ||||
|  | @ -42,9 +43,10 @@ enum freezer_state_flags { | |||
| struct freezer { | ||||
| 	struct cgroup_subsys_state	css; | ||||
| 	unsigned int			state; | ||||
| 	spinlock_t			lock; | ||||
| }; | ||||
| 
 | ||||
| static DEFINE_MUTEX(freezer_mutex); | ||||
| 
 | ||||
| static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) | ||||
| { | ||||
| 	return css ? container_of(css, struct freezer, css) : NULL; | ||||
|  | @ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 	if (!freezer) | ||||
| 		return ERR_PTR(-ENOMEM); | ||||
| 
 | ||||
| 	spin_lock_init(&freezer->lock); | ||||
| 	return &freezer->css; | ||||
| } | ||||
| 
 | ||||
|  | @ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
| 	struct freezer *freezer = css_freezer(css); | ||||
| 	struct freezer *parent = parent_freezer(freezer); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The following double locking and freezing state inheritance | ||||
| 	 * guarantee that @cgroup can never escape ancestors' freezing | ||||
| 	 * states.  See css_for_each_descendant_pre() for details. | ||||
| 	 */ | ||||
| 	if (parent) | ||||
| 		spin_lock_irq(&parent->lock); | ||||
| 	spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); | ||||
| 	mutex_lock(&freezer_mutex); | ||||
| 
 | ||||
| 	freezer->state |= CGROUP_FREEZER_ONLINE; | ||||
| 
 | ||||
|  | @ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
| 		atomic_inc(&system_freezing_cnt); | ||||
| 	} | ||||
| 
 | ||||
| 	spin_unlock(&freezer->lock); | ||||
| 	if (parent) | ||||
| 		spin_unlock_irq(&parent->lock); | ||||
| 
 | ||||
| 	mutex_unlock(&freezer_mutex); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
|  | @ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) | |||
| { | ||||
| 	struct freezer *freezer = css_freezer(css); | ||||
| 
 | ||||
| 	spin_lock_irq(&freezer->lock); | ||||
| 	mutex_lock(&freezer_mutex); | ||||
| 
 | ||||
| 	if (freezer->state & CGROUP_FREEZING) | ||||
| 		atomic_dec(&system_freezing_cnt); | ||||
| 
 | ||||
| 	freezer->state = 0; | ||||
| 
 | ||||
| 	spin_unlock_irq(&freezer->lock); | ||||
| 	mutex_unlock(&freezer_mutex); | ||||
| } | ||||
| 
 | ||||
| static void freezer_css_free(struct cgroup_subsys_state *css) | ||||
|  | @ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
| 	struct task_struct *task; | ||||
| 	bool clear_frozen = false; | ||||
| 
 | ||||
| 	spin_lock_irq(&freezer->lock); | ||||
| 	mutex_lock(&freezer_mutex); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Make the new tasks conform to the current state of @new_css. | ||||
|  | @ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	spin_unlock_irq(&freezer->lock); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Propagate FROZEN clearing upwards.  We may race with | ||||
| 	 * update_if_frozen(), but as long as both work bottom-up, either | ||||
| 	 * update_if_frozen() sees child's FROZEN cleared or we clear the | ||||
| 	 * parent's FROZEN later.  No parent w/ !FROZEN children can be | ||||
| 	 * left FROZEN. | ||||
| 	 */ | ||||
| 	/* propagate FROZEN clearing upwards */ | ||||
| 	while (clear_frozen && (freezer = parent_freezer(freezer))) { | ||||
| 		spin_lock_irq(&freezer->lock); | ||||
| 		freezer->state &= ~CGROUP_FROZEN; | ||||
| 		clear_frozen = freezer->state & CGROUP_FREEZING; | ||||
| 		spin_unlock_irq(&freezer->lock); | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_unlock(&freezer_mutex); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  | @ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task) | |||
| { | ||||
| 	struct freezer *freezer; | ||||
| 
 | ||||
| 	rcu_read_lock(); | ||||
| 	freezer = task_freezer(task); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The root cgroup is non-freezable, so we can skip locking the | ||||
| 	 * freezer.  This is safe regardless of race with task migration. | ||||
|  | @ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task) | |||
| 	 * to do.  If we lost and root is the new cgroup, noop is still the | ||||
| 	 * right thing to do. | ||||
| 	 */ | ||||
| 	if (!parent_freezer(freezer)) | ||||
| 		goto out; | ||||
| 	if (task_css_is_root(task, freezer_cgrp_id)) | ||||
| 		return; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Grab @freezer->lock and freeze @task after verifying @task still | ||||
| 	 * belongs to @freezer and it's freezing.  The former is for the | ||||
| 	 * case where we have raced against task migration and lost and | ||||
| 	 * @task is already in a different cgroup which may not be frozen. | ||||
| 	 * This isn't strictly necessary as freeze_task() is allowed to be | ||||
| 	 * called spuriously but let's do it anyway for, if nothing else, | ||||
| 	 * documentation. | ||||
| 	 */ | ||||
| 	spin_lock_irq(&freezer->lock); | ||||
| 	if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) | ||||
| 	mutex_lock(&freezer_mutex); | ||||
| 	rcu_read_lock(); | ||||
| 
 | ||||
| 	freezer = task_freezer(task); | ||||
| 	if (freezer->state & CGROUP_FREEZING) | ||||
| 		freeze_task(task); | ||||
| 	spin_unlock_irq(&freezer->lock); | ||||
| out: | ||||
| 
 | ||||
| 	rcu_read_unlock(); | ||||
| 	mutex_unlock(&freezer_mutex); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  | @ -281,22 +255,22 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
| 	struct css_task_iter it; | ||||
| 	struct task_struct *task; | ||||
| 
 | ||||
| 	WARN_ON_ONCE(!rcu_read_lock_held()); | ||||
| 
 | ||||
| 	spin_lock_irq(&freezer->lock); | ||||
| 	lockdep_assert_held(&freezer_mutex); | ||||
| 
 | ||||
| 	if (!(freezer->state & CGROUP_FREEZING) || | ||||
| 	    (freezer->state & CGROUP_FROZEN)) | ||||
| 		goto out_unlock; | ||||
| 		return; | ||||
| 
 | ||||
| 	/* are all (live) children frozen? */ | ||||
| 	rcu_read_lock(); | ||||
| 	css_for_each_child(pos, css) { | ||||
| 		struct freezer *child = css_freezer(pos); | ||||
| 
 | ||||
| 		if ((child->state & CGROUP_FREEZER_ONLINE) && | ||||
| 		    !(child->state & CGROUP_FROZEN)) | ||||
| 			goto out_unlock; | ||||
| 			return; | ||||
| 	} | ||||
| 	rcu_read_unlock(); | ||||
| 
 | ||||
| 	/* are all tasks frozen? */ | ||||
| 	css_task_iter_start(css, &it); | ||||
|  | @ -317,21 +291,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
| 	freezer->state |= CGROUP_FROZEN; | ||||
| out_iter_end: | ||||
| 	css_task_iter_end(&it); | ||||
| out_unlock: | ||||
| 	spin_unlock_irq(&freezer->lock); | ||||
| } | ||||
| 
 | ||||
| static int freezer_read(struct seq_file *m, void *v) | ||||
| { | ||||
| 	struct cgroup_subsys_state *css = seq_css(m), *pos; | ||||
| 
 | ||||
| 	mutex_lock(&freezer_mutex); | ||||
| 	rcu_read_lock(); | ||||
| 
 | ||||
| 	/* update states bottom-up */ | ||||
| 	css_for_each_descendant_post(pos, css) | ||||
| 	css_for_each_descendant_post(pos, css) { | ||||
| 		if (!css_tryget(pos)) | ||||
| 			continue; | ||||
| 		rcu_read_unlock(); | ||||
| 
 | ||||
| 		update_if_frozen(pos); | ||||
| 
 | ||||
| 		rcu_read_lock(); | ||||
| 		css_put(pos); | ||||
| 	} | ||||
| 
 | ||||
| 	rcu_read_unlock(); | ||||
| 	mutex_unlock(&freezer_mutex); | ||||
| 
 | ||||
| 	seq_puts(m, freezer_state_strs(css_freezer(css)->state)); | ||||
| 	seq_putc(m, '\n'); | ||||
|  | @ -373,7 +355,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, | |||
| 				unsigned int state) | ||||
| { | ||||
| 	/* also synchronizes against task migration, see freezer_attach() */ | ||||
| 	lockdep_assert_held(&freezer->lock); | ||||
| 	lockdep_assert_held(&freezer_mutex); | ||||
| 
 | ||||
| 	if (!(freezer->state & CGROUP_FREEZER_ONLINE)) | ||||
| 		return; | ||||
|  | @ -414,31 +396,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) | |||
| 	 * descendant will try to inherit its parent's FREEZING state as | ||||
| 	 * CGROUP_FREEZING_PARENT. | ||||
| 	 */ | ||||
| 	mutex_lock(&freezer_mutex); | ||||
| 	rcu_read_lock(); | ||||
| 	css_for_each_descendant_pre(pos, &freezer->css) { | ||||
| 		struct freezer *pos_f = css_freezer(pos); | ||||
| 		struct freezer *parent = parent_freezer(pos_f); | ||||
| 
 | ||||
| 		spin_lock_irq(&pos_f->lock); | ||||
| 		if (!css_tryget(pos)) | ||||
| 			continue; | ||||
| 		rcu_read_unlock(); | ||||
| 
 | ||||
| 		if (pos_f == freezer) { | ||||
| 		if (pos_f == freezer) | ||||
| 			freezer_apply_state(pos_f, freeze, | ||||
| 					    CGROUP_FREEZING_SELF); | ||||
| 		} else { | ||||
| 			/*
 | ||||
| 			 * Our update to @parent->state is already visible | ||||
| 			 * which is all we need.  No need to lock @parent. | ||||
| 			 * For more info on synchronization, see | ||||
| 			 * freezer_post_create(). | ||||
| 			 */ | ||||
| 		else | ||||
| 			freezer_apply_state(pos_f, | ||||
| 					    parent->state & CGROUP_FREEZING, | ||||
| 					    CGROUP_FREEZING_PARENT); | ||||
| 		} | ||||
| 
 | ||||
| 		spin_unlock_irq(&pos_f->lock); | ||||
| 		rcu_read_lock(); | ||||
| 		css_put(pos); | ||||
| 	} | ||||
| 	rcu_read_unlock(); | ||||
| 	mutex_unlock(&freezer_mutex); | ||||
| } | ||||
| 
 | ||||
| static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Tejun Heo
				Tejun Heo