softlockup: decouple hung tasks check from softlockup detection
Decoupling allows: * hung tasks check to happen at very low priority * hung tasks check and softlockup to be enabled/disabled independently at compile and/or run-time * individual panic settings to be enabled disabled independently at compile and/or run-time * softlockup threshold to be reduced without increasing hung tasks poll frequency (hung task check is expensive relative to softlock watchdog) * hung task check to be zero over-head when disabled at run-time Signed-off-by: Mandeep Singh Baines <msb@google.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
		
					parent
					
						
							
								c903ff8379
							
						
					
				
			
			
				commit
				
					
						e162b39a36
					
				
			
		
					 6 changed files with 261 additions and 105 deletions
				
			
		|  | @ -297,9 +297,6 @@ extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, | |||
| 				    struct file *filp, void __user *buffer, | ||||
| 				    size_t *lenp, loff_t *ppos); | ||||
| extern unsigned int  softlockup_panic; | ||||
| extern unsigned long sysctl_hung_task_check_count; | ||||
| extern unsigned long sysctl_hung_task_timeout_secs; | ||||
| extern unsigned long sysctl_hung_task_warnings; | ||||
| extern int softlockup_thresh; | ||||
| #else | ||||
| static inline void softlockup_tick(void) | ||||
|  | @ -316,6 +313,15 @@ static inline void touch_all_softlockup_watchdogs(void) | |||
| } | ||||
| #endif | ||||
| 
 | ||||
| #ifdef CONFIG_DETECT_HUNG_TASK | ||||
| extern unsigned int  sysctl_hung_task_panic; | ||||
| extern unsigned long sysctl_hung_task_check_count; | ||||
| extern unsigned long sysctl_hung_task_timeout_secs; | ||||
| extern unsigned long sysctl_hung_task_warnings; | ||||
| extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, | ||||
| 					 struct file *filp, void __user *buffer, | ||||
| 					 size_t *lenp, loff_t *ppos); | ||||
| #endif | ||||
| 
 | ||||
| /* Attach to any functions which should be ignored in wchan output. */ | ||||
| #define __sched		__attribute__((__section__(".sched.text"))) | ||||
|  | @ -1236,7 +1242,7 @@ struct task_struct { | |||
| /* ipc stuff */ | ||||
| 	struct sysv_sem sysvsem; | ||||
| #endif | ||||
| #ifdef CONFIG_DETECT_SOFTLOCKUP | ||||
| #ifdef CONFIG_DETECT_HUNG_TASK | ||||
| /* hung task detection */ | ||||
| 	unsigned long last_switch_timestamp; | ||||
| 	unsigned long last_switch_count; | ||||
|  |  | |||
|  | @ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | |||
| obj-$(CONFIG_KPROBES) += kprobes.o | ||||
| obj-$(CONFIG_KGDB) += kgdb.o | ||||
| obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | ||||
| obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | ||||
| obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | ||||
| obj-$(CONFIG_SECCOMP) += seccomp.o | ||||
| obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | ||||
|  |  | |||
							
								
								
									
										198
									
								
								kernel/hung_task.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										198
									
								
								kernel/hung_task.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,198 @@ | |||
| /*
 | ||||
|  * Detect Hung Task | ||||
|  * | ||||
|  * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state | ||||
|  * | ||||
|  */ | ||||
| 
 | ||||
| #include <linux/mm.h> | ||||
| #include <linux/cpu.h> | ||||
| #include <linux/nmi.h> | ||||
| #include <linux/init.h> | ||||
| #include <linux/delay.h> | ||||
| #include <linux/freezer.h> | ||||
| #include <linux/kthread.h> | ||||
| #include <linux/lockdep.h> | ||||
| #include <linux/module.h> | ||||
| #include <linux/sysctl.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * Have a reasonable limit on the number of tasks checked: | ||||
|  */ | ||||
| unsigned long __read_mostly sysctl_hung_task_check_count = 1024; | ||||
| 
 | ||||
| /*
 | ||||
|  * Zero means infinite timeout - no checking done: | ||||
|  */ | ||||
| unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | ||||
| static unsigned long __read_mostly hung_task_poll_jiffies; | ||||
| 
 | ||||
| unsigned long __read_mostly sysctl_hung_task_warnings = 10; | ||||
| 
 | ||||
| static int __read_mostly did_panic; | ||||
| 
 | ||||
| static struct task_struct *watchdog_task; | ||||
| 
 | ||||
| /*
 | ||||
|  * Should we panic (and reboot, if panic_timeout= is set) when a | ||||
|  * hung task is detected: | ||||
|  */ | ||||
| unsigned int __read_mostly sysctl_hung_task_panic = | ||||
| 				CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; | ||||
| 
 | ||||
| static int __init hung_task_panic_setup(char *str) | ||||
| { | ||||
| 	sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); | ||||
| 
 | ||||
| 	return 1; | ||||
| } | ||||
| __setup("hung_task_panic=", hung_task_panic_setup); | ||||
| 
 | ||||
| static int | ||||
| hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||||
| { | ||||
| 	did_panic = 1; | ||||
| 
 | ||||
| 	return NOTIFY_DONE; | ||||
| } | ||||
| 
 | ||||
| static struct notifier_block panic_block = { | ||||
| 	.notifier_call = hung_task_panic, | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * Returns seconds, approximately.  We don't need nanosecond | ||||
|  * resolution, and we don't need to waste time with a big divide when | ||||
|  * 2^30ns == 1.074s. | ||||
|  */ | ||||
| static unsigned long get_timestamp(void) | ||||
| { | ||||
| 	int this_cpu = raw_smp_processor_id(); | ||||
| 
 | ||||
| 	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */ | ||||
| } | ||||
| 
 | ||||
| static void check_hung_task(struct task_struct *t, unsigned long now) | ||||
| { | ||||
| 	unsigned long switch_count = t->nvcsw + t->nivcsw; | ||||
| 
 | ||||
| 	if (t->flags & PF_FROZEN) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { | ||||
| 		t->last_switch_count = switch_count; | ||||
| 		t->last_switch_timestamp = now; | ||||
| 		return; | ||||
| 	} | ||||
| 	if ((long)(now - t->last_switch_timestamp) < | ||||
| 					sysctl_hung_task_timeout_secs) | ||||
| 		return; | ||||
| 	if (!sysctl_hung_task_warnings) | ||||
| 		return; | ||||
| 	sysctl_hung_task_warnings--; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Ok, the task did not get scheduled for more than 2 minutes, | ||||
| 	 * complain: | ||||
| 	 */ | ||||
| 	printk(KERN_ERR "INFO: task %s:%d blocked for more than " | ||||
| 			"%ld seconds.\n", t->comm, t->pid, | ||||
| 			sysctl_hung_task_timeout_secs); | ||||
| 	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||||
| 			" disables this message.\n"); | ||||
| 	sched_show_task(t); | ||||
| 	__debug_show_held_locks(t); | ||||
| 
 | ||||
| 	t->last_switch_timestamp = now; | ||||
| 	touch_nmi_watchdog(); | ||||
| 
 | ||||
| 	if (sysctl_hung_task_panic) | ||||
| 		panic("hung_task: blocked tasks"); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for | ||||
|  * a really long time (120 seconds). If that happens, print out | ||||
|  * a warning. | ||||
|  */ | ||||
| static void check_hung_uninterruptible_tasks(void) | ||||
| { | ||||
| 	int max_count = sysctl_hung_task_check_count; | ||||
| 	unsigned long now = get_timestamp(); | ||||
| 	struct task_struct *g, *t; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If the system crashed already then all bets are off, | ||||
| 	 * do not report extra hung tasks: | ||||
| 	 */ | ||||
| 	if (test_taint(TAINT_DIE) || did_panic) | ||||
| 		return; | ||||
| 
 | ||||
| 	read_lock(&tasklist_lock); | ||||
| 	do_each_thread(g, t) { | ||||
| 		if (!--max_count) | ||||
| 			goto unlock; | ||||
| 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ | ||||
| 		if (t->state == TASK_UNINTERRUPTIBLE) | ||||
| 			check_hung_task(t, now); | ||||
| 	} while_each_thread(g, t); | ||||
|  unlock: | ||||
| 	read_unlock(&tasklist_lock); | ||||
| } | ||||
| 
 | ||||
| static void update_poll_jiffies(void) | ||||
| { | ||||
| 	/* timeout of 0 will disable the watchdog */ | ||||
| 	if (sysctl_hung_task_timeout_secs == 0) | ||||
| 		hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; | ||||
| 	else | ||||
| 		hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Process updating of timeout sysctl | ||||
|  */ | ||||
| int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, | ||||
| 				  struct file *filp, void __user *buffer, | ||||
| 				  size_t *lenp, loff_t *ppos) | ||||
| { | ||||
| 	int ret; | ||||
| 
 | ||||
| 	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | ||||
| 
 | ||||
| 	if (ret || !write) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	update_poll_jiffies(); | ||||
| 
 | ||||
| 	wake_up_process(watchdog_task); | ||||
| 
 | ||||
|  out: | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * kthread which checks for tasks stuck in D state | ||||
|  */ | ||||
| static int watchdog(void *dummy) | ||||
| { | ||||
| 	set_user_nice(current, 0); | ||||
| 	update_poll_jiffies(); | ||||
| 
 | ||||
| 	for ( ; ; ) { | ||||
| 		while (schedule_timeout_interruptible(hung_task_poll_jiffies)); | ||||
| 		check_hung_uninterruptible_tasks(); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int __init hung_task_init(void) | ||||
| { | ||||
| 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block); | ||||
| 	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| module_init(hung_task_init); | ||||
|  | @ -165,98 +165,12 @@ void softlockup_tick(void) | |||
| 		panic("softlockup: hung tasks"); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Have a reasonable limit on the number of tasks checked: | ||||
|  */ | ||||
| unsigned long __read_mostly sysctl_hung_task_check_count = 1024; | ||||
| 
 | ||||
| /*
 | ||||
|  * Zero means infinite timeout - no checking done: | ||||
|  */ | ||||
| unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; | ||||
| 
 | ||||
| unsigned long __read_mostly sysctl_hung_task_warnings = 10; | ||||
| 
 | ||||
| /*
 | ||||
|  * Only do the hung-tasks check on one CPU: | ||||
|  */ | ||||
| static int check_cpu __read_mostly = -1; | ||||
| 
 | ||||
| static void check_hung_task(struct task_struct *t, unsigned long now) | ||||
| { | ||||
| 	unsigned long switch_count = t->nvcsw + t->nivcsw; | ||||
| 
 | ||||
| 	if (t->flags & PF_FROZEN) | ||||
| 		return; | ||||
| 
 | ||||
| 	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { | ||||
| 		t->last_switch_count = switch_count; | ||||
| 		t->last_switch_timestamp = now; | ||||
| 		return; | ||||
| 	} | ||||
| 	if ((long)(now - t->last_switch_timestamp) < | ||||
| 					sysctl_hung_task_timeout_secs) | ||||
| 		return; | ||||
| 	if (!sysctl_hung_task_warnings) | ||||
| 		return; | ||||
| 	sysctl_hung_task_warnings--; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Ok, the task did not get scheduled for more than 2 minutes, | ||||
| 	 * complain: | ||||
| 	 */ | ||||
| 	printk(KERN_ERR "INFO: task %s:%d blocked for more than " | ||||
| 			"%ld seconds.\n", t->comm, t->pid, | ||||
| 			sysctl_hung_task_timeout_secs); | ||||
| 	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||||
| 			" disables this message.\n"); | ||||
| 	sched_show_task(t); | ||||
| 	__debug_show_held_locks(t); | ||||
| 
 | ||||
| 	t->last_switch_timestamp = now; | ||||
| 	touch_nmi_watchdog(); | ||||
| 
 | ||||
| 	if (softlockup_panic) | ||||
| 		panic("softlockup: blocked tasks"); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for | ||||
|  * a really long time (120 seconds). If that happens, print out | ||||
|  * a warning. | ||||
|  */ | ||||
| static void check_hung_uninterruptible_tasks(int this_cpu) | ||||
| { | ||||
| 	int max_count = sysctl_hung_task_check_count; | ||||
| 	unsigned long now = get_timestamp(this_cpu); | ||||
| 	struct task_struct *g, *t; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If the system crashed already then all bets are off, | ||||
| 	 * do not report extra hung tasks: | ||||
| 	 */ | ||||
| 	if (test_taint(TAINT_DIE) || did_panic) | ||||
| 		return; | ||||
| 
 | ||||
| 	read_lock(&tasklist_lock); | ||||
| 	do_each_thread(g, t) { | ||||
| 		if (!--max_count) | ||||
| 			goto unlock; | ||||
| 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ | ||||
| 		if (t->state == TASK_UNINTERRUPTIBLE) | ||||
| 			check_hung_task(t, now); | ||||
| 	} while_each_thread(g, t); | ||||
|  unlock: | ||||
| 	read_unlock(&tasklist_lock); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * The watchdog thread - runs every second and touches the timestamp. | ||||
|  */ | ||||
| static int watchdog(void *__bind_cpu) | ||||
| { | ||||
| 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||||
| 	int this_cpu = (long)__bind_cpu; | ||||
| 
 | ||||
| 	sched_setscheduler(current, SCHED_FIFO, ¶m); | ||||
| 
 | ||||
|  | @ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu) | |||
| 		if (kthread_should_stop()) | ||||
| 			break; | ||||
| 
 | ||||
| 		if (this_cpu == check_cpu) { | ||||
| 			if (sysctl_hung_task_timeout_secs) | ||||
| 				check_hung_uninterruptible_tasks(this_cpu); | ||||
| 		} | ||||
| 
 | ||||
| 		set_current_state(TASK_INTERRUPTIBLE); | ||||
| 	} | ||||
| 	__set_current_state(TASK_RUNNING); | ||||
|  | @ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 		break; | ||||
| 	case CPU_ONLINE: | ||||
| 	case CPU_ONLINE_FROZEN: | ||||
| 		check_cpu = cpumask_any(cpu_online_mask); | ||||
| 		wake_up_process(per_cpu(watchdog_task, hotcpu)); | ||||
| 		break; | ||||
| #ifdef CONFIG_HOTPLUG_CPU | ||||
| 	case CPU_DOWN_PREPARE: | ||||
| 	case CPU_DOWN_PREPARE_FROZEN: | ||||
| 		if (hotcpu == check_cpu) { | ||||
| 			/* Pick any other online cpu. */ | ||||
| 			check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); | ||||
| 		} | ||||
| 		break; | ||||
| 
 | ||||
| 	case CPU_UP_CANCELED: | ||||
| 	case CPU_UP_CANCELED_FROZEN: | ||||
| 		if (!per_cpu(watchdog_task, hotcpu)) | ||||
|  |  | |||
|  | @ -805,6 +805,19 @@ static struct ctl_table kern_table[] = { | |||
| 		.extra1		= &neg_one, | ||||
| 		.extra2		= &sixty, | ||||
| 	}, | ||||
| #endif | ||||
| #ifdef CONFIG_DETECT_HUNG_TASK | ||||
| 	{ | ||||
| 		.ctl_name	= CTL_UNNUMBERED, | ||||
| 		.procname	= "hung_task_panic", | ||||
| 		.data		= &sysctl_hung_task_panic, | ||||
| 		.maxlen		= sizeof(int), | ||||
| 		.mode		= 0644, | ||||
| 		.proc_handler	= &proc_dointvec_minmax, | ||||
| 		.strategy	= &sysctl_intvec, | ||||
| 		.extra1		= &zero, | ||||
| 		.extra2		= &one, | ||||
| 	}, | ||||
| 	{ | ||||
| 		.ctl_name	= CTL_UNNUMBERED, | ||||
| 		.procname	= "hung_task_check_count", | ||||
|  | @ -820,7 +833,7 @@ static struct ctl_table kern_table[] = { | |||
| 		.data		= &sysctl_hung_task_timeout_secs, | ||||
| 		.maxlen		= sizeof(unsigned long), | ||||
| 		.mode		= 0644, | ||||
| 		.proc_handler	= &proc_doulongvec_minmax, | ||||
| 		.proc_handler	= &proc_dohung_task_timeout_secs, | ||||
| 		.strategy	= &sysctl_intvec, | ||||
| 	}, | ||||
| 	{ | ||||
|  |  | |||
|  | @ -186,6 +186,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE | |||
| 	default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC | ||||
| 	default 1 if BOOTPARAM_SOFTLOCKUP_PANIC | ||||
| 
 | ||||
| config DETECT_HUNG_TASK | ||||
| 	bool "Detect Hung Tasks" | ||||
| 	depends on DEBUG_KERNEL | ||||
| 	default y | ||||
| 	help | ||||
| 	  Say Y here to enable the kernel to detect "hung tasks", | ||||
| 	  which are bugs that cause the task to be stuck in | ||||
| 	  uninterruptible "D" state indefinitiley. | ||||
| 
 | ||||
| 	  When a hung task is detected, the kernel will print the | ||||
| 	  current stack trace (which you should report), but the | ||||
| 	  task will stay in uninterruptible state. If lockdep is | ||||
| 	  enabled then all held locks will also be reported. This | ||||
| 	  feature has negligible overhead. | ||||
| 
 | ||||
| config BOOTPARAM_HUNG_TASK_PANIC | ||||
| 	bool "Panic (Reboot) On Hung Tasks" | ||||
| 	depends on DETECT_HUNG_TASK | ||||
| 	help | ||||
| 	  Say Y here to enable the kernel to panic on "hung tasks", | ||||
| 	  which are bugs that cause the kernel to leave a task stuck | ||||
| 	  in uninterruptible "D" state. | ||||
| 
 | ||||
| 	  The panic can be used in combination with panic_timeout, | ||||
| 	  to cause the system to reboot automatically after a | ||||
| 	  hung task has been detected. This feature is useful for | ||||
| 	  high-availability systems that have uptime guarantees and | ||||
| 	  where a hung tasks must be resolved ASAP. | ||||
| 
 | ||||
| 	  Say N if unsure. | ||||
| 
 | ||||
| config BOOTPARAM_HUNG_TASK_PANIC_VALUE | ||||
| 	int | ||||
| 	depends on DETECT_HUNG_TASK | ||||
| 	range 0 1 | ||||
| 	default 0 if !BOOTPARAM_HUNG_TASK_PANIC | ||||
| 	default 1 if BOOTPARAM_HUNG_TASK_PANIC | ||||
| 
 | ||||
| config SCHED_DEBUG | ||||
| 	bool "Collect scheduler debugging info" | ||||
| 	depends on DEBUG_KERNEL && PROC_FS | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Mandeep Singh Baines
				Mandeep Singh Baines