Skip to content

Commit

Permalink
[PATCH] detect soft lockups
Browse files Browse the repository at this point in the history
This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP.

When enabled, per-CPU watchdog threads are started, which try to run
once per second.  If they get delayed for more than 10 seconds then a
callback from the timer interrupt detects this condition and prints out a
warning message and a stack dump (once per lockup incident).  The feature
is otherwise non-intrusive: it doesn't try to unlock the box in any way, it
only gets the debug info out, automatically, and on all CPUs affected by
the lockup.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-Off-By: Matthias Urlichs <smurf@smurf.noris.de>
Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
  • Loading branch information
Ingo Molnar authored and Linus Torvalds committed Sep 7, 2005
1 parent 4732efb commit 8446f1d
Show file tree
Hide file tree
Showing 12 changed files with 201 additions and 0 deletions.
5 changes: 5 additions & 0 deletions arch/i386/kernel/nmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,11 @@ void touch_nmi_watchdog (void)
*/
for (i = 0; i < NR_CPUS; i++)
alert_counter[i] = 0;

/*
* Tickle the softlockup detector too:
*/
touch_softlockup_watchdog();
}

extern void die_nmi(struct pt_regs *, const char *msg);
Expand Down
1 change: 1 addition & 0 deletions arch/i386/kernel/time.c
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,7 @@ static int timer_resume(struct sys_device *dev)
last_timer->resume();
cur_timer = last_timer;
last_timer = NULL;
touch_softlockup_watchdog();
return 0;
}

Expand Down
2 changes: 2 additions & 0 deletions arch/x86_64/kernel/nmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,8 @@ void touch_nmi_watchdog (void)
*/
for (i = 0; i < NR_CPUS; i++)
per_cpu(nmi_touch, i) = 1;

touch_softlockup_watchdog();
}

void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
Expand Down
1 change: 1 addition & 0 deletions arch/x86_64/kernel/time.c
Original file line number Diff line number Diff line change
Expand Up @@ -1041,6 +1041,7 @@ static int timer_resume(struct sys_device *dev)
write_sequnlock_irqrestore(&xtime_lock,flags);
jiffies += sleep_length;
wall_jiffies += sleep_length;
touch_softlockup_watchdog();
return 0;
}

Expand Down
1 change: 1 addition & 0 deletions drivers/mtd/nand/nand_base.c
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,7 @@ static void nand_wait_ready(struct mtd_info *mtd)
do {
if (this->dev_ready(mtd))
return;
touch_softlockup_watchdog();
} while (time_before(jiffies, timeo));
}

Expand Down
17 changes: 17 additions & 0 deletions include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,23 @@ extern void trap_init(void);
extern void update_process_times(int user);
extern void scheduler_tick(void);

/*
 * Soft-lockup detector interface. With CONFIG_DETECT_SOFTLOCKUP unset the
 * three entry points collapse into empty inline functions, so callers do
 * not need their own #ifdefs.
 */
#ifdef CONFIG_DETECT_SOFTLOCKUP
extern void softlockup_tick(struct pt_regs *regs);
extern void spawn_softlockup_task(void);
extern void touch_softlockup_watchdog(void);
#else
static inline void softlockup_tick(struct pt_regs *regs) { }
static inline void spawn_softlockup_task(void) { }
static inline void touch_softlockup_watchdog(void) { }
#endif


/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
/* Is this address in the __sched functions? */
Expand Down
1 change: 1 addition & 0 deletions init/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,7 @@ static void do_pre_smp_initcalls(void)
migration_init();
#endif
spawn_ksoftirqd();
spawn_softlockup_task();
}

static void run_init_process(char *init_filename)
Expand Down
1 change: 1 addition & 0 deletions kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o
Expand Down
1 change: 1 addition & 0 deletions kernel/power/swsusp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1059,6 +1059,7 @@ int swsusp_resume(void)
BUG_ON(!error);
restore_processor_state();
restore_highmem();
touch_softlockup_watchdog();
device_power_up();
local_irq_enable();
return error;
Expand Down
151 changes: 151 additions & 0 deletions kernel/softlockup.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/*
* Detect Soft Lockups
*
* started by Ingo Molnar, (C) 2005, Red Hat
*
* this code detects soft lockups: incidents in which, on a CPU,
* the kernel does not reschedule for 10 seconds or more.
*/

#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/notifier.h>
#include <linux/module.h>

/* Held around the printk() + show_regs() report in softlockup_tick(). */
static DEFINE_SPINLOCK(print_lock);

/* Per-CPU jiffies value stored by the last touch_softlockup_watchdog(). */
static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
/*
 * Per-CPU copy of the timestamp value that softlockup_tick() last reported;
 * compared against timestamp so the same stall is not printed twice.
 */
static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
/* Per-CPU watchdog thread, created and torn down in cpu_callback(). */
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);

/* Non-zero once the panic notifier has fired; silences softlockup_tick(). */
static int did_panic;

/*
 * Panic notifier callback: remember that a panic was signalled so that
 * softlockup_tick() stops producing reports. The arguments are unused.
 *
 * Always returns NOTIFY_DONE.
 */
static int
softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
	did_panic = 1;
	return NOTIFY_DONE;
}

/*
 * Notifier block that routes panic notifications to softlock_panic();
 * registered on panic_notifier_list by spawn_softlockup_task().
 */
static struct notifier_block panic_block = {
.notifier_call = softlock_panic,
};

/*
 * Refresh the soft-lockup timestamp of the CPU this code is currently
 * running on by storing the current jiffies value into it.
 */
void touch_softlockup_watchdog(void)
{
	int this_cpu = raw_smp_processor_id();

	per_cpu(timestamp, this_cpu) = jiffies;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

/*
* This callback runs from the timer interrupt, and checks
* whether the watchdog thread has hung or not:
*/
void softlockup_tick(struct pt_regs *regs)
{
int this_cpu = smp_processor_id();
unsigned long timestamp = per_cpu(timestamp, this_cpu);

if (per_cpu(print_timestamp, this_cpu) == timestamp)
return;

/* Do not cause a second panic when there already was one */
if (did_panic)
return;

if (time_after(jiffies, timestamp + 10*HZ)) {
per_cpu(print_timestamp, this_cpu) = timestamp;

spin_lock(&print_lock);
printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
this_cpu);
show_regs(regs);
spin_unlock(&print_lock);
}
}

/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void * __bind_cpu)
{
struct sched_param param = { .sched_priority = 99 };
int this_cpu = (long) __bind_cpu;

printk("softlockup thread %d started up.\n", this_cpu);

sched_setscheduler(current, SCHED_FIFO, &param);
current->flags |= PF_NOFREEZE;

set_current_state(TASK_INTERRUPTIBLE);

/*
* Run briefly once per second - if this gets delayed for
* more than 10 seconds then the debug-printout triggers
* in softlockup_tick():
*/
while (!kthread_should_stop()) {
msleep_interruptible(1000);
touch_softlockup_watchdog();
}
__set_current_state(TASK_RUNNING);

return 0;
}

/*
 * Create/destroy watchdog threads as CPUs come and go.
 *
 * @nfb:    notifier block this callback was invoked through (unused).
 * @action: CPU notifier event.
 * @hcpu:   CPU number, encoded in the pointer value.
 *
 * CPU_UP_PREPARE creates the thread and binds it to the CPU, CPU_ONLINE
 * wakes it, CPU_UP_CANCELED and CPU_DEAD stop it again.
 *
 * Returns NOTIFY_BAD if the thread could not be created, NOTIFY_OK
 * otherwise.
 */
static int __devinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int hotcpu = (unsigned long)hcpu;
	struct task_struct *p;

	switch (action) {
	case CPU_UP_PREPARE:
		BUG_ON(per_cpu(watchdog_task, hotcpu));
		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
		if (IS_ERR(p)) {
			printk("watchdog for %i failed\n", hotcpu);
			return NOTIFY_BAD;
		}
		per_cpu(watchdog_task, hotcpu) = p;
		kthread_bind(p, hotcpu);
		break;
	case CPU_ONLINE:
		wake_up_process(per_cpu(watchdog_task, hotcpu));
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
		/*
		 * If the bring-up was cancelled before a thread was stored
		 * for this CPU (e.g. kthread_create() failed above), there
		 * is nothing to rebind or stop; do not hand NULL to
		 * kthread_bind()/kthread_stop().
		 */
		if (!per_cpu(watchdog_task, hotcpu))
			break;
		/* Unbind so it can run. Fall thru. */
		kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
	case CPU_DEAD:
		p = per_cpu(watchdog_task, hotcpu);
		per_cpu(watchdog_task, hotcpu) = NULL;
		kthread_stop(p);
		break;
#endif /* CONFIG_HOTPLUG_CPU */
	}
	return NOTIFY_OK;
}

/*
 * CPU notifier block that routes CPU notifier events to cpu_callback();
 * registered by spawn_softlockup_task().
 */
static struct notifier_block __devinitdata cpu_nfb = {
.notifier_call = cpu_callback
};

__init void spawn_softlockup_task(void)
{
void *cpu = (void *)(long)smp_processor_id();

cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);

notifier_chain_register(&panic_notifier_list, &panic_block);
}

1 change: 1 addition & 0 deletions kernel/timer.c
Original file line number Diff line number Diff line change
Expand Up @@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
{
jiffies_64++;
update_times();
softlockup_tick(regs);
}

#ifdef __ARCH_WANT_SYS_ALARM
Expand Down
19 changes: 19 additions & 0 deletions lib/Kconfig.debug
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,25 @@ config LOG_BUF_SHIFT
13 => 8 KB
12 => 4 KB

config DETECT_SOFTLOCKUP
bool "Detect Soft Lockups"
depends on DEBUG_KERNEL
default y
help
Say Y here to enable the kernel to detect "soft lockups",
which are bugs that cause the kernel to loop in kernel
mode for more than 10 seconds, without giving other tasks a
chance to run.

When a soft-lockup is detected, the kernel will print the
current stack trace (which you should report), but the
system will stay locked up. This feature has negligible
overhead.

(Note that "hard lockups" are a separate type of bug that
can be detected via the NMI-watchdog, on platforms that
support it.)

config SCHEDSTATS
bool "Collect scheduler statistics"
depends on DEBUG_KERNEL && PROC_FS
Expand Down

0 comments on commit 8446f1d

Please sign in to comment.