Release 4.16 arch/powerpc/kernel/watchdog.c
// SPDX-License-Identifier: GPL-2.0
/*
* Watchdog support on powerpc systems.
*
* Copyright 2017, IBM Corporation.
*
* This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
*/
#define pr_fmt(fmt) "watchdog: " fmt
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/hardirq.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/sched/debug.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <asm/paca.h>
/*
* The powerpc watchdog ensures that each CPU is able to service timers.
* The watchdog sets up a simple timer on each CPU to run once per timer
* period, and updates a per-cpu timestamp and a "pending" cpumask. This is
* the heartbeat.
*
* Then there are two systems to check that the heartbeat is still running.
* The local soft-NMI, and the SMP checker.
*
* The soft-NMI checker can detect lockups on the local CPU. When interrupts
* are disabled with local_irq_disable(), platforms that use soft-masking
* can leave hardware interrupts enabled and handle them with a masked
* interrupt handler. The masked handler can send the timer interrupt to the
* watchdog's soft_nmi_interrupt(), which appears to Linux as an NMI
* interrupt, and can be used to detect CPUs stuck with IRQs disabled.
*
* The soft-NMI checker will compare the heartbeat timestamp for this CPU
* with the current time, and take action if the difference exceeds the
* watchdog threshold.
*
* The limitation of the soft-NMI watchdog is that it does not work when
* interrupts are hard disabled or otherwise not being serviced. This is
* solved by also having a SMP watchdog where all CPUs check all other
* CPUs heartbeat.
*
* The SMP checker can detect lockups on other CPUs. A global "pending"
* cpumask is kept, containing all CPUs which enable the watchdog. Each
* CPU clears their pending bit in their heartbeat timer. When the bitmask
* becomes empty, the last CPU to clear its pending bit updates a global
* timestamp and refills the pending bitmask.
*
* In the heartbeat timer, if any CPU notices that the global timestamp has
* not been updated for a period exceeding the watchdog threshold, then it
* means the CPU(s) with their bit still set in the pending mask have had
* their heartbeat stop, and action is taken.
*
* Some platforms implement true NMI IPIs, which can be used by the SMP
* watchdog to detect an unresponsive CPU and pull it out of its stuck
* state with the NMI IPI, to get crash/debug data from it. This way the
* SMP watchdog can detect hardware interrupts off lockups.
*/
/*
 * CPUs on which the watchdog is currently running: a CPU's bit is set in
 * start_wd_on_cpu() and cleared in stop_wd_on_cpu().
 */
static cpumask_t wd_cpus_enabled __read_mostly;
/* timebase ticks until panic (soft-NMI self check, see soft_nmi_interrupt()) */
static u64 wd_panic_timeout_tb __read_mostly;
/* timebase ticks until the SMP checker panics on behalf of other CPUs */
static u64 wd_smp_panic_timeout_tb __read_mostly;
/* interval between heartbeats, in milliseconds */
static u64 wd_timer_period_ms __read_mostly;
/* Per-CPU heartbeat timer; its callback is wd_timer_fn(). */
static DEFINE_PER_CPU(struct timer_list, wd_timer);
/* Per-CPU timebase value recorded at the most recent heartbeat or touch. */
static DEFINE_PER_CPU(u64, wd_timer_tb);
/* SMP checker bits */
/* Bit 0 is the lock bit taken by wd_smp_lock() and dropped by wd_smp_unlock(). */
static unsigned long __wd_smp_lock;
/* CPUs that have not yet cleared their bit in the current checking round. */
static cpumask_t wd_smp_cpus_pending;
/* CPUs that were reported as locked up and removed from the pending set. */
static cpumask_t wd_smp_cpus_stuck;
/* Timebase value at which the pending mask was last refilled. */
static u64 wd_smp_last_reset_tb;
/*
 * Take the SMP-checker lock (bit 0 of __wd_smp_lock).
 *
 * @flags: out-parameter receiving the saved interrupt state; hand the same
 *         pointer to wd_smp_unlock() to release the lock.
 *
 * Returns with the lock held and interrupts disabled.
 */
static inline void wd_smp_lock(unsigned long *flags)
{
	/*
	 * Avoid locking layers if possible.
	 * This may be called from low level interrupt handlers at some
	 * point in future.
	 */
	for (;;) {
		raw_local_irq_save(*flags);
		hard_irq_disable(); /* Make it soft-NMI safe */

		if (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
			/*
			 * Contended: put the interrupt state back while
			 * waiting for the holder to drop the bit, then go
			 * round again and retry the acquire from scratch.
			 */
			raw_local_irq_restore(*flags);
			spin_until_cond(!test_bit(0, &__wd_smp_lock));
			continue;
		}

		break;
	}
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 65 | 100.00% | 2 | 100.00% |
Total | 65 | 100.00% | 2 | 100.00% |
/*
 * Release the SMP-checker lock taken by wd_smp_lock().
 *
 * @flags: the interrupt state that the matching wd_smp_lock() call saved.
 */
static inline void wd_smp_unlock(unsigned long *flags)
{
/* Drop the lock bit first ... */
clear_bit_unlock(0, &__wd_smp_lock);
/* ... and only then restore the interrupt state saved at lock time. */
raw_local_irq_restore(*flags);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 26 | 100.00% | 2 | 100.00% |
Total | 26 | 100.00% | 2 | 100.00% |
/*
 * Callback handed to smp_send_nmi_ipi() by watchdog_smp_panic(); it runs on
 * a CPU that another CPU has reported as locked up and dumps debug state.
 *
 * @regs: interrupted register state, or NULL when none is available, in
 *        which case a plain stack dump is printed instead.
 */
static void wd_lockup_ipi(struct pt_regs *regs)
{
	int this_cpu = raw_smp_processor_id();

	pr_emerg("CPU %d Hard LOCKUP\n", this_cpu);
	print_modules();
	print_irqtrace_events(current);

	if (!regs)
		dump_stack();
	else
		show_regs(regs);

	/* Do not panic from here because that can recurse into NMI IPI layer */
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 40 | 97.56% | 2 | 66.67% |
Michael Ellerman | 1 | 2.44% | 1 | 33.33% |
Total | 41 | 100.00% | 3 | 100.00% |
/*
 * Move a set of CPUs from the pending mask into the stuck mask.
 *
 * @cpumask: CPUs to mark as stuck. watchdog_smp_panic() passes
 *           &wd_smp_cpus_pending itself here, so the argument may alias the
 *           mask being modified; the OR into the stuck mask is done first.
 * @tb:      timebase value recorded as the new reset time if this empties
 *           the pending mask.
 *
 * The visible callers invoke this with wd_smp_lock() held.
 */
static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
{
	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);

	/* Someone is still pending: the current round carries on. */
	if (!cpumask_empty(&wd_smp_cpus_pending))
		return;

	/* Nobody left to wait for: start a new round without the stuck CPUs. */
	wd_smp_last_reset_tb = tb;
	cpumask_andnot(&wd_smp_cpus_pending,
		       &wd_cpus_enabled,
		       &wd_smp_cpus_stuck);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 63 | 100.00% | 2 | 100.00% |
Total | 63 | 100.00% | 2 | 100.00% |
/*
 * Single-CPU convenience wrapper around set_cpumask_stuck().
 *
 * @cpu: CPU to move from the pending mask to the stuck mask.
 * @tb:  timebase value forwarded to set_cpumask_stuck().
 */
static void set_cpu_stuck(int cpu, u64 tb)
{
set_cpumask_stuck(cpumask_of(cpu), tb);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 22 | 100.00% | 1 | 100.00% |
Total | 22 | 100.00% | 1 | 100.00% |
/*
 * SMP checker action: report the CPUs that are still pending after the SMP
 * timeout has elapsed.
 *
 * @cpu: the CPU doing the detection (the caller's CPU).
 * @tb:  timebase value sampled by the caller.
 *
 * Called from watchdog_timer_interrupt() after a lockless timeout check; the
 * conditions are re-evaluated here under wd_smp_lock() before acting.
 */
static void watchdog_smp_panic(int cpu, u64 tb)
{
	unsigned long flags;
	int other;

	wd_smp_lock(&flags);

	/*
	 * Double check some things under lock: bail out if the timeout has
	 * not really expired, if this CPU is itself still pending, or if
	 * there is nobody left to blame.
	 */
	if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb ||
	    cpumask_test_cpu(cpu, &wd_smp_cpus_pending) ||
	    cpumask_weight(&wd_smp_cpus_pending) == 0) {
		wd_smp_unlock(&flags);
		return;
	}

	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
		 cpu, cpumask_pr_args(&wd_smp_cpus_pending));

	if (!sysctl_hardlockup_all_cpu_backtrace) {
		/*
		 * Try to trigger the stuck CPUs, unless we are going to
		 * get a backtrace on all of them anyway.
		 */
		for_each_cpu(other, &wd_smp_cpus_pending) {
			if (other != cpu)
				smp_send_nmi_ipi(other, wd_lockup_ipi, 1000000);
		}
		smp_flush_nmi_ipi(1000000);
	}

	/* Take the stuck CPUs out of the watch group */
	set_cpumask_stuck(&wd_smp_cpus_pending, tb);

	wd_smp_unlock(&flags);

	printk_safe_flush();
	/*
	 * printk_safe_flush() seems to require another print
	 * before anything actually goes out to console.
	 */
	if (sysctl_hardlockup_all_cpu_backtrace)
		trigger_allbutself_cpu_backtrace();

	if (hardlockup_panic)
		nmi_panic(NULL, "Hard LOCKUP");
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 166 | 99.40% | 4 | 80.00% |
Michael Ellerman | 1 | 0.60% | 1 | 20.00% |
Total | 167 | 100.00% | 5 | 100.00% |
/*
 * Heartbeat for the SMP checker: remove @cpu from the pending mask and, if
 * that leaves the mask empty, start a new checking round.
 *
 * @cpu: CPU whose heartbeat is being recorded.
 * @tb:  timebase value stored as the reset time when a new round starts.
 *
 * The mask tests are made without the lock; wd_smp_lock() is only taken for
 * the two infrequent updates (un-sticking a CPU, refilling the mask).
 */
static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
{
	unsigned long flags;

	if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
		/*
		 * Not pending. The only thing left to do is to notice a CPU
		 * that was previously reported stuck and is running again.
		 */
		if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
			pr_emerg("CPU %d became unstuck\n", cpu);
			wd_smp_lock(&flags);
			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
			wd_smp_unlock(&flags);
		}
		return;
	}

	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
	if (!cpumask_empty(&wd_smp_cpus_pending))
		return;

	/*
	 * The mask looked empty; test again with the lock held before
	 * refilling it, since set_cpumask_stuck() performs the same refill.
	 */
	wd_smp_lock(&flags);
	if (cpumask_empty(&wd_smp_cpus_pending)) {
		wd_smp_last_reset_tb = tb;
		cpumask_andnot(&wd_smp_cpus_pending,
			       &wd_cpus_enabled,
			       &wd_smp_cpus_stuck);
	}
	wd_smp_unlock(&flags);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 131 | 99.24% | 1 | 50.00% |
Michael Ellerman | 1 | 0.76% | 1 | 50.00% |
Total | 132 | 100.00% | 2 | 100.00% |
static void watchdog_timer_interrupt(int cpu)
{
u64 tb = get_tb();
per_cpu(wd_timer_tb, cpu) = tb;
wd_smp_clear_cpu_pending(cpu, tb);
if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
watchdog_smp_panic(cpu, tb);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 54 | 100.00% | 1 | 100.00% |
Total | 54 | 100.00% | 1 | 100.00% |
/*
 * Soft-NMI self check for the local CPU.
 *
 * @regs: register state at the point of interruption; used for the report
 *        and passed on to nmi_panic().
 *
 * Does nothing unless the watchdog is enabled on this CPU. If this CPU's
 * heartbeat timestamp is at least wd_panic_timeout_tb ticks old, the CPU
 * marks itself stuck and reports a hard lockup, once per stuck episode.
 */
void soft_nmi_interrupt(struct pt_regs *regs)
{
	int cpu = raw_smp_processor_id();
	unsigned long flags;
	u64 now;

	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
		return;

	nmi_enter();

	__this_cpu_inc(irq_stat.soft_nmi_irqs);

	now = get_tb();
	if (now - per_cpu(wd_timer_tb, cpu) < wd_panic_timeout_tb)
		goto rearm;

	/* Heartbeat overdue: restart the measurement from now. */
	per_cpu(wd_timer_tb, cpu) = now;

	wd_smp_lock(&flags);
	if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
		/* Already reported; leave without touching SPRN_DEC. */
		wd_smp_unlock(&flags);
		goto out;
	}
	set_cpu_stuck(cpu, now);

	pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n", cpu, (void *)regs->nip);
	print_modules();
	print_irqtrace_events(current);
	show_regs(regs);

	wd_smp_unlock(&flags);

	if (sysctl_hardlockup_all_cpu_backtrace)
		trigger_allbutself_cpu_backtrace();

	if (hardlockup_panic)
		nmi_panic(regs, "Hard LOCKUP");

rearm:
	/*
	 * NOTE(review): presumably programs the decrementer so the next
	 * check arrives within the timeout; skipped when the timeout does
	 * not fit below 0x7fffffff - confirm against the DEC width.
	 */
	if (wd_panic_timeout_tb < 0x7fffffff)
		mtspr(SPRN_DEC, wd_panic_timeout_tb);
out:
	nmi_exit();
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 169 | 94.94% | 2 | 66.67% |
Michael Ellerman | 9 | 5.06% | 1 | 33.33% |
Total | 178 | 100.00% | 3 | 100.00% |
/*
 * Arm the heartbeat timer @t to fire on @cpu one timer period from now.
 *
 * @cpu: CPU the timer is queued on.
 * @t:   that CPU's heartbeat timer.
 */
static void wd_timer_reset(unsigned int cpu, struct timer_list *t)
{
	unsigned long expiry = jiffies + msecs_to_jiffies(wd_timer_period_ms);

	/* Periods longer than a second get their expiry rounded up. */
	if (wd_timer_period_ms > 1000)
		expiry = __round_jiffies_up(expiry, cpu);

	t->expires = expiry;
	add_timer_on(t, cpu);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 52 | 100.00% | 1 | 100.00% |
Total | 52 | 100.00% | 1 | 100.00% |
/*
 * Heartbeat timer callback (installed by start_watchdog_timer_on()): run the
 * heartbeat for the CPU executing the callback, then re-arm the timer there.
 *
 * @t: the timer that fired.
 */
static void wd_timer_fn(struct timer_list *t)
{
	int this_cpu = smp_processor_id();

	watchdog_timer_interrupt(this_cpu);

	wd_timer_reset(this_cpu, t);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 27 | 93.10% | 1 | 50.00% |
Kees Cook | 2 | 6.90% | 1 | 50.00% |
Total | 29 | 100.00% | 2 | 100.00% |
/*
 * Refresh the calling CPU's heartbeat from outside the timer path.
 *
 * The refresh is skipped while the last recorded heartbeat is younger than
 * one timer period (converted to timebase ticks), so frequent callers do not
 * repeatedly touch the shared pending mask.
 */
void arch_touch_nmi_watchdog(void)
{
	unsigned long period_tb = tb_ticks_per_usec * wd_timer_period_ms * 1000;
	int cpu = smp_processor_id();
	u64 now = get_tb();

	if (now - per_cpu(wd_timer_tb, cpu) < period_tb)
		return;

	per_cpu(wd_timer_tb, cpu) = now;
	wd_smp_clear_cpu_pending(cpu, now);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 60 | 100.00% | 3 | 100.00% |
Total | 60 | 100.00% | 3 | 100.00% |
/* Make arch_touch_nmi_watchdog() usable from loadable modules as well. */
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
static void start_watchdog_timer_on(unsigned int cpu)
{
struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);
per_cpu(wd_timer_tb, cpu) = get_tb();
timer_setup(t, wd_timer_fn, TIMER_PINNED);
wd_timer_reset(cpu, t);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 47 | 95.92% | 1 | 50.00% |
Kees Cook | 2 | 4.08% | 1 | 50.00% |
Total | 49 | 100.00% | 2 | 100.00% |
/*
 * Stop the heartbeat timer belonging to @cpu, using the synchronous timer
 * deletion helper.
 *
 * @cpu: CPU whose heartbeat timer is removed.
 */
static void stop_watchdog_timer_on(unsigned int cpu)
{
	del_timer_sync(per_cpu_ptr(&wd_timer, cpu));
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 28 | 100.00% | 1 | 100.00% |
Total | 28 | 100.00% | 1 | 100.00% |
/*
 * Enable the watchdog on @cpu. Registered as the CPU-hotplug startup
 * callback in watchdog_nmi_probe() and called from watchdog_nmi_start().
 *
 * @cpu: CPU to enable.
 *
 * Always returns 0. Warns and does nothing if @cpu is already enabled, and
 * silently does nothing if the NMI watchdog is disabled or @cpu is not in
 * watchdog_cpumask.
 */
static int start_wd_on_cpu(unsigned int cpu)
{
	unsigned long flags;

	if (WARN_ON(cpumask_test_cpu(cpu, &wd_cpus_enabled)))
		return 0;

	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED) ||
	    !cpumask_test_cpu(cpu, &watchdog_cpumask))
		return 0;

	wd_smp_lock(&flags);
	cpumask_set_cpu(cpu, &wd_cpus_enabled);
	if (cpumask_weight(&wd_cpus_enabled) == 1) {
		/* First enabled CPU: open the first SMP checking round. */
		cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
		wd_smp_last_reset_tb = get_tb();
	}
	wd_smp_unlock(&flags);

	start_watchdog_timer_on(cpu);

	return 0;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 113 | 100.00% | 2 | 100.00% |
Total | 113 | 100.00% | 2 | 100.00% |
/*
 * Disable the watchdog on @cpu. Registered as the CPU-hotplug teardown
 * callback in watchdog_nmi_probe() and called from watchdog_nmi_stop().
 *
 * @cpu: CPU to disable.
 *
 * Always returns 0. A CPU that is not enabled is left alone; this can
 * happen in the CPU unplug case.
 */
static int stop_wd_on_cpu(unsigned int cpu)
{
	unsigned long flags;

	if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
		/* Stop the heartbeat before dropping the CPU from the set. */
		stop_watchdog_timer_on(cpu);

		wd_smp_lock(&flags);
		cpumask_clear_cpu(cpu, &wd_cpus_enabled);
		wd_smp_unlock(&flags);

		/* Remove any pending bit this CPU still has in the round. */
		wd_smp_clear_cpu_pending(cpu, get_tb());
	}

	return 0;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 65 | 100.00% | 2 | 100.00% |
Total | 65 | 100.00% | 2 | 100.00% |
/*
 * Derive the three watchdog intervals from watchdog_thresh: the heartbeat
 * period in milliseconds and the two panic timeouts in timebase ticks.
 */
static void watchdog_calc_timeouts(void)
{
	/* 2/5 is the factor that the perf based detector uses */
	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;

	wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;

	/* Have the SMP detector trigger a bit later */
	wd_smp_panic_timeout_tb = (wd_panic_timeout_tb * 3) / 2;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 34 | 100.00% | 1 | 100.00% |
Total | 34 | 100.00% | 1 | 100.00% |
void watchdog_nmi_stop(void)
{
int cpu;
for_each_cpu(cpu, &wd_cpus_enabled)
stop_wd_on_cpu(cpu);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 18 | 85.71% | 1 | 33.33% |
Thomas Gleixner | 3 | 14.29% | 2 | 66.67% |
Total | 21 | 100.00% | 3 | 100.00% |
void watchdog_nmi_start(void)
{
int cpu;
watchdog_calc_timeouts();
for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
start_wd_on_cpu(cpu);
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 13 | 50.00% | 1 | 33.33% |
Thomas Gleixner | 13 | 50.00% | 2 | 66.67% |
Total | 26 | 100.00% | 3 | 100.00% |
/*
 * Invoked from core watchdog init.
 *
 * Registers start_wd_on_cpu() and stop_wd_on_cpu() as the startup and
 * teardown callbacks of a dynamically allocated CPU-hotplug online state.
 *
 * Returns 0 on success, or the negative error code returned by
 * cpuhp_setup_state_nocalls() on failure.
 */
int __init watchdog_nmi_probe(void)
{
	int err;

	err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"powerpc/watchdog:online",
					start_wd_on_cpu, stop_wd_on_cpu);
	if (err < 0) {
		/* Terminate the record like every other message in this file. */
		pr_warn("could not be initialized\n");
		return err;
	}

	return 0;
}
Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 35 | 81.40% | 1 | 33.33% |
Thomas Gleixner | 7 | 16.28% | 1 | 33.33% |
Michael Ellerman | 1 | 2.33% | 1 | 33.33% |
Total | 43 | 100.00% | 3 | 100.00% |
Overall Contributors
Person | Tokens | Prop | Commits | CommitProp |
Nicholas Piggin | 1344 | 96.48% | 11 | 61.11% |
Thomas Gleixner | 24 | 1.72% | 3 | 16.67% |
Michael Ellerman | 20 | 1.44% | 2 | 11.11% |
Kees Cook | 4 | 0.29% | 1 | 5.56% |
Greg Kroah-Hartman | 1 | 0.07% | 1 | 5.56% |
Total | 1393 | 100.00% | 18 | 100.00% |
Information contained on this website is for historical information purposes only and does not indicate or represent copyright ownership.