137 lines
5.6 KiB
Diff
137 lines
5.6 KiB
Diff
|
From d9b35625c1c2a18cab7b4a6959f4f257b751b8ac Mon Sep 17 00:00:00 2001
|
||
|
From: Thomas Gleixner <tglx@linutronix.de>
|
||
|
Date: Wed, 7 Mar 2012 20:51:03 +0100
|
||
|
Subject: [PATCH 215/365] rt: Introduce cpu_chill()
|
||
|
|
||
|
Retry loops on RT might loop forever when the modifying side was
|
||
|
preempted. Add cpu_chill() to replace cpu_relax(). cpu_chill()
|
||
|
defaults to cpu_relax() for non RT. On RT it puts the looping task to
|
||
|
sleep for a tick so the preempted task can make progress.
|
||
|
|
||
|
Steven Rostedt changed it to use a hrtimer instead of msleep():
|
||
|
|
|
||
|
|Ulrich Obergfell pointed out that cpu_chill() calls msleep() which is woken
|
||
|
|up by the ksoftirqd running the TIMER softirq. But as the cpu_chill() is
|
||
|
|called from softirq context, it may block the ksoftirqd() from running, in
|
||
|
|which case, it may never wake up the msleep() causing the deadlock.
|
||
|
|
|
||
|
|I checked the vmcore, and irq/74-qla2xxx is stuck in the msleep() call,
|
||
|
|running on CPU 8. The one ksoftirqd that is stuck, happens to be the one that
|
||
|
|runs on CPU 8, and it is blocked on a lock held by irq/74-qla2xxx. As that
|
||
|
|ksoftirqd is the one that will wake up irq/74-qla2xxx, and it happens to be
|
||
|
|blocked on a lock that irq/74-qla2xxx holds, we have our deadlock.
|
||
|
|
|
||
|
|The solution is not to convert the cpu_chill() back to a cpu_relax() as that
|
||
|
|will re-create a possible live lock that the cpu_chill() fixed earlier, and may
|
||
|
|also leave this bug open on other softirqs. The fix is to remove the
|
||
|
|dependency on ksoftirqd from cpu_chill(). That is, instead of calling
|
||
|
|msleep() that requires ksoftirqd to wake it up, use the
|
||
|
|hrtimer_nanosleep() code that does the wakeup from hard irq context.
|
||
|
|
|
||
|
||Looks to be the lock of the block softirq. I don't have the core dump
|
||
|
||anymore, but from what I could tell the ksoftirqd was blocked on the
|
||
|
||block softirq lock, where the block softirq handler did a msleep
|
||
|
||(called by the qla2xxx interrupt handler).
|
||
|
||
|
||
|
||Looking at trigger_softirq() in block/blk-softirq.c, it can do a
|
||
|
||smp_call_function() to another cpu to run the block softirq. If that
|
||
|
||happens to be the cpu where the qla2xxx irq handler is doing the block
|
||
|
||softirq and is in the middle of a msleep(), I believe the ksoftirqd will
|
||
|
||try to run the softirq. If it does that, then BOOM, it's deadlocked
|
||
|
||because the ksoftirqd will never run the timer softirq either.
|
||
|
|
|
||
|
||I should have also stated that it was only one lock that was involved.
|
||
|
||But the lock owner was doing a msleep() that requires a wakeup by
|
||
|
||ksoftirqd to continue. If ksoftirqd happens to be blocked on a lock
|
||
|
||held by the msleep() caller, then you have your deadlock.
|
||
|
||
|
||
|
||It's best not to have any softirqs going to sleep requiring another
|
||
|
||softirq to wake it up. Note, if we ever require a timer softirq to do a
|
||
|
||cpu_chill() it will most definitely hit this deadlock.
|
||
|
|
||
|
+ bigeasy: add PF_NOFREEZE:
|
||
|
| [....] Waiting for /dev to be fully populated...
|
||
|
| =====================================
|
||
|
| [ BUG: udevd/229 still has locks held! ]
|
||
|
| 3.12.11-rt17 #23 Not tainted
|
||
|
| -------------------------------------
|
||
|
| 1 lock held by udevd/229:
|
||
|
| #0: (&type->i_mutex_dir_key#2){+.+.+.}, at: lookup_slow+0x28/0x98
|
||
|
|
|
||
|
| stack backtrace:
|
||
|
| CPU: 0 PID: 229 Comm: udevd Not tainted 3.12.11-rt17 #23
|
||
|
| (unwind_backtrace+0x0/0xf8) from (show_stack+0x10/0x14)
|
||
|
| (show_stack+0x10/0x14) from (dump_stack+0x74/0xbc)
|
||
|
| (dump_stack+0x74/0xbc) from (do_nanosleep+0x120/0x160)
|
||
|
| (do_nanosleep+0x120/0x160) from (hrtimer_nanosleep+0x90/0x110)
|
||
|
| (hrtimer_nanosleep+0x90/0x110) from (cpu_chill+0x30/0x38)
|
||
|
| (cpu_chill+0x30/0x38) from (dentry_kill+0x158/0x1ec)
|
||
|
| (dentry_kill+0x158/0x1ec) from (dput+0x74/0x15c)
|
||
|
| (dput+0x74/0x15c) from (lookup_real+0x4c/0x50)
|
||
|
| (lookup_real+0x4c/0x50) from (__lookup_hash+0x34/0x44)
|
||
|
| (__lookup_hash+0x34/0x44) from (lookup_slow+0x38/0x98)
|
||
|
| (lookup_slow+0x38/0x98) from (path_lookupat+0x208/0x7fc)
|
||
|
| (path_lookupat+0x208/0x7fc) from (filename_lookup+0x20/0x60)
|
||
|
| (filename_lookup+0x20/0x60) from (user_path_at_empty+0x50/0x7c)
|
||
|
| (user_path_at_empty+0x50/0x7c) from (user_path_at+0x14/0x1c)
|
||
|
| (user_path_at+0x14/0x1c) from (vfs_fstatat+0x48/0x94)
|
||
|
| (vfs_fstatat+0x48/0x94) from (SyS_stat64+0x14/0x30)
|
||
|
| (SyS_stat64+0x14/0x30) from (ret_fast_syscall+0x0/0x48)
|
||
|
|
||
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
||
|
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
|
||
|
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
|
||
|
---
|
||
|
include/linux/delay.h | 6 ++++++
|
||
|
kernel/time/hrtimer.c | 19 +++++++++++++++++++
|
||
|
2 files changed, 25 insertions(+)
|
||
|
|
||
|
diff --git a/include/linux/delay.h b/include/linux/delay.h
|
||
|
index a6ecb34cf547..37caab306336 100644
|
||
|
--- a/include/linux/delay.h
|
||
|
+++ b/include/linux/delay.h
|
||
|
@@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
|
||
|
msleep(seconds * 1000);
|
||
|
}
|
||
|
|
||
|
+#ifdef CONFIG_PREEMPT_RT_FULL
|
||
|
+extern void cpu_chill(void);
|
||
|
+#else
|
||
|
+# define cpu_chill() cpu_relax()
|
||
|
+#endif
|
||
|
+
|
||
|
#endif /* defined(_LINUX_DELAY_H) */
|
||
|
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
|
||
|
index 34c38da46ad2..a652e5bcbc37 100644
|
||
|
--- a/kernel/time/hrtimer.c
|
||
|
+++ b/kernel/time/hrtimer.c
|
||
|
@@ -1788,6 +1788,25 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
|
||
|
return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
|
||
|
}
|
||
|
|
||
|
+#ifdef CONFIG_PREEMPT_RT_FULL
|
||
|
+/*
|
||
|
+ * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
|
||
|
+ */
|
||
|
+void cpu_chill(void)
|
||
|
+{
|
||
|
+ struct timespec tu = {
|
||
|
+ .tv_nsec = NSEC_PER_MSEC,
|
||
|
+ };
|
||
|
+ unsigned int freeze_flag = current->flags & PF_NOFREEZE;
|
||
|
+
|
||
|
+ current->flags |= PF_NOFREEZE;
|
||
|
+ hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
|
||
|
+ if (!freeze_flag)
|
||
|
+ current->flags &= ~PF_NOFREEZE;
|
||
|
+}
|
||
|
+EXPORT_SYMBOL(cpu_chill);
|
||
|
+#endif
|
||
|
+
|
||
|
/*
|
||
|
* Functions related to boot-time initialization:
|
||
|
*/
|
||
|
--
|
||
|
2.28.0
|
||
|
|