From 6f2a1cd494fc877c123404a647f371434d47a2b2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 27 Feb 2015 15:20:37 +0100
Subject: [PATCH 204/365] x86/mce: use swait queue for mce wakeups

We had a customer report a lockup on a 3.0-rt kernel that had the
following backtrace:

[ffff88107fca3e80] rt_spin_lock_slowlock at ffffffff81499113
[ffff88107fca3f40] rt_spin_lock at ffffffff81499a56
[ffff88107fca3f50] __wake_up at ffffffff81043379
[ffff88107fca3f80] mce_notify_irq at ffffffff81017328
[ffff88107fca3f90] intel_threshold_interrupt at ffffffff81019508
[ffff88107fca3fa0] smp_threshold_interrupt at ffffffff81019fc1
[ffff88107fca3fb0] threshold_interrupt at ffffffff814a1853

It actually bugged because the lock was taken by the same owner that
already had that lock. What happened was the thread that was setting
itself on a wait queue had the lock when an MCE triggered. The MCE
interrupt does a wake up on its wait list and grabs the same lock.

NOTE: THIS IS NOT A BUG ON MAINLINE

Sorry for yelling, but as I Cc'd mainline maintainers I want them to
know that this is a PREEMPT_RT bug only. I only Cc'd them for advice.

On PREEMPT_RT the wait queue locks are converted from normal
"spin_locks" into an rt_mutex (see the rt_spin_lock_slowlock above).
These are not to be taken from hard interrupt context. This usually
isn't a problem, as almost all interrupts in PREEMPT_RT are converted
into schedulable threads. Unfortunately that's not the case with the
MCE irq.

As wait queue locks are notorious for long hold times, we cannot
convert them to raw_spin_locks without causing issues with -rt. But
Thomas has created a "simple-wait" structure that uses raw spin locks,
which would be a good fit here.
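
To make that idea concrete, here is a minimal sketch, not part of this
patch, of waking a simple wait queue from hard interrupt context and
doing the real work in a thread. It uses the swait API names as they
later appeared in mainline's <linux/swait.h> (exact names differ
between kernel versions), and the example_* identifiers are made up
purely for illustration:

  #include <linux/interrupt.h>
  #include <linux/kthread.h>
  #include <linux/swait.h>

  static DECLARE_SWAIT_QUEUE_HEAD(example_wait);
  static bool example_pending;

  /* hard irq context: the wakeup only takes a raw_spinlock internally */
  static irqreturn_t example_irq_handler(int irq, void *dev_id)
  {
          WRITE_ONCE(example_pending, true);
          swake_up_one(&example_wait);
          return IRQ_HANDLED;
  }

  /* kthread, process context: sleeps until the irq handler signals it */
  static int example_thread_fn(void *data)
  {
          while (!kthread_should_stop()) {
                  if (swait_event_interruptible_exclusive(example_wait,
                                          READ_ONCE(example_pending)))
                          continue;       /* interrupted, try again */
                  WRITE_ONCE(example_pending, false);
                  /* do the deferred (sleepable) work here */
          }
          return 0;
  }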

Unfortunately, wait queues are not the only issue, as mce_notify_irq
also does a schedule_work(), which grabs the workqueue spin locks that
have the exact same issue.

Thus, the patch I'm proposing moves the actual work of the MCE
interrupt into a helper thread that is woken up by the MCE interrupt
and does the work in a schedulable context.
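
In terms of the work-simple (swork) framework used below, that
deferral boils down to the following shape. This is only a sketch, not
part of the patch; the example_* names are made up, but the
swork_get(), INIT_SWORK() and swork_queue() calls mirror the ones in
the hunk below:

  #include <linux/swork.h>      /* -rt only: work-simple framework */

  static struct swork_event example_event;

  /* runs in the swork kthread, i.e. in schedulable context */
  static void example_deferred(struct swork_event *event)
  {
          /* safe to take sleeping locks, wake wait queues, etc. */
  }

  /* one-time setup from process context */
  static int example_init(void)
  {
          int err = swork_get();  /* make sure the swork thread is up */

          if (err)
                  return err;
          INIT_SWORK(&example_event, example_deferred);
          return 0;
  }

  /* the interrupt side only queues the event and returns */
  static void example_irq_path(void)
  {
          swork_queue(&example_event);
  }

That is the shape of the CONFIG_PREEMPT_RT_FULL side of the diff: the
interrupt path only queues, and everything that might sleep runs from
the swork thread instead.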

NOTE: THIS PATCH ONLY CHANGES THE BEHAVIOR WHEN PREEMPT_RT IS SET

Oops, sorry for yelling again, but I want to stress that I keep the
same behavior as mainline when PREEMPT_RT is not set. Thus, this only
changes the MCE behavior when PREEMPT_RT is configured.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
[bigeasy@linutronix: make mce_notify_work() a proper prototype, use
 kthread_run()]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
[wagi: use work-simple framework to defer work to a kthread]
Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 68 ++++++++++++++++++++++++++------
 1 file changed, 56 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6af5d3590271..decbe286b5db 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -42,6 +42,7 @@
 #include <linux/irq_work.h>
 #include <linux/export.h>
 #include <linux/jiffies.h>
+#include <linux/swork.h>
 #include <linux/jump_label.h>
 
 #include <asm/processor.h>
@@ -1445,6 +1446,56 @@ static void mce_do_trigger(struct work_struct *work)
 
 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
 
+static void __mce_notify_work(struct swork_event *event)
+{
+	/* Not more than two messages every minute */
+	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+	/* wake processes polling /dev/mcelog */
+	wake_up_interruptible(&mce_chrdev_wait);
+
+	/*
+	 * There is no risk of missing notifications because
+	 * work_pending is always cleared before the function is
+	 * executed.
+	 */
+	if (mce_helper[0] && !work_pending(&mce_trigger_work))
+		schedule_work(&mce_trigger_work);
+
+	if (__ratelimit(&ratelimit))
+		pr_info(HW_ERR "Machine check events logged\n");
+}
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+static bool notify_work_ready __read_mostly;
+static struct swork_event notify_work;
+
+static int mce_notify_work_init(void)
+{
+	int err;
+
+	err = swork_get();
+	if (err)
+		return err;
+
+	INIT_SWORK(&notify_work, __mce_notify_work);
+	notify_work_ready = true;
+	return 0;
+}
+
+static void mce_notify_work(void)
+{
+	if (notify_work_ready)
+		swork_queue(&notify_work);
+}
+#else
+static void mce_notify_work(void)
+{
+	__mce_notify_work(NULL);
+}
+static inline int mce_notify_work_init(void) { return 0; }
+#endif
+
 /*
  * Notify the user(s) about new machine check events.
  * Can be called from interrupt context, but not from machine check/NMI
@@ -1452,19 +1503,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
  */
 int mce_notify_irq(void)
 {
-	/* Not more than two messages every minute */
-	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
 	if (test_and_clear_bit(0, &mce_need_notify)) {
-		/* wake processes polling /dev/mcelog */
-		wake_up_interruptible(&mce_chrdev_wait);
-
-		if (mce_helper[0])
-			schedule_work(&mce_trigger_work);
-
-		if (__ratelimit(&ratelimit))
-			pr_info(HW_ERR "Machine check events logged\n");
-
+		mce_notify_work();
 		return 1;
 	}
 	return 0;
@@ -2601,6 +2641,10 @@ static __init int mcheck_init_device(void)
 		goto err_out;
 	}
 
+	err = mce_notify_work_init();
+	if (err)
+		goto err_out;
+
 	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
 		err = -ENOMEM;
 		goto err_out;
-- 
2.28.0