376 lines
11 KiB
Diff
376 lines
11 KiB
Diff
From a6321fdbb0edb62c166a0bc2fcef12040badf124 Mon Sep 17 00:00:00 2001
|
|
From: Peter Zijlstra <peterz@infradead.org>
|
|
Date: Wed, 22 Mar 2017 11:35:52 +0100
|
|
Subject: [PATCH 009/365] futex: Change locking rules
|
|
|
|
Upstream commit 734009e96d1983ad739e5b656e03430b3660c913
|
|
|
|
Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
|
|
creates another set of problems, especially priority inversions on RT where
|
|
hb->lock becomes a rt_mutex itself.
|
|
|
|
The rt_mutex::wait_lock is the most obvious protection for keeping the
|
|
futex user space value and the kernel internal pi_state in sync.
|
|
|
|
Rework and document the locking so rt_mutex::wait_lock is held across all
|
|
operations which modify the user space value and the pi state.
|
|
|
|
This allows invoking rt_mutex_unlock() (including deboost) without holding
|
|
hb->lock as a next step.
|
|
|
|
Nothing yet relies on the new locking rules.
|
|
|
|
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
|
|
Cc: juri.lelli@arm.com
|
|
Cc: bigeasy@linutronix.de
|
|
Cc: xlpang@redhat.com
|
|
Cc: rostedt@goodmis.org
|
|
Cc: mathieu.desnoyers@efficios.com
|
|
Cc: jdesfossez@efficios.com
|
|
Cc: dvhart@infradead.org
|
|
Cc: bristot@redhat.com
|
|
Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
|
|
---
|
|
kernel/futex.c | 165 +++++++++++++++++++++++++++++++++++++++----------
|
|
1 file changed, 132 insertions(+), 33 deletions(-)
|
|
|
|
diff --git a/kernel/futex.c b/kernel/futex.c
|
|
index c126218ac5ed..c0d5e64b37b2 100644
|
|
--- a/kernel/futex.c
|
|
+++ b/kernel/futex.c
|
|
@@ -989,6 +989,39 @@ void exit_pi_state_list(struct task_struct *curr)
|
|
*
|
|
* [10] There is no transient state which leaves owner and user space
|
|
* TID out of sync.
|
|
+ *
|
|
+ *
|
|
+ * Serialization and lifetime rules:
|
|
+ *
|
|
+ * hb->lock:
|
|
+ *
|
|
+ * hb -> futex_q, relation
|
|
+ * futex_q -> pi_state, relation
|
|
+ *
|
|
+ * (cannot be raw because hb can contain arbitrary amount
|
|
+ * of futex_q's)
|
|
+ *
|
|
+ * pi_mutex->wait_lock:
|
|
+ *
|
|
+ * {uval, pi_state}
|
|
+ *
|
|
+ * (and pi_mutex 'obviously')
|
|
+ *
|
|
+ * p->pi_lock:
|
|
+ *
|
|
+ * p->pi_state_list -> pi_state->list, relation
|
|
+ *
|
|
+ * pi_state->refcount:
|
|
+ *
|
|
+ * pi_state lifetime
|
|
+ *
|
|
+ *
|
|
+ * Lock order:
|
|
+ *
|
|
+ * hb->lock
|
|
+ * pi_mutex->wait_lock
|
|
+ * p->pi_lock
|
|
+ *
|
|
*/
|
|
|
|
/*
|
|
@@ -996,10 +1029,12 @@ void exit_pi_state_list(struct task_struct *curr)
|
|
* the pi_state against the user space value. If correct, attach to
|
|
* it.
|
|
*/
|
|
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
|
|
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
|
|
+ struct futex_pi_state *pi_state,
|
|
struct futex_pi_state **ps)
|
|
{
|
|
pid_t pid = uval & FUTEX_TID_MASK;
|
|
+ int ret, uval2;
|
|
|
|
/*
|
|
* Userspace might have messed up non-PI and PI futexes [3]
|
|
@@ -1007,8 +1042,33 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
|
|
if (unlikely(!pi_state))
|
|
return -EINVAL;
|
|
|
|
+ /*
|
|
+ * We get here with hb->lock held, and having found a
|
|
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
|
|
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
|
|
+ * which in turn means that futex_lock_pi() still has a reference on
|
|
+ * our pi_state.
|
|
+ */
|
|
WARN_ON(!atomic_read(&pi_state->refcount));
|
|
|
|
+ /*
|
|
+ * Now that we have a pi_state, we can acquire wait_lock
|
|
+ * and do the state validation.
|
|
+ */
|
|
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
|
|
+
|
|
+ /*
|
|
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
|
|
+ * uval was read without holding it, it can have changed. Verify it
|
|
+ * still is what we expect it to be, otherwise retry the entire
|
|
+ * operation.
|
|
+ */
|
|
+ if (get_futex_value_locked(&uval2, uaddr))
|
|
+ goto out_efault;
|
|
+
|
|
+ if (uval != uval2)
|
|
+ goto out_eagain;
|
|
+
|
|
/*
|
|
* Handle the owner died case:
|
|
*/
|
|
@@ -1024,11 +1084,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
|
|
* is not 0. Inconsistent state. [5]
|
|
*/
|
|
if (pid)
|
|
- return -EINVAL;
|
|
+ goto out_einval;
|
|
/*
|
|
* Take a ref on the state and return success. [4]
|
|
*/
|
|
- goto out_state;
|
|
+ goto out_attach;
|
|
}
|
|
|
|
/*
|
|
@@ -1040,14 +1100,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
|
|
* Take a ref on the state and return success. [6]
|
|
*/
|
|
if (!pid)
|
|
- goto out_state;
|
|
+ goto out_attach;
|
|
} else {
|
|
/*
|
|
* If the owner died bit is not set, then the pi_state
|
|
* must have an owner. [7]
|
|
*/
|
|
if (!pi_state->owner)
|
|
- return -EINVAL;
|
|
+ goto out_einval;
|
|
}
|
|
|
|
/*
|
|
@@ -1056,11 +1116,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
|
|
* user space TID. [9/10]
|
|
*/
|
|
if (pid != task_pid_vnr(pi_state->owner))
|
|
- return -EINVAL;
|
|
-out_state:
|
|
+ goto out_einval;
|
|
+
|
|
+out_attach:
|
|
atomic_inc(&pi_state->refcount);
|
|
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
|
|
*ps = pi_state;
|
|
return 0;
|
|
+
|
|
+out_einval:
|
|
+ ret = -EINVAL;
|
|
+ goto out_error;
|
|
+
|
|
+out_eagain:
|
|
+ ret = -EAGAIN;
|
|
+ goto out_error;
|
|
+
|
|
+out_efault:
|
|
+ ret = -EFAULT;
|
|
+ goto out_error;
|
|
+
|
|
+out_error:
|
|
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
|
|
+ return ret;
|
|
}
|
|
|
|
/*
|
|
@@ -1111,6 +1189,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
|
|
|
|
/*
|
|
* No existing pi state. First waiter. [2]
|
|
+ *
|
|
+ * This creates pi_state, we have hb->lock held, this means nothing can
|
|
+ * observe this state, wait_lock is irrelevant.
|
|
*/
|
|
pi_state = alloc_pi_state();
|
|
|
|
@@ -1135,7 +1216,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
|
|
return 0;
|
|
}
|
|
|
|
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
|
|
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
|
|
+ struct futex_hash_bucket *hb,
|
|
union futex_key *key, struct futex_pi_state **ps)
|
|
{
|
|
struct futex_q *top_waiter = futex_top_waiter(hb, key);
|
|
@@ -1145,7 +1227,7 @@ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
|
|
* attach to the pi_state when the validation succeeds.
|
|
*/
|
|
if (top_waiter)
|
|
- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
|
|
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
|
|
|
|
/*
|
|
* We are the first waiter - try to look up the owner based on
|
|
@@ -1164,7 +1246,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
|
|
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
|
|
return -EFAULT;
|
|
|
|
- /*If user space value changed, let the caller retry */
|
|
+ /* If user space value changed, let the caller retry */
|
|
return curval != uval ? -EAGAIN : 0;
|
|
}
|
|
|
|
@@ -1220,7 +1302,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
|
|
*/
|
|
top_waiter = futex_top_waiter(hb, key);
|
|
if (top_waiter)
|
|
- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
|
|
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
|
|
|
|
/*
|
|
* No waiter and user TID is 0. We are here because the
|
|
@@ -1352,6 +1434,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
|
|
|
|
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
|
|
ret = -EFAULT;
|
|
+
|
|
} else if (curval != uval) {
|
|
/*
|
|
* If a unconditional UNLOCK_PI operation (user space did not
|
|
@@ -1364,6 +1447,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
|
|
else
|
|
ret = -EINVAL;
|
|
}
|
|
+
|
|
if (ret) {
|
|
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
|
|
return ret;
|
|
@@ -1889,7 +1973,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
|
|
* If that call succeeds then we have pi_state and an
|
|
* initial refcount on it.
|
|
*/
|
|
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
|
|
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
|
|
}
|
|
|
|
switch (ret) {
|
|
@@ -2188,10 +2272,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
|
|
{
|
|
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
|
|
struct futex_pi_state *pi_state = q->pi_state;
|
|
- struct task_struct *oldowner = pi_state->owner;
|
|
u32 uval, uninitialized_var(curval), newval;
|
|
+ struct task_struct *oldowner;
|
|
int ret;
|
|
|
|
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
|
|
+
|
|
+ oldowner = pi_state->owner;
|
|
/* Owner died? */
|
|
if (!pi_state->owner)
|
|
newtid |= FUTEX_OWNER_DIED;
|
|
@@ -2207,11 +2294,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
|
|
* because we can fault here. Imagine swapped out pages or a fork
|
|
* that marked all the anonymous memory readonly for cow.
|
|
*
|
|
- * Modifying pi_state _before_ the user space value would
|
|
- * leave the pi_state in an inconsistent state when we fault
|
|
- * here, because we need to drop the hash bucket lock to
|
|
- * handle the fault. This might be observed in the PID check
|
|
- * in lookup_pi_state.
|
|
+ * Modifying pi_state _before_ the user space value would leave the
|
|
+ * pi_state in an inconsistent state when we fault here, because we
|
|
+ * need to drop the locks to handle the fault. This might be observed
|
|
+ * in the PID check in lookup_pi_state.
|
|
*/
|
|
retry:
|
|
if (get_futex_value_locked(&uval, uaddr))
|
|
@@ -2232,47 +2318,60 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
|
|
* itself.
|
|
*/
|
|
if (pi_state->owner != NULL) {
|
|
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
|
|
+ raw_spin_lock(&pi_state->owner->pi_lock);
|
|
WARN_ON(list_empty(&pi_state->list));
|
|
list_del_init(&pi_state->list);
|
|
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
|
|
+ raw_spin_unlock(&pi_state->owner->pi_lock);
|
|
}
|
|
|
|
pi_state->owner = newowner;
|
|
|
|
- raw_spin_lock_irq(&newowner->pi_lock);
|
|
+ raw_spin_lock(&newowner->pi_lock);
|
|
WARN_ON(!list_empty(&pi_state->list));
|
|
list_add(&pi_state->list, &newowner->pi_state_list);
|
|
- raw_spin_unlock_irq(&newowner->pi_lock);
|
|
+ raw_spin_unlock(&newowner->pi_lock);
|
|
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
|
|
+
|
|
return 0;
|
|
|
|
/*
|
|
- * To handle the page fault we need to drop the hash bucket
|
|
- * lock here. That gives the other task (either the highest priority
|
|
- * waiter itself or the task which stole the rtmutex) the
|
|
- * chance to try the fixup of the pi_state. So once we are
|
|
- * back from handling the fault we need to check the pi_state
|
|
- * after reacquiring the hash bucket lock and before trying to
|
|
- * do another fixup. When the fixup has been done already we
|
|
- * simply return.
|
|
+ * To handle the page fault we need to drop the locks here. That gives
|
|
+ * the other task (either the highest priority waiter itself or the
|
|
+ * task which stole the rtmutex) the chance to try the fixup of the
|
|
+ * pi_state. So once we are back from handling the fault we need to
|
|
+ * check the pi_state after reacquiring the locks and before trying to
|
|
+ * do another fixup. When the fixup has been done already we simply
|
|
+ * return.
|
|
+ *
|
|
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
|
|
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
|
|
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
|
|
*/
|
|
handle_fault:
|
|
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
|
|
spin_unlock(q->lock_ptr);
|
|
|
|
ret = fault_in_user_writeable(uaddr);
|
|
|
|
spin_lock(q->lock_ptr);
|
|
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
|
|
|
|
/*
|
|
* Check if someone else fixed it for us:
|
|
*/
|
|
- if (pi_state->owner != oldowner)
|
|
- return 0;
|
|
+ if (pi_state->owner != oldowner) {
|
|
+ ret = 0;
|
|
+ goto out_unlock;
|
|
+ }
|
|
|
|
if (ret)
|
|
- return ret;
|
|
+ goto out_unlock;
|
|
|
|
goto retry;
|
|
+
|
|
+out_unlock:
|
|
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
|
|
+ return ret;
|
|
}
|
|
|
|
static long futex_wait_restart(struct restart_block *restart);
|
|
--
|
|
2.28.0
|
|
|