// SPDX-License-Identifier: GPL-2.0-or-later12#include <linux/plist.h>3#include <linux/sched/signal.h>45#include "futex.h"6#include "../locking/rtmutex_common.h"78/*9* On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an10* underlying rtmutex. The task which is about to be requeued could have11* just woken up (timeout, signal). After the wake up the task has to12* acquire hash bucket lock, which is held by the requeue code. As a task13* can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking14* and the hash bucket lock blocking would collide and corrupt state.15*16* On !PREEMPT_RT this is not a problem and everything could be serialized17* on hash bucket lock, but aside of having the benefit of common code,18* this allows to avoid doing the requeue when the task is already on the19* way out and taking the hash bucket lock of the original uaddr1 when the20* requeue has been completed.21*22* The following state transitions are valid:23*24* On the waiter side:25* Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE26* Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT27*28* On the requeue side:29* Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS30* Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED31* Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)32* Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED33* Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)34*35* The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this36* signals that the waiter is already on the way out. It also means that37* the waiter is still on the 'wait' futex, i.e. uaddr1.38*39* The waiter side signals early wakeup to the requeue side either through40* setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending41* on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately42* proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,43* which means the wakeup is interleaving with a requeue in progress it has44* to wait for the requeue side to change the state. Either to DONE/LOCKED45* or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex46* and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by47* the requeue side when the requeue attempt failed via deadlock detection48* and therefore the waiter q is still on the uaddr1 futex.49*/50enum {51Q_REQUEUE_PI_NONE = 0,52Q_REQUEUE_PI_IGNORE,53Q_REQUEUE_PI_IN_PROGRESS,54Q_REQUEUE_PI_WAIT,55Q_REQUEUE_PI_DONE,56Q_REQUEUE_PI_LOCKED,57};5859const struct futex_q futex_q_init = {60/* list gets initialized in futex_queue()*/61.wake = futex_wake_mark,62.key = FUTEX_KEY_INIT,63.bitset = FUTEX_BITSET_MATCH_ANY,64.requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),65};6667/**68* requeue_futex() - Requeue a futex_q from one hb to another69* @q: the futex_q to requeue70* @hb1: the source hash_bucket71* @hb2: the target hash_bucket72* @key2: the new key for the requeued futex_q73*/74static inline75void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,76struct futex_hash_bucket *hb2, union futex_key *key2)77{7879/*80* If key1 and key2 hash to the same bucket, no need to81* requeue.82*/83if (likely(&hb1->chain != &hb2->chain)) {84plist_del(&q->list, &hb1->chain);85futex_hb_waiters_dec(hb1);86futex_hb_waiters_inc(hb2);87plist_add(&q->list, &hb2->chain);88q->lock_ptr = &hb2->lock;89/*90* hb1 and hb2 belong to the same futex_hash_bucket_private91* because if we managed get a reference on hb1 then it can't be92* replaced. Therefore we avoid put(hb1)+get(hb2) here.93*/94}95q->key = *key2;96}9798static inline bool futex_requeue_pi_prepare(struct futex_q *q,99struct futex_pi_state *pi_state)100{101int old, new;102103/*104* Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has105* already set Q_REQUEUE_PI_IGNORE to signal that requeue should106* ignore the waiter.107*/108old = atomic_read_acquire(&q->requeue_state);109do {110if (old == Q_REQUEUE_PI_IGNORE)111return false;112113/*114* futex_proxy_trylock_atomic() might have set it to115* IN_PROGRESS and a interleaved early wake to WAIT.116*117* It was considered to have an extra state for that118* trylock, but that would just add more conditionals119* all over the place for a dubious value.120*/121if (old != Q_REQUEUE_PI_NONE)122break;123124new = Q_REQUEUE_PI_IN_PROGRESS;125} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));126127q->pi_state = pi_state;128return true;129}130131static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)132{133int old, new;134135old = atomic_read_acquire(&q->requeue_state);136do {137if (old == Q_REQUEUE_PI_IGNORE)138return;139140if (locked >= 0) {141/* Requeue succeeded. Set DONE or LOCKED */142WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&143old != Q_REQUEUE_PI_WAIT);144new = Q_REQUEUE_PI_DONE + locked;145} else if (old == Q_REQUEUE_PI_IN_PROGRESS) {146/* Deadlock, no early wakeup interleave */147new = Q_REQUEUE_PI_NONE;148} else {149/* Deadlock, early wakeup interleave. */150WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);151new = Q_REQUEUE_PI_IGNORE;152}153} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));154155#ifdef CONFIG_PREEMPT_RT156/* If the waiter interleaved with the requeue let it know */157if (unlikely(old == Q_REQUEUE_PI_WAIT))158rcuwait_wake_up(&q->requeue_wait);159#endif160}161162static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)163{164int old, new;165166old = atomic_read_acquire(&q->requeue_state);167do {168/* Is requeue done already? */169if (old >= Q_REQUEUE_PI_DONE)170return old;171172/*173* If not done, then tell the requeue code to either ignore174* the waiter or to wake it up once the requeue is done.175*/176new = Q_REQUEUE_PI_WAIT;177if (old == Q_REQUEUE_PI_NONE)178new = Q_REQUEUE_PI_IGNORE;179} while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));180181/* If the requeue was in progress, wait for it to complete */182if (old == Q_REQUEUE_PI_IN_PROGRESS) {183#ifdef CONFIG_PREEMPT_RT184rcuwait_wait_event(&q->requeue_wait,185atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,186TASK_UNINTERRUPTIBLE);187#else188(void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);189#endif190}191192/*193* Requeue is now either prohibited or complete. Reread state194* because during the wait above it might have changed. Nothing195* will modify q->requeue_state after this point.196*/197return atomic_read(&q->requeue_state);198}199200/**201* requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue202* @q: the futex_q203* @key: the key of the requeue target futex204* @hb: the hash_bucket of the requeue target futex205*206* During futex_requeue, with requeue_pi=1, it is possible to acquire the207* target futex if it is uncontended or via a lock steal.208*209* 1) Set @q::key to the requeue target futex key so the waiter can detect210* the wakeup on the right futex.211*212* 2) Dequeue @q from the hash bucket.213*214* 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock215* acquisition.216*217* 4) Set the q->lock_ptr to the requeue target hb->lock for the case that218* the waiter has to fixup the pi state.219*220* 5) Complete the requeue state so the waiter can make progress. After221* this point the waiter task can return from the syscall immediately in222* case that the pi state does not have to be fixed up.223*224* 6) Wake the waiter task.225*226* Must be called with both q->lock_ptr and hb->lock held.227*/228static inline229void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,230struct futex_hash_bucket *hb)231{232struct task_struct *task;233234q->key = *key;235__futex_unqueue(q);236237WARN_ON(!q->rt_waiter);238q->rt_waiter = NULL;239/*240* Acquire a reference for the waiter to ensure valid241* futex_q::lock_ptr.242*/243futex_hash_get(hb);244q->drop_hb_ref = true;245q->lock_ptr = &hb->lock;246task = READ_ONCE(q->task);247248/* Signal locked state to the waiter */249futex_requeue_pi_complete(q, 1);250wake_up_state(task, TASK_NORMAL);251}252253/**254* futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter255* @pifutex: the user address of the to futex256* @hb1: the from futex hash bucket, must be locked by the caller257* @hb2: the to futex hash bucket, must be locked by the caller258* @key1: the from futex key259* @key2: the to futex key260* @ps: address to store the pi_state pointer261* @exiting: Pointer to store the task pointer of the owner task262* which is in the middle of exiting263* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)264*265* Try and get the lock on behalf of the top waiter if we can do it atomically.266* Wake the top waiter if we succeed. If the caller specified set_waiters,267* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.268* hb1 and hb2 must be held by the caller.269*270* @exiting is only set when the return value is -EBUSY. If so, this holds271* a refcount on the exiting task on return and the caller needs to drop it272* after waiting for the exit to complete.273*274* Return:275* - 0 - failed to acquire the lock atomically;276* - >0 - acquired the lock, return value is vpid of the top_waiter277* - <0 - error278*/279static int280futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,281struct futex_hash_bucket *hb2, union futex_key *key1,282union futex_key *key2, struct futex_pi_state **ps,283struct task_struct **exiting, int set_waiters)284{285struct futex_q *top_waiter;286u32 curval;287int ret;288289if (futex_get_value_locked(&curval, pifutex))290return -EFAULT;291292if (unlikely(should_fail_futex(true)))293return -EFAULT;294295/*296* Find the top_waiter and determine if there are additional waiters.297* If the caller intends to requeue more than 1 waiter to pifutex,298* force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,299* as we have means to handle the possible fault. If not, don't set300* the bit unnecessarily as it will force the subsequent unlock to enter301* the kernel.302*/303top_waiter = futex_top_waiter(hb1, key1);304305/* There are no waiters, nothing for us to do. */306if (!top_waiter)307return 0;308309/*310* Ensure that this is a waiter sitting in futex_wait_requeue_pi()311* and waiting on the 'waitqueue' futex which is always !PI.312*/313if (!top_waiter->rt_waiter || top_waiter->pi_state)314return -EINVAL;315316/* Ensure we requeue to the expected futex. */317if (!futex_match(top_waiter->requeue_pi_key, key2))318return -EINVAL;319320/* Ensure that this does not race against an early wakeup */321if (!futex_requeue_pi_prepare(top_waiter, NULL))322return -EAGAIN;323324/*325* Try to take the lock for top_waiter and set the FUTEX_WAITERS bit326* in the contended case or if @set_waiters is true.327*328* In the contended case PI state is attached to the lock owner. If329* the user space lock can be acquired then PI state is attached to330* the new owner (@top_waiter->task) when @set_waiters is true.331*/332ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,333exiting, set_waiters);334if (ret == 1) {335/*336* Lock was acquired in user space and PI state was337* attached to @top_waiter->task. That means state is fully338* consistent and the waiter can return to user space339* immediately after the wakeup.340*/341requeue_pi_wake_futex(top_waiter, key2, hb2);342} else if (ret < 0) {343/* Rewind top_waiter::requeue_state */344futex_requeue_pi_complete(top_waiter, ret);345} else {346/*347* futex_lock_pi_atomic() did not acquire the user space348* futex, but managed to establish the proxy lock and pi349* state. top_waiter::requeue_state cannot be fixed up here350* because the waiter is not enqueued on the rtmutex351* yet. This is handled at the callsite depending on the352* result of rt_mutex_start_proxy_lock() which is353* guaranteed to be reached with this function returning 0.354*/355}356return ret;357}358359/**360* futex_requeue() - Requeue waiters from uaddr1 to uaddr2361* @uaddr1: source futex user address362* @flags1: futex flags (FLAGS_SHARED, etc.)363* @uaddr2: target futex user address364* @flags2: futex flags (FLAGS_SHARED, etc.)365* @nr_wake: number of waiters to wake (must be 1 for requeue_pi)366* @nr_requeue: number of waiters to requeue (0-INT_MAX)367* @cmpval: @uaddr1 expected value (or %NULL)368* @requeue_pi: if we are attempting to requeue from a non-pi futex to a369* pi futex (pi to pi requeue is not supported)370*371* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire372* uaddr2 atomically on behalf of the top waiter.373*374* Return:375* - >=0 - on success, the number of tasks requeued or woken;376* - <0 - on error377*/378int futex_requeue(u32 __user *uaddr1, unsigned int flags1,379u32 __user *uaddr2, unsigned int flags2,380int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)381{382union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;383int task_count = 0, ret;384struct futex_pi_state *pi_state = NULL;385struct futex_q *this, *next;386DEFINE_WAKE_Q(wake_q);387388if (nr_wake < 0 || nr_requeue < 0)389return -EINVAL;390391/*392* When PI not supported: return -ENOSYS if requeue_pi is true,393* consequently the compiler knows requeue_pi is always false past394* this point which will optimize away all the conditional code395* further down.396*/397if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)398return -ENOSYS;399400if (requeue_pi) {401/*402* Requeue PI only works on two distinct uaddrs. This403* check is only valid for private futexes. See below.404*/405if (uaddr1 == uaddr2)406return -EINVAL;407408/*409* futex_requeue() allows the caller to define the number410* of waiters to wake up via the @nr_wake argument. With411* REQUEUE_PI, waking up more than one waiter is creating412* more problems than it solves. Waking up a waiter makes413* only sense if the PI futex @uaddr2 is uncontended as414* this allows the requeue code to acquire the futex415* @uaddr2 before waking the waiter. The waiter can then416* return to user space without further action. A secondary417* wakeup would just make the futex_wait_requeue_pi()418* handling more complex, because that code would have to419* look up pi_state and do more or less all the handling420* which the requeue code has to do for the to be requeued421* waiters. So restrict the number of waiters to wake to422* one, and only wake it up when the PI futex is423* uncontended. Otherwise requeue it and let the unlock of424* the PI futex handle the wakeup.425*426* All REQUEUE_PI users, e.g. pthread_cond_signal() and427* pthread_cond_broadcast() must use nr_wake=1.428*/429if (nr_wake != 1)430return -EINVAL;431432/*433* requeue_pi requires a pi_state, try to allocate it now434* without any locks in case it fails.435*/436if (refill_pi_state_cache())437return -ENOMEM;438}439440retry:441ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);442if (unlikely(ret != 0))443return ret;444ret = get_futex_key(uaddr2, flags2, &key2,445requeue_pi ? FUTEX_WRITE : FUTEX_READ);446if (unlikely(ret != 0))447return ret;448449/*450* The check above which compares uaddrs is not sufficient for451* shared futexes. We need to compare the keys:452*/453if (requeue_pi && futex_match(&key1, &key2))454return -EINVAL;455456retry_private:457if (1) {458CLASS(hb, hb1)(&key1);459CLASS(hb, hb2)(&key2);460461futex_hb_waiters_inc(hb2);462double_lock_hb(hb1, hb2);463464if (likely(cmpval != NULL)) {465u32 curval;466467ret = futex_get_value_locked(&curval, uaddr1);468469if (unlikely(ret)) {470futex_hb_waiters_dec(hb2);471double_unlock_hb(hb1, hb2);472473ret = get_user(curval, uaddr1);474if (ret)475return ret;476477if (!(flags1 & FLAGS_SHARED))478goto retry_private;479480goto retry;481}482if (curval != *cmpval) {483ret = -EAGAIN;484goto out_unlock;485}486}487488if (requeue_pi) {489struct task_struct *exiting = NULL;490491/*492* Attempt to acquire uaddr2 and wake the top waiter. If we493* intend to requeue waiters, force setting the FUTEX_WAITERS494* bit. We force this here where we are able to easily handle495* faults rather in the requeue loop below.496*497* Updates topwaiter::requeue_state if a top waiter exists.498*/499ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,500&key2, &pi_state,501&exiting, nr_requeue);502503/*504* At this point the top_waiter has either taken uaddr2 or505* is waiting on it. In both cases pi_state has been506* established and an initial refcount on it. In case of an507* error there's nothing.508*509* The top waiter's requeue_state is up to date:510*511* - If the lock was acquired atomically (ret == 1), then512* the state is Q_REQUEUE_PI_LOCKED.513*514* The top waiter has been dequeued and woken up and can515* return to user space immediately. The kernel/user516* space state is consistent. In case that there must be517* more waiters requeued the WAITERS bit in the user518* space futex is set so the top waiter task has to go519* into the syscall slowpath to unlock the futex. This520* will block until this requeue operation has been521* completed and the hash bucket locks have been522* dropped.523*524* - If the trylock failed with an error (ret < 0) then525* the state is either Q_REQUEUE_PI_NONE, i.e. "nothing526* happened", or Q_REQUEUE_PI_IGNORE when there was an527* interleaved early wakeup.528*529* - If the trylock did not succeed (ret == 0) then the530* state is either Q_REQUEUE_PI_IN_PROGRESS or531* Q_REQUEUE_PI_WAIT if an early wakeup interleaved.532* This will be cleaned up in the loop below, which533* cannot fail because futex_proxy_trylock_atomic() did534* the same sanity checks for requeue_pi as the loop535* below does.536*/537switch (ret) {538case 0:539/* We hold a reference on the pi state. */540break;541542case 1:543/*544* futex_proxy_trylock_atomic() acquired the user space545* futex. Adjust task_count.546*/547task_count++;548ret = 0;549break;550551/*552* If the above failed, then pi_state is NULL and553* waiter::requeue_state is correct.554*/555case -EFAULT:556futex_hb_waiters_dec(hb2);557double_unlock_hb(hb1, hb2);558ret = fault_in_user_writeable(uaddr2);559if (!ret)560goto retry;561return ret;562case -EBUSY:563case -EAGAIN:564/*565* Two reasons for this:566* - EBUSY: Owner is exiting and we just wait for the567* exit to complete.568* - EAGAIN: The user space value changed.569*/570futex_hb_waiters_dec(hb2);571double_unlock_hb(hb1, hb2);572/*573* Handle the case where the owner is in the middle of574* exiting. Wait for the exit to complete otherwise575* this task might loop forever, aka. live lock.576*/577wait_for_owner_exiting(ret, exiting);578cond_resched();579goto retry;580default:581goto out_unlock;582}583}584585plist_for_each_entry_safe(this, next, &hb1->chain, list) {586if (task_count - nr_wake >= nr_requeue)587break;588589if (!futex_match(&this->key, &key1))590continue;591592/*593* FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always594* be paired with each other and no other futex ops.595*596* We should never be requeueing a futex_q with a pi_state,597* which is awaiting a futex_unlock_pi().598*/599if ((requeue_pi && !this->rt_waiter) ||600(!requeue_pi && this->rt_waiter) ||601this->pi_state) {602ret = -EINVAL;603break;604}605606/* Plain futexes just wake or requeue and are done */607if (!requeue_pi) {608if (++task_count <= nr_wake)609this->wake(&wake_q, this);610else611requeue_futex(this, hb1, hb2, &key2);612continue;613}614615/* Ensure we requeue to the expected futex for requeue_pi. */616if (!futex_match(this->requeue_pi_key, &key2)) {617ret = -EINVAL;618break;619}620621/*622* Requeue nr_requeue waiters and possibly one more in the case623* of requeue_pi if we couldn't acquire the lock atomically.624*625* Prepare the waiter to take the rt_mutex. Take a refcount626* on the pi_state and store the pointer in the futex_q627* object of the waiter.628*/629get_pi_state(pi_state);630631/* Don't requeue when the waiter is already on the way out. */632if (!futex_requeue_pi_prepare(this, pi_state)) {633/*634* Early woken waiter signaled that it is on the635* way out. Drop the pi_state reference and try the636* next waiter. @this->pi_state is still NULL.637*/638put_pi_state(pi_state);639continue;640}641642ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,643this->rt_waiter,644this->task);645646if (ret == 1) {647/*648* We got the lock. We do neither drop the refcount649* on pi_state nor clear this->pi_state because the650* waiter needs the pi_state for cleaning up the651* user space value. It will drop the refcount652* after doing so. this::requeue_state is updated653* in the wakeup as well.654*/655requeue_pi_wake_futex(this, &key2, hb2);656task_count++;657} else if (!ret) {658/* Waiter is queued, move it to hb2 */659requeue_futex(this, hb1, hb2, &key2);660futex_requeue_pi_complete(this, 0);661task_count++;662} else {663/*664* rt_mutex_start_proxy_lock() detected a potential665* deadlock when we tried to queue that waiter.666* Drop the pi_state reference which we took above667* and remove the pointer to the state from the668* waiters futex_q object.669*/670this->pi_state = NULL;671put_pi_state(pi_state);672futex_requeue_pi_complete(this, ret);673/*674* We stop queueing more waiters and let user space675* deal with the mess.676*/677break;678}679}680681/*682* We took an extra initial reference to the pi_state in683* futex_proxy_trylock_atomic(). We need to drop it here again.684*/685put_pi_state(pi_state);686687out_unlock:688futex_hb_waiters_dec(hb2);689double_unlock_hb(hb1, hb2);690}691wake_up_q(&wake_q);692return ret ? ret : task_count;693}694695/**696* handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex697* @hb: the hash_bucket futex_q was original enqueued on698* @q: the futex_q woken while waiting to be requeued699* @timeout: the timeout associated with the wait (NULL if none)700*701* Determine the cause for the early wakeup.702*703* Return:704* -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR705*/706static inline707int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,708struct futex_q *q,709struct hrtimer_sleeper *timeout)710{711int ret;712713/*714* With the hb lock held, we avoid races while we process the wakeup.715* We only need to hold hb (and not hb2) to ensure atomicity as the716* wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.717* It can't be requeued from uaddr2 to something else since we don't718* support a PI aware source futex for requeue.719*/720WARN_ON_ONCE(&hb->lock != q->lock_ptr);721722/*723* We were woken prior to requeue by a timeout or a signal.724* Unqueue the futex_q and determine which it was.725*/726plist_del(&q->list, &hb->chain);727futex_hb_waiters_dec(hb);728729/* Handle spurious wakeups gracefully */730ret = -EWOULDBLOCK;731if (timeout && !timeout->task)732ret = -ETIMEDOUT;733else if (signal_pending(current))734ret = -ERESTARTNOINTR;735return ret;736}737738/**739* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2740* @uaddr: the futex we initially wait on (non-pi)741* @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be742* the same type, no requeueing from private to shared, etc.743* @val: the expected value of uaddr744* @abs_time: absolute timeout745* @bitset: 32 bit wakeup bitset set by userspace, defaults to all746* @uaddr2: the pi futex we will take prior to returning to user-space747*748* The caller will wait on uaddr and will be requeued by futex_requeue() to749* uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake750* on uaddr2 and complete the acquisition of the rt_mutex prior to returning to751* userspace. This ensures the rt_mutex maintains an owner when it has waiters;752* without one, the pi logic would not know which task to boost/deboost, if753* there was a need to.754*755* We call schedule in futex_wait_queue() when we enqueue and return there756* via the following--757* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()758* 2) wakeup on uaddr2 after a requeue759* 3) signal760* 4) timeout761*762* If 3, cleanup and return -ERESTARTNOINTR.763*764* If 2, we may then block on trying to take the rt_mutex and return via:765* 5) successful lock766* 6) signal767* 7) timeout768* 8) other lock acquisition failure769*770* If 6, return -EWOULDBLOCK (restarting the syscall would do the same).771*772* If 4 or 7, we cleanup and return with -ETIMEDOUT.773*774* Return:775* - 0 - On success;776* - <0 - On error777*/778int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,779u32 val, ktime_t *abs_time, u32 bitset,780u32 __user *uaddr2)781{782struct hrtimer_sleeper timeout, *to;783struct rt_mutex_waiter rt_waiter;784union futex_key key2 = FUTEX_KEY_INIT;785struct futex_q q = futex_q_init;786struct rt_mutex_base *pi_mutex;787int res, ret;788789if (!IS_ENABLED(CONFIG_FUTEX_PI))790return -ENOSYS;791792if (uaddr == uaddr2)793return -EINVAL;794795if (!bitset)796return -EINVAL;797798to = futex_setup_timer(abs_time, &timeout, flags,799current->timer_slack_ns);800801/*802* The waiter is allocated on our stack, manipulated by the requeue803* code while we sleep on uaddr.804*/805rt_mutex_init_waiter(&rt_waiter);806807ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);808if (unlikely(ret != 0))809goto out;810811q.bitset = bitset;812q.rt_waiter = &rt_waiter;813q.requeue_pi_key = &key2;814815/*816* Prepare to wait on uaddr. On success, it holds hb->lock and q817* is initialized.818*/819ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current);820if (ret)821goto out;822823/* Queue the futex_q, drop the hb lock, wait for wakeup. */824futex_do_wait(&q, to);825826switch (futex_requeue_pi_wakeup_sync(&q)) {827case Q_REQUEUE_PI_IGNORE:828{829CLASS(hb, hb)(&q.key);830/* The waiter is still on uaddr1 */831spin_lock(&hb->lock);832ret = handle_early_requeue_pi_wakeup(hb, &q, to);833spin_unlock(&hb->lock);834}835break;836837case Q_REQUEUE_PI_LOCKED:838/* The requeue acquired the lock */839if (q.pi_state && (q.pi_state->owner != current)) {840futex_q_lockptr_lock(&q);841ret = fixup_pi_owner(uaddr2, &q, true);842/*843* Drop the reference to the pi state which the844* requeue_pi() code acquired for us.845*/846put_pi_state(q.pi_state);847spin_unlock(q.lock_ptr);848/*849* Adjust the return value. It's either -EFAULT or850* success (1) but the caller expects 0 for success.851*/852ret = ret < 0 ? ret : 0;853}854break;855856case Q_REQUEUE_PI_DONE:857/* Requeue completed. Current is 'pi_blocked_on' the rtmutex */858pi_mutex = &q.pi_state->pi_mutex;859ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);860861/*862* See futex_unlock_pi()'s cleanup: comment.863*/864if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))865ret = 0;866867futex_q_lockptr_lock(&q);868debug_rt_mutex_free_waiter(&rt_waiter);869/*870* Fixup the pi_state owner and possibly acquire the lock if we871* haven't already.872*/873res = fixup_pi_owner(uaddr2, &q, !ret);874/*875* If fixup_pi_owner() returned an error, propagate that. If it876* acquired the lock, clear -ETIMEDOUT or -EINTR.877*/878if (res)879ret = (res < 0) ? res : 0;880881futex_unqueue_pi(&q);882spin_unlock(q.lock_ptr);883884if (ret == -EINTR) {885/*886* We've already been requeued, but cannot restart887* by calling futex_lock_pi() directly. We could888* restart this syscall, but it would detect that889* the user space "val" changed and return890* -EWOULDBLOCK. Save the overhead of the restart891* and return -EWOULDBLOCK directly.892*/893ret = -EWOULDBLOCK;894}895break;896default:897BUG();898}899if (q.drop_hb_ref) {900CLASS(hb, hb)(&q.key);901/* Additional reference from requeue_pi_wake_futex() */902futex_hash_put(hb);903}904905out:906if (to) {907hrtimer_cancel(&to->timer);908destroy_hrtimer_on_stack(&to->timer);909}910return ret;911}912913914915