// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

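/*
 * For orientation: the user space futex word validated below packs the
 * owner TID and two status bits (uapi values from <linux/futex.h>):
 *
 *	FUTEX_WAITERS		0x80000000 - kernel side waiters exist
 *	FUTEX_OWNER_DIED	0x40000000 - robust owner exited
 *	FUTEX_TID_MASK		0x3fffffff - TID of the owner
 *
 * Illustrative user space lock fast path (a sketch, not kernel code)
 * which produces the uTID values checked in the table below:
 *
 *	if (cmpxchg(uaddr, 0, gettid()) != 0)
 *		futex(uaddr, FUTEX_LOCK_PI, ...);
 */
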
/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex.
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list().
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match.
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4].
 *
 * [10]	There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain an arbitrary number
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

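/*
 * Sketch of the resulting canonical nesting when all three locks are
 * taken (illustrative only; most paths take a subset):
 *
 *	spin_lock(&hb->lock);
 *	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 *	raw_spin_lock(&p->pi_lock);
 *	...
 *	raw_spin_unlock(&p->pi_lock);
 *	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 *	spin_unlock(&hb->lock);
 */
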
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list() sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fix up the owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

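/*
 * Typical usage (see futex_lock_pi_atomic() below): with hb->lock held
 * and a top waiter found, the freshly read uval is validated against the
 * existing pi_state. On success *ps holds a reference which the caller
 * must drop via put_pi_state():
 *
 *	top_waiter = futex_top_waiter(hb, key);
 *	if (top_waiter)
 *		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
 */
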
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *				  futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *					...
	 *					attach();
	 *   tsk->futex_state =		     } else {
	 *	FUTEX_STATE_DEAD;	       if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates the pi_state. We have hb->lock held, which means
	 * nothing can observe this state yet, so wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

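/*
 * Both call sites of __attach_to_pi_owner() hold p->pi_lock, which
 * serializes the list_add() against exit_pi_state_list() walking
 * p->pi_state_list. The required calling convention, as used below and
 * in futex_lock_pi_atomic():
 *
 *	raw_spin_lock_irq(&p->pi_lock);
 *	__attach_to_pi_owner(p, key, ps);
 *	raw_spin_unlock_irq(&p->pi_lock);
 */
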
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

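/*
 * Example use of lock_pi_update_atomic() from the takeover path in
 * futex_lock_pi_atomic() below: claim an unowned futex while preserving
 * the owner died bit:
 *
 *	newval = (uval & FUTEX_OWNER_DIED) | vpid;
 *	ret = lock_pi_update_atomic(uaddr, uval, newval);
 */
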
/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for. This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the waiters
	 * bit or the owner died bit is set, we were called from the
	 * requeue PI path, or something else made user space take the
	 * syscall slow path.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourselves to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

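/*
 * Sketch of how futex_lock_pi() below consumes the tristate result (see
 * the switch statement there):
 *
 *	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state,
 *				   current, &exiting, 0);
 *	ret == 1      -> lock acquired, return success
 *	ret == 0      -> queue the waiter and block on the rt_mutex
 *	ret == -EBUSY -> wait_for_owner_exiting() and retry
 */
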
/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
			 struct futex_pi_state *pi_state,
			 struct rt_mutex_waiter *top_waiter)
{
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We clean up the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

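/*
 * Illustration of the uval transition performed by wake_futex_pi() on a
 * successful hand off from the unlocking owner to the top waiter:
 *
 *	before:	[FUTEX_WAITERS |] [FUTEX_OWNER_DIED |] ownerTID
 *	after:	 FUTEX_WAITERS | waiterTID
 *
 * The WAITERS bit stays set because PI state stays around; the owner
 * died bit is cleared because the new owner is alive by definition.
 */
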
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to the PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidation.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	futex_q_lockptr_lock(q);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault-in succeeded. */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

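/*
 * Return convention of the fixup above, as consumed by fixup_pi_owner():
 *
 *	 1 - pi_state owner is current (we own the lock)
 *	 0 - pi_state owner is some other task; nothing to fix for us
 *	<0 - fault_in_user_writeable() failed; the user space value is
 *	     immutable and only the kernel state was made consistent
 */
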
/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock, check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

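/*
 * Callers (futex_lock_pi() below and the requeue PI wait path) invoke
 * this right after the attempt to take the rt_mutex, e.g.:
 *
 *	res = fixup_pi_owner(uaddr, &q, !ret);
 *	if (res)
 *		ret = (res < 0) ? res : 0;
 */
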
/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the futex trylock_pi() implementation, with the
 * corresponding semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_q q = futex_q_init;
	DEFINE_WAKE_Q(wake_q);
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	if (1) {
		CLASS(hb, hb)(&q.key);

		futex_q_lock(&q, hb);

		ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
					   &exiting, 0);
		if (unlikely(ret)) {
			/*
			 * Atomic work succeeded and we got the lock,
			 * or failed. Either way, we do _not_ block.
			 */
			switch (ret) {
			case 1:
				/* We got the lock. */
				ret = 0;
				goto out_unlock_put_key;
			case -EFAULT:
				goto uaddr_faulted;
			case -EBUSY:
			case -EAGAIN:
				/*
				 * Two reasons for this:
				 * - EBUSY: Task is exiting and we just wait for the
				 *   exit to complete.
				 * - EAGAIN: The user space value changed.
				 */
				futex_q_unlock(hb);
				/*
				 * Handle the case where the owner is in the middle of
				 * exiting. Wait for the exit to complete otherwise
				 * this task might loop forever, aka live lock.
				 */
				wait_for_owner_exiting(ret, exiting);
				cond_resched();
				goto retry;
			default:
				goto out_unlock_put_key;
			}
		}

		WARN_ON(!q.pi_state);

		/*
		 * Only actually queue now that the atomic ops are done:
		 */
		__futex_queue(&q, hb, current);

		if (trylock) {
			ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
			/* Fixup the trylock return value: */
			ret = ret ? 0 : -EWOULDBLOCK;
			goto no_block;
		}

		/*
		 * Caution: releasing @hb in-scope. The hb->lock is still locked
		 * while the reference is dropped. The reference cannot be dropped
		 * after the unlock because if a user initiated resize is in progress
		 * then we might need to wake it. This cannot be done after the
		 * rt_mutex_pre_schedule() invocation. The hb will remain valid
		 * because the thread performing the resize will block on hb->lock
		 * during the requeue.
		 */
		futex_hash_put(no_free_ptr(hb));
		/*
		 * Must be done before we enqueue the waiter. Here it is
		 * unfortunately done under the hb lock, but that *should* work
		 * because it does nothing.
		 */
		rt_mutex_pre_schedule();

		rt_mutex_init_waiter(&rt_waiter);

		/*
		 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
		 * hold it while doing rt_mutex_start_proxy(), because then it will
		 * include hb->lock in the blocking chain, even though we'll not in
		 * fact hold it while blocking. This will lead it to report -EDEADLK
		 * and BUG when futex_unlock_pi() interleaves with this.
		 *
		 * Therefore acquire wait_lock while holding hb->lock, but drop the
		 * latter before calling __rt_mutex_start_proxy_lock(). This
		 * interleaves with futex_unlock_pi() -- which does a similar lock
		 * handoff -- such that the latter can observe the futex_q::pi_state
		 * before __rt_mutex_start_proxy_lock() is done.
		 */
		raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
		spin_unlock(q.lock_ptr);
		/*
		 * __rt_mutex_start_proxy_lock() unconditionally enqueues the
		 * @rt_waiter such that futex_unlock_pi() is guaranteed to observe
		 * the waiter when it sees the futex_q::pi_state.
		 */
		ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
		raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);

		if (ret) {
			if (ret == 1)
				ret = 0;
			goto cleanup;
		}

		if (unlikely(to))
			hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

		ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
		/*
		 * If we failed to acquire the lock (deadlock/signal/timeout), we must
		 * unwind the above, however we cannot lock hb->lock because
		 * rt_mutex already has a waiter enqueued and hb->lock can itself try
		 * and enqueue an rt_waiter through rtlock.
		 *
		 * Doing the cleanup without holding hb->lock can cause inconsistent
		 * state between hb and pi_state, but only in the direction of not
		 * seeing a waiter that is leaving.
		 *
		 * See futex_unlock_pi(), it deals with this inconsistency.
		 *
		 * There be dragons here, since we must deal with the inconsistency on
		 * the way out (here), it is impossible to detect/warn about the race
		 * the other way around (missing an incoming waiter).
		 *
		 * What could possibly go wrong...
		 */
		if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
			ret = 0;

		/*
		 * Now that the rt_waiter has been dequeued, it is safe to use
		 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
		 * the pi_state.
		 */
		futex_q_lockptr_lock(&q);
		/*
		 * Waiter is unqueued.
		 */
		rt_mutex_post_schedule();
no_block:
		/*
		 * Fixup the pi_state owner and possibly acquire the lock if we
		 * haven't already.
		 */
		res = fixup_pi_owner(uaddr, &q, !ret);
		/*
		 * If fixup_pi_owner() returned an error, propagate that. If it
		 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
		 */
		if (res)
			ret = (res < 0) ? res : 0;

		futex_unqueue_pi(&q);
		spin_unlock(q.lock_ptr);
		if (q.drop_hb_ref) {
			CLASS(hb, hb)(&q.key);
			/* Additional reference from futex_unlock_pi() */
			futex_hash_put(hb);
		}
		goto out;

out_unlock_put_key:
		futex_q_unlock(hb);
		goto out;

uaddr_faulted:
		futex_q_unlock(hb);

		ret = fault_in_user_writeable(uaddr);
		if (ret)
			goto out;

		if (!(flags & FLAGS_SHARED))
			goto retry_private;

		goto retry;
	}

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;
}

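/*
 * Illustrative user space unlock fast path (a sketch, not kernel code)
 * which funnels into futex_unlock_pi() below when it fails:
 *
 *	if (cmpxchg(uaddr, gettid(), 0) != gettid())
 *		futex(uaddr, FUTEX_UNLOCK_PI, ...);
 *
 * The cmpxchg fails once FUTEX_WAITERS or FUTEX_OWNER_DIED is set,
 * which forces the TID -> 0 transition into the kernel.
 */
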
/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	CLASS(hb, hb)(&key);
	spin_lock(&hb->lock);
retry_hb:

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;
		struct rt_mutex_waiter *rt_waiter;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and thereby
		 * wake_futex_pi() must observe any new waiters.
		 *
		 * Since the cleanup: case in futex_lock_pi() removes the
		 * rt_waiter without holding hb->lock, it is possible for
		 * wake_futex_pi() to not find a waiter while the above does,
		 * in this case the waiter is on the way out and it can be
		 * ignored.
		 *
		 * In particular, this forces __rt_mutex_start_proxy() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter.
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

		/*
		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
		 * waiters even though futex thinks there are, then the waiter
		 * is leaving. The entry needs to be removed from the list so a
		 * new futex_lock_pi() is not using this stale PI-state while
		 * the futex is available in user space again.
		 * There can be more than one task on its way out so it needs
		 * to retry.
		 */
		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
		if (!rt_waiter) {
			/*
			 * Acquire a reference for the leaving waiter to ensure
			 * valid futex_q::lock_ptr.
			 */
			futex_hash_get(hb);
			top_waiter->drop_hb_ref = true;
			__futex_unqueue(top_waiter);
			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
			goto retry_hb;
		}

		get_pi_state(pi_state);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi() has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

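	/*
	 * Illustration of the transition attempted below when there are
	 * no kernel side waiters (unconditional TID -> 0):
	 *
	 *	before:	[FUTEX_WAITERS |] [FUTEX_OWNER_DIED |] ourTID
	 *	after:	0
	 */
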
	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We preserve neither
	 * the WAITERS bit nor the OWNER_DIED one. We are the owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}