// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/plist.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/freezer.h>

#include "futex.h"

/*
 * READ this before attempting to hack on futexes!
 *
 * Basic futex operation and ordering guarantees
 * =============================================
 *
 * The waiter reads the futex value in user space and calls
 * futex_wait(). This function computes the hash bucket and acquires
 * the hash bucket lock. After that it reads the futex user space value
 * again and verifies that the data has not changed. If it has not changed
 * it enqueues itself into the hash bucket, releases the hash bucket lock
 * and schedules.
 *
 * The waker side modifies the user space value of the futex and calls
 * futex_wake(). This function computes the hash bucket and acquires the
 * hash bucket lock. Then it looks for waiters on that futex in the hash
 * bucket and wakes them.
 *
 * In futex wake up scenarios where no tasks are blocked on a futex, taking
 * the hb spinlock can be avoided and the waker can simply return. In order
 * for this optimization to work, ordering guarantees must exist so that the
 * waiter being added to the list is acknowledged when the list is
 * concurrently being checked by the waker, avoiding scenarios like the
 * following:
 *
 * CPU 0                               CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *   uval = *futex;
 *                                     *futex = newval;
 *                                     sys_futex(WAKE, futex);
 *                                       futex_wake(futex);
 *                                       if (queue_empty())
 *                                         return;
 *   if (uval == val)
 *      lock(hash_bucket(futex));
 *      queue();
 *      unlock(hash_bucket(futex));
 *      schedule();
 *
 * This would cause the waiter on CPU 0 to wait forever because it
 * missed the transition of the user space value from val to newval
 * and the waker did not find the waiter in the hash bucket queue.
 *
 * The correct serialization ensures that a waiter either observes
 * the changed user space value before blocking or is woken by a
 * concurrent waker:
 *
 * CPU 0                                 CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *
 *   waiters++; (a)
 *   smp_mb(); (A) <-- paired with -.
 *                                  |
 *   lock(hash_bucket(futex));      |
 *                                  |
 *   uval = *futex;                 |
 *                                  |        *futex = newval;
 *                                  |        sys_futex(WAKE, futex);
 *                                  |          futex_wake(futex);
 *                                  |
 *                                  `--------> smp_mb(); (B)
 *   if (uval == val)
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();                         if (waiters)
 *                                           lock(hash_bucket(futex));
 *   else                                    wake_waiters(futex);
 *     waiters--; (b)                        unlock(hash_bucket(futex));
 *
 * Where (A) orders the waiters increment and the futex value read through
 * atomic operations (see futex_hb_waiters_inc()) and where (B) orders the
 * write to futex and the waiters read (see futex_hb_waiters_pending()).
 *
 * This yields the following case (where X:=waiters, Y:=futex):
 *
 *	X = Y = 0
 *
 *	w[X]=1		w[Y]=1
 *	MB		MB
 *	r[Y]=y		r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 * the guarantee that we cannot both miss the futex variable change and the
 * enqueue.
 *
 * Note that a new waiter is accounted for in (a) even though the wait call
 * may return an error, in which case we backtrack from it in (b).
 * Refer to the comment in futex_q_lock().
 *
 * Similarly, in order to account for waiters being requeued on another
 * address we always increment the waiters for the destination bucket before
 * acquiring the lock, and decrement them again after releasing it - the
 * code that actually moves the futex(es) between hash buckets (requeue_futex)
 * will do the additional required waiter count housekeeping. This is done
 * by double_lock_hb() and double_unlock_hb(), respectively.
 */
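/*
 * For illustration only - a minimal sketch of the userspace side of the
 * protocol described above. This is not kernel code; it assumes a raw
 * syscall(2) wrapper (glibc provides no futex() wrapper) and a
 * process-private futex word; the names wait_while(), wake_one() and
 * futex_word are made up for the example. The waiter re-checks the value
 * in a loop because FUTEX_WAIT may return spuriously, or with EAGAIN if
 * the value already changed:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static unsigned int futex_word;		// the 32-bit futex
 *
 *	// Waiter: block only while futex_word still contains @val.
 *	static void wait_while(unsigned int val)
 *	{
 *		while (__atomic_load_n(&futex_word, __ATOMIC_ACQUIRE) == val)
 *			syscall(SYS_futex, &futex_word, FUTEX_WAIT_PRIVATE,
 *				val, NULL, NULL, 0);
 *	}
 *
 *	// Waker: publish the new value first, then wake one waiter.
 *	static void wake_one(unsigned int newval)
 *	{
 *		__atomic_store_n(&futex_word, newval, __ATOMIC_RELEASE);
 *		syscall(SYS_futex, &futex_word, FUTEX_WAKE_PRIVATE,
 *			1, NULL, NULL, 0);
 *	}
 */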
bool __futex_wake_mark(struct futex_q *q)
{
	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
		return false;

	__futex_unqueue(q);
	/*
	 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
	 * is written, without taking any locks. This is possible in the event
	 * of a spurious wakeup, for example. A memory barrier is required here
	 * to prevent the following store to lock_ptr from getting ahead of the
	 * plist_del() in __futex_unqueue().
	 */
	smp_store_release(&q->lock_ptr, NULL);

	return true;
}

/*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed. Callers
 * must ensure to later call wake_up_q() for the actual
 * wakeups to occur.
 */
void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
{
	struct task_struct *p = q->task;

	get_task_struct(p);

	if (!__futex_wake_mark(q)) {
		put_task_struct(p);
		return;
	}

	/*
	 * Queue the task for later wakeup, after we've released
	 * the hb->lock.
	 */
	wake_q_add_safe(wake_q, p);
}

/*
 * Wake up waiters matching bitset queued on this futex (uaddr).
 */
int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
	struct futex_q *this, *next;
	union futex_key key = FUTEX_KEY_INIT;
	DEFINE_WAKE_Q(wake_q);
	int ret;

	if (!bitset)
		return -EINVAL;

	ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
	if (unlikely(ret != 0))
		return ret;

	if ((flags & FLAGS_STRICT) && !nr_wake)
		return 0;

	CLASS(hb, hb)(&key);

	/* Make sure we really have tasks to wake up */
	if (!futex_hb_waiters_pending(hb))
		return ret;

	spin_lock(&hb->lock);

	plist_for_each_entry_safe(this, next, &hb->chain, list) {
		if (futex_match(&this->key, &key)) {
			if (this->pi_state || this->rt_waiter) {
				ret = -EINVAL;
				break;
			}

			/* Check if one of the bits is set in both bitsets */
			if (!(this->bitset & bitset))
				continue;

			this->wake(&wake_q, this);
			if (++ret >= nr_wake)
				break;
		}
	}

	spin_unlock(&hb->lock);
	wake_up_q(&wake_q);
	return ret;
}

static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
{
	unsigned int op	= (encoded_op & 0x70000000) >> 28;
	unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
	int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
	int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
	int oldval, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
		if (oparg < 0 || oparg > 31) {
			/*
			 * kill this print and return -EINVAL when userspace
			 * is sane again
			 */
			pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
					    current->comm, oparg);
			oparg &= 31;
		}
		oparg = 1 << oparg;
	}

	pagefault_disable();
	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
	pagefault_enable();
	if (ret)
		return ret;

	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		return oldval == cmparg;
	case FUTEX_OP_CMP_NE:
		return oldval != cmparg;
	case FUTEX_OP_CMP_LT:
		return oldval < cmparg;
	case FUTEX_OP_CMP_GE:
		return oldval >= cmparg;
	case FUTEX_OP_CMP_LE:
		return oldval <= cmparg;
	case FUTEX_OP_CMP_GT:
		return oldval > cmparg;
	default:
		return -ENOSYS;
	}
}
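/*
 * For reference, the encoded_op word decoded above follows the layout of
 * the FUTEX_OP() helper macro in include/uapi/linux/futex.h; the field
 * split below mirrors the shifts and masks in futex_atomic_op_inuser(),
 * with oparg and cmparg being 12-bit sign-extended fields:
 *
 *	  31  30-28  27-24     23-12        11-0
 *	+----+-----+-------+------------+------------+
 *	|  S | op  |  cmp  |   oparg    |   cmparg   |
 *	+----+-----+-------+------------+------------+
 *
 * where S is FUTEX_OP_OPARG_SHIFT ("use 1 << oparg instead of oparg").
 * Userspace builds it with, e.g.:
 *
 *	FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_NE, 0)
 *
 * which stores 0 into *uaddr2 and requests the nr_wake2 wakeups on uaddr2
 * only if the old value of *uaddr2 was nonzero.
 */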
/*
 * Wake up all waiters hashed on the physical page that is mapped
 * to this virtual address:
 */
int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
		  int nr_wake, int nr_wake2, int op)
{
	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
	struct futex_q *this, *next;
	int ret, op_ret;
	DEFINE_WAKE_Q(wake_q);

retry:
	ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
	if (unlikely(ret != 0))
		return ret;
	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
	if (unlikely(ret != 0))
		return ret;

retry_private:
	if (1) {
		CLASS(hb, hb1)(&key1);
		CLASS(hb, hb2)(&key2);

		double_lock_hb(hb1, hb2);
		op_ret = futex_atomic_op_inuser(op, uaddr2);
		if (unlikely(op_ret < 0)) {
			double_unlock_hb(hb1, hb2);

			if (!IS_ENABLED(CONFIG_MMU) ||
			    unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
				/*
				 * we don't get EFAULT from MMU faults if we don't have
				 * an MMU, but we might get them from range checking
				 */
				ret = op_ret;
				return ret;
			}

			if (op_ret == -EFAULT) {
				ret = fault_in_user_writeable(uaddr2);
				if (ret)
					return ret;
			}

			cond_resched();
			if (!(flags & FLAGS_SHARED))
				goto retry_private;
			goto retry;
		}

		plist_for_each_entry_safe(this, next, &hb1->chain, list) {
			if (futex_match(&this->key, &key1)) {
				if (this->pi_state || this->rt_waiter) {
					ret = -EINVAL;
					goto out_unlock;
				}
				this->wake(&wake_q, this);
				if (++ret >= nr_wake)
					break;
			}
		}

		if (op_ret > 0) {
			op_ret = 0;
			plist_for_each_entry_safe(this, next, &hb2->chain, list) {
				if (futex_match(&this->key, &key2)) {
					if (this->pi_state || this->rt_waiter) {
						ret = -EINVAL;
						goto out_unlock;
					}
					this->wake(&wake_q, this);
					if (++op_ret >= nr_wake2)
						break;
				}
			}
			ret += op_ret;
		}

out_unlock:
		double_unlock_hb(hb1, hb2);
	}
	wake_up_q(&wake_q);
	return ret;
}
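/*
 * For illustration only - a hedged sketch of how userspace drives the
 * operation above (not kernel code; f1 and f2 are made-up names).
 * nr_wake2 travels in the timeout argument slot of the futex() syscall.
 * The call below atomically stores 0 into f2, wakes up to one waiter on
 * f1, and - because FUTEX_OP_CMP_NE 0 is evaluated against the old value
 * of f2 - also wakes up to one waiter on f2 if f2 was nonzero, all under
 * a single hash-bucket lock pair:
 *
 *	unsigned int f1, f2;	// process-private futex words
 *
 *	syscall(SYS_futex, &f1, FUTEX_WAKE_OP_PRIVATE,
 *		1,		// nr_wake, applied to f1
 *		(void *)1UL,	// nr_wake2, passed via the timeout slot
 *		&f2,
 *		FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_NE, 0));
 */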
static long futex_wait_restart(struct restart_block *restart);

/**
 * futex_do_wait() - wait for wakeup, timeout, or signal
 * @q:		the futex_q to queue up on
 * @timeout:	the prepared hrtimer_sleeper, or NULL for no timeout
 */
void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout)
{
	/* Arm the timer */
	if (timeout)
		hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);

	/*
	 * If we have been removed from the hash list, then another task
	 * has tried to wake us, and we can skip the call to schedule().
	 */
	if (likely(!plist_node_empty(&q->list))) {
		/*
		 * If the timer has already expired, current will already be
		 * flagged for rescheduling. Only call schedule if there
		 * is no timeout, or if it has yet to expire.
		 */
		if (!timeout || timeout->task)
			schedule();
	}
	__set_current_state(TASK_RUNNING);
}

/**
 * futex_unqueue_multiple - Remove various futexes from their hash bucket
 * @v:	   The list of futexes to unqueue
 * @count: Number of futexes in the list
 *
 * Helper to unqueue a list of futexes. This can't fail.
 *
 * Return:
 *  - >=0 - Index of the last futex that was awoken;
 *  - -1  - No futex was awoken
 */
int futex_unqueue_multiple(struct futex_vector *v, int count)
{
	int ret = -1, i;

	for (i = 0; i < count; i++) {
		if (!futex_unqueue(&v[i].q))
			ret = i;
	}

	return ret;
}

/**
 * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
 * @vs:		The futex list to wait on
 * @count:	The size of the list
 * @woken:	Index of the last woken futex, if any. Used to notify the
 *		caller that it can return this index to userspace (return
 *		parameter)
 *
 * Prepare multiple futexes in a single step and enqueue them. This may fail if
 * the futex list is invalid or if any futex was already awoken. On success the
 * task is ready for interruptible sleep.
 *
 * Return:
 *  -  1 - One of the futexes was woken by another thread
 *  -  0 - Success
 *  - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
 */
int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
	bool retry = false;
	int ret, i;
	u32 uval;

	/*
	 * Make sure to have a reference on the private_hash such that we
	 * don't block on rehash after changing the task state below.
	 */
	guard(private_hash)();

	/*
	 * Enqueuing multiple futexes is tricky, because we need to enqueue
	 * each futex on the list before dealing with the next one to avoid
	 * deadlocking on the hash bucket. But, before enqueuing, we need to
	 * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
	 * lose any wake events, which cannot be done before the get_futex_key
	 * of the next key, because it calls get_user_pages, which can sleep.
	 * Thus, we fetch the list of futex keys in two steps, by first pinning
	 * all the memory keys in the futex key, and only then we read each key
	 * and queue the corresponding futex.
	 *
	 * Private futexes don't need to recalculate the hash on retry, so skip
	 * get_futex_key() when retrying.
	 */
retry:
	for (i = 0; i < count; i++) {
		if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
			continue;

		ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
				    vs[i].w.flags,
				    &vs[i].q.key, FUTEX_READ);

		if (unlikely(ret))
			return ret;
	}

	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);

	for (i = 0; i < count; i++) {
		u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
		struct futex_q *q = &vs[i].q;
		u32 val = vs[i].w.val;

		if (1) {
			CLASS(hb, hb)(&q->key);

			futex_q_lock(q, hb);
			ret = futex_get_value_locked(&uval, uaddr);

			if (!ret && uval == val) {
				/*
				 * The bucket lock can't be held while dealing with the
				 * next futex. Queue each futex at this moment so hb can
				 * be unlocked.
				 */
				futex_queue(q, hb, current);
				continue;
			}

			futex_q_unlock(hb);
		}
		__set_current_state(TASK_RUNNING);

		/*
		 * Even if something went wrong, if we find out that a futex
		 * was woken, we don't return an error but instead return that
		 * index to userspace.
		 */
		*woken = futex_unqueue_multiple(vs, i);
		if (*woken >= 0)
			return 1;

		if (ret) {
			/*
			 * If we need to handle a page fault, we need to do so
			 * without any lock and any enqueued futex (otherwise
			 * we could lose some wakeup). So we do it here, after
			 * undoing all the work done so far. On success, we
			 * retry all the work.
			 */
			if (get_user(uval, uaddr))
				return -EFAULT;

			retry = true;
			goto retry;
		}

		if (uval != val)
			return -EWOULDBLOCK;
	}

	return 0;
}
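/*
 * For illustration only - a hedged sketch of the userspace entry point
 * that ends up in futex_wait_multiple_setup(): the futex_waitv(2) syscall
 * (Linux 5.16+, no glibc wrapper). It waits on a vector of 32-bit futexes
 * and returns the index of the woken one; a timeout, when given, is an
 * absolute timespec. The names wait_any(), a and b are made up for the
 * example:
 *
 *	#include <linux/futex.h>
 *	#include <stdint.h>
 *	#include <sys/syscall.h>
 *	#include <time.h>
 *	#include <unistd.h>
 *
 *	static unsigned int a, b;	// two 32-bit futex words
 *
 *	static long wait_any(void)
 *	{
 *		struct futex_waitv vs[2] = {
 *			{ .val = 0, .uaddr = (__u64)(uintptr_t)&a, .flags = FUTEX_32 },
 *			{ .val = 0, .uaddr = (__u64)(uintptr_t)&b, .flags = FUTEX_32 },
 *		};
 *
 *		// Blocks while a == 0 and b == 0; returns the woken index.
 *		return syscall(SYS_futex_waitv, vs, 2, 0, NULL, CLOCK_MONOTONIC);
 *	}
 */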
/**
 * futex_sleep_multiple - Check sleeping conditions and sleep
 * @vs:    List of futexes to wait for
 * @count: Length of vs
 * @to:    Timeout
 *
 * Sleep if and only if the timeout hasn't expired and no futex on the list
 * has been woken up.
 */
static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
				 struct hrtimer_sleeper *to)
{
	if (to && !to->task)
		return;

	for (; count; count--, vs++) {
		if (!READ_ONCE(vs->q.lock_ptr))
			return;
	}

	schedule();
}

/**
 * futex_wait_multiple - Prepare to wait on and enqueue several futexes
 * @vs:		The list of futexes to wait on
 * @count:	The number of objects
 * @to:		Timeout before giving up and returning to userspace
 *
 * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
 * sleeps on a group of futexes and returns on the first futex that is
 * woken, or after the timeout has elapsed.
 *
 * Return:
 *  - >=0 - Index of the futex that was awoken
 *  - <0  - On error
 */
int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
			struct hrtimer_sleeper *to)
{
	int ret, hint = 0;

	if (to)
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	while (1) {
		ret = futex_wait_multiple_setup(vs, count, &hint);
		if (ret) {
			if (ret > 0) {
				/* A futex was woken during setup */
				ret = hint;
			}
			return ret;
		}

		futex_sleep_multiple(vs, count, to);

		__set_current_state(TASK_RUNNING);

		ret = futex_unqueue_multiple(vs, count);
		if (ret >= 0)
			return ret;

		if (to && !to->task)
			return -ETIMEDOUT;
		else if (signal_pending(current))
			return -ERESTARTSYS;
		/*
		 * The only remaining case is a spurious wakeup, for
		 * which we just retry.
		 */
	}
}

/**
 * futex_wait_setup() - Prepare to wait on a futex
 * @uaddr:	the futex userspace address
 * @val:	the expected value
 * @flags:	futex flags (FLAGS_SHARED, etc.)
 * @q:		the associated futex_q
 * @key2:	the second futex_key if used for requeue PI
 * @task:	Task queueing this futex
 *
 * Set up the futex_q and locate the hash_bucket. Get the futex value and
 * compare it with the expected value. Handle atomic faults internally.
 * Return with the hb lock held on success, and unlocked on failure.
 *
 * Return:
 *  -  0 - uaddr contains val and hb has been locked;
 *  - <0 - On error and the hb is unlocked. Possible reasons: uaddr cannot
 *	   be read, does not contain the expected value or is not properly
 *	   aligned.
 */
int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
		     struct futex_q *q, union futex_key *key2,
		     struct task_struct *task)
{
	u32 uval;
	int ret;

	/*
	 * Access the page AFTER the hash-bucket is locked.
	 * Order is important:
	 *
	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
	 *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
	 *
	 * The basic logical guarantee of a futex is that it blocks ONLY
	 * if cond(var) is known to be true at the time of blocking, for
	 * any cond. If we locked the hash-bucket after testing *uaddr, that
	 * would open a race condition where we could block indefinitely with
	 * cond(var) false, which would violate the guarantee.
	 *
	 * On the other hand, we insert q and release the hash-bucket only
	 * after testing *uaddr. This guarantees that futex_wait() will NOT
	 * absorb a wakeup if *uaddr does not match the desired value while
	 * the syscall executes.
	 */
retry:
	ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
	if (unlikely(ret != 0))
		return ret;

retry_private:
	if (1) {
		CLASS(hb, hb)(&q->key);

		futex_q_lock(q, hb);

		ret = futex_get_value_locked(&uval, uaddr);

		if (ret) {
			futex_q_unlock(hb);

			ret = get_user(uval, uaddr);
			if (ret)
				return ret;

			if (!(flags & FLAGS_SHARED))
				goto retry_private;

			goto retry;
		}

		if (uval != val) {
			futex_q_unlock(hb);
			return -EWOULDBLOCK;
		}

		if (key2 && futex_match(&q->key, key2)) {
			futex_q_unlock(hb);
			return -EINVAL;
		}

		/*
		 * The task state is guaranteed to be set before another task can
		 * wake it. set_current_state() is implemented using smp_store_mb() and
		 * futex_queue() calls spin_unlock() upon completion, both serializing
		 * access to the hash list and forcing another memory barrier.
		 */
		if (task == current)
			set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
		futex_queue(q, hb, task);
	}

	return ret;
}
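/*
 * For illustration only - what the -EWOULDBLOCK result above looks like
 * from userspace (a hedged sketch, not kernel code; includes as in the
 * earlier sketch, plus <errno.h>). FUTEX_WAIT reports it as EAGAIN, and
 * a correct caller simply re-reads the futex word and retries, treating
 * it exactly like a spurious wakeup:
 *
 *	for (;;) {
 *		unsigned int cur = __atomic_load_n(&futex_word, __ATOMIC_ACQUIRE);
 *
 *		if (cur != val)			// cond(var) changed: don't block
 *			break;
 *		if (syscall(SYS_futex, &futex_word, FUTEX_WAIT_PRIVATE,
 *			    cur, NULL, NULL, 0) == 0)
 *			continue;		// woken, possibly spuriously: re-check
 *		if (errno != EAGAIN && errno != EINTR)
 *			break;			// real error
 *	}
 */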
int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
		 struct hrtimer_sleeper *to, u32 bitset)
{
	struct futex_q q = futex_q_init;
	int ret;

	if (!bitset)
		return -EINVAL;

	q.bitset = bitset;

retry:
	/*
	 * Prepare to wait on uaddr. On success, it holds hb->lock and q
	 * is initialized.
	 */
	ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current);
	if (ret)
		return ret;

	/* futex_queue and wait for wakeup, timeout, or a signal. */
	futex_do_wait(&q, to);

	/* If we were woken (and unqueued), we succeeded, whatever. */
	if (!futex_unqueue(&q))
		return 0;

	if (to && !to->task)
		return -ETIMEDOUT;

	/*
	 * We expect signal_pending(current), but we might be the
	 * victim of a spurious wakeup as well.
	 */
	if (!signal_pending(current))
		goto retry;

	return -ERESTARTSYS;
}

int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
{
	struct hrtimer_sleeper timeout, *to;
	struct restart_block *restart;
	int ret;

	to = futex_setup_timer(abs_time, &timeout, flags,
			       current->timer_slack_ns);

	ret = __futex_wait(uaddr, flags, val, to, bitset);

	/* No timeout, nothing to clean up. */
	if (!to)
		return ret;

	hrtimer_cancel(&to->timer);
	destroy_hrtimer_on_stack(&to->timer);

	if (ret == -ERESTARTSYS) {
		restart = &current->restart_block;
		restart->futex.uaddr = uaddr;
		restart->futex.val = val;
		restart->futex.time = *abs_time;
		restart->futex.bitset = bitset;
		restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;

		return set_restart_fn(restart, futex_wait_restart);
	}

	return ret;
}

static long futex_wait_restart(struct restart_block *restart)
{
	u32 __user *uaddr = restart->futex.uaddr;
	ktime_t t, *tp = NULL;

	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
		t = restart->futex.time;
		tp = &t;
	}
	restart->fn = do_no_restart_syscall;

	return (long)futex_wait(uaddr, restart->futex.flags,
				restart->futex.val, tp, restart->futex.bitset);
}
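/*
 * For illustration only - a hedged sketch of a timed wait as seen from
 * userspace (not kernel code). Plain FUTEX_WAIT takes a *relative*
 * timeout, which the syscall entry converts to an absolute deadline
 * before reaching futex_wait(); the restart block above then carries
 * that absolute time (FLAGS_HAS_TIMEOUT), so a restarted syscall resumes
 * with the same deadline instead of stretching the total wait:
 *
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };	// relative
 *
 *	if (syscall(SYS_futex, &futex_word, FUTEX_WAIT_PRIVATE,
 *		    val, &ts, NULL, 0) == -1 && errno == ETIMEDOUT) {
 *		// one second elapsed with futex_word still equal to val
 *	}
 */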