GitHub Repository: torvalds/linux
Path: blob/master/kernel/futex/waitwake.c
// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/plist.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/freezer.h>

#include "futex.h"

/*
 * READ this before attempting to hack on futexes!
 *
 * Basic futex operation and ordering guarantees
 * =============================================
 *
 * The waiter reads the futex value in user space and calls
 * futex_wait(). This function computes the hash bucket and acquires
 * the hash bucket lock. After that it reads the futex user space value
 * again and verifies that the data has not changed. If it has not changed
 * it enqueues itself into the hash bucket, releases the hash bucket lock
 * and schedules.
 *
 * The waker side modifies the user space value of the futex and calls
 * futex_wake(). This function computes the hash bucket and acquires the
 * hash bucket lock. Then it looks for waiters on that futex in the hash
 * bucket and wakes them.
 *
 * In futex wake up scenarios where no tasks are blocked on a futex, taking
 * the hb spinlock can be avoided and the syscall can simply return. In order
 * for this optimization to work, ordering guarantees must exist so that the
 * waiter being added to the list is acknowledged when the list is concurrently
 * being checked by the waker, avoiding scenarios like the following:
 *
 * CPU 0                                CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *   uval = *futex;
 *                                      *futex = newval;
 *                                      sys_futex(WAKE, futex);
 *                                        futex_wake(futex);
 *                                        if (queue_empty())
 *                                          return;
 *   if (uval == val)
 *     lock(hash_bucket(futex));
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();
 *
 * This would cause the waiter on CPU 0 to wait forever because it
 * missed the transition of the user space value from val to newval
 * and the waker did not find the waiter in the hash bucket queue.
 *
 * The correct serialization ensures that a waiter either observes
 * the changed user space value before blocking or is woken by a
 * concurrent waker:
 *
 * CPU 0                                CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *
 *   waiters++; (a)
 *   smp_mb(); (A) <-- paired with -.
 *                                  |
 *   lock(hash_bucket(futex));      |
 *                                  |
 *   uval = *futex;                 |
 *                                  |    *futex = newval;
 *                                  |    sys_futex(WAKE, futex);
 *                                  |      futex_wake(futex);
 *                                  |
 *                                  `--------> smp_mb(); (B)
 *   if (uval == val)
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();                       if (waiters)
 *                                         lock(hash_bucket(futex));
 *   else                                 wake_waiters(futex);
 *     waiters--; (b)                     unlock(hash_bucket(futex));
 *
 * Where (A) orders the waiters increment and the futex value read through
 * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write
 * to futex and the waiters read (see futex_hb_waiters_pending()).
 *
 * This yields the following case (where X:=waiters, Y:=futex):
 *
 *   X = Y = 0
 *
 *   w[X]=1      w[Y]=1
 *   MB          MB
 *   r[Y]=y      r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 * the guarantee that we cannot both miss the futex variable change and the
 * enqueue.
 *
 * Note that a new waiter is accounted for in (a) even when it is possible that
 * the wait call can return an error, in which case we backtrack from it in (b).
 * Refer to the comment in futex_q_lock().
 *
 * Similarly, in order to account for waiters being requeued on another
 * address we always increment the waiters for the destination bucket before
 * acquiring the lock, and decrement them again after releasing it -
 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
 * will do the additional required waiter count housekeeping. This is done for
 * double_lock_hb() and double_unlock_hb(), respectively.
 */
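
/*
 * Illustrative sketch (not part of the kernel sources): the userspace half of
 * the protocol documented above, assuming a Linux system where futex(2) is
 * reached via syscall(SYS_futex, ...). The variable futex_word and the two
 * helper functions are hypothetical names used only for this example.
 *
 *      #include <stdint.h>
 *      #include <stdatomic.h>
 *      #include <unistd.h>
 *      #include <sys/syscall.h>
 *      #include <linux/futex.h>
 *
 *      static _Atomic uint32_t futex_word;
 *
 *      // Waiter: pass the value it last saw; the kernel re-reads *uaddr
 *      // under the hash bucket lock (futex_wait_setup() below) and only
 *      // sleeps if it still matches.
 *      static void wait_while_equal(uint32_t val)
 *      {
 *              while (atomic_load(&futex_word) == val)
 *                      syscall(SYS_futex, &futex_word, FUTEX_WAIT, val, NULL, NULL, 0);
 *      }
 *
 *      // Waker: change the value first, then wake; this write/wake ordering
 *      // is what the (A)/(B) barrier pairing above serializes against.
 *      static void store_and_wake(uint32_t newval)
 *      {
 *              atomic_store(&futex_word, newval);
 *              syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
 *      }
 */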

bool __futex_wake_mark(struct futex_q *q)
{
        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
                return false;

        __futex_unqueue(q);
        /*
         * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
         * is written, without taking any locks. This is possible in the event
         * of a spurious wakeup, for example. A memory barrier is required here
         * to prevent the following store to lock_ptr from getting ahead of the
         * plist_del in __futex_unqueue().
         */
        smp_store_release(&q->lock_ptr, NULL);

        return true;
}

/*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed. Callers
 * must ensure to later call wake_up_q() for the actual
 * wakeups to occur.
 */
void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
{
        struct task_struct *p = q->task;

        get_task_struct(p);

        if (!__futex_wake_mark(q)) {
                put_task_struct(p);
                return;
        }

        /*
         * Queue the task for later wakeup, after we've released
         * the hb->lock.
         */
        wake_q_add_safe(wake_q, p);
}

/*
 * Wake up waiters matching bitset queued on this futex (uaddr).
 */
int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
        struct futex_q *this, *next;
        union futex_key key = FUTEX_KEY_INIT;
        DEFINE_WAKE_Q(wake_q);
        int ret;

        if (!bitset)
                return -EINVAL;

        ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
        if (unlikely(ret != 0))
                return ret;

        if ((flags & FLAGS_STRICT) && !nr_wake)
                return 0;

        CLASS(hb, hb)(&key);

        /* Make sure we really have tasks to wakeup */
        if (!futex_hb_waiters_pending(hb))
                return ret;

        spin_lock(&hb->lock);

        plist_for_each_entry_safe(this, next, &hb->chain, list) {
                if (futex_match(&this->key, &key)) {
                        if (this->pi_state || this->rt_waiter) {
                                ret = -EINVAL;
                                break;
                        }

                        /* Check if one of the bits is set in both bitsets */
                        if (!(this->bitset & bitset))
                                continue;

                        this->wake(&wake_q, this);
                        if (++ret >= nr_wake)
                                break;
                }
        }

        spin_unlock(&hb->lock);
        wake_up_q(&wake_q);
        return ret;
}
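
/*
 * Illustrative sketch (not part of the kernel sources): the bitset matching
 * above pairs FUTEX_WAIT_BITSET with FUTEX_WAKE_BITSET so a waker can target
 * a subset of the waiters on the same word. CHANNEL_A, CHANNEL_B and
 * futex_word are hypothetical names:
 *
 *      #define CHANNEL_A 0x1
 *      #define CHANNEL_B 0x2
 *
 *      // Waiter interested only in "channel A" events:
 *      syscall(SYS_futex, &futex_word, FUTEX_WAIT_BITSET, val, NULL, NULL,
 *              CHANNEL_A);
 *
 *      // Wake up to INT_MAX waiters whose bitset intersects CHANNEL_A;
 *      // a plain FUTEX_WAKE uses FUTEX_BITSET_MATCH_ANY and wakes either.
 *      syscall(SYS_futex, &futex_word, FUTEX_WAKE_BITSET, INT_MAX, NULL, NULL,
 *              CHANNEL_A);
 */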

static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
{
        unsigned int op = (encoded_op & 0x70000000) >> 28;
        unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
        int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
        int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
        int oldval, ret;

        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
                if (oparg < 0 || oparg > 31) {
                        /*
                         * kill this print and return -EINVAL when userspace
                         * is sane again
                         */
                        pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
                                            current->comm, oparg);
                        oparg &= 31;
                }
                oparg = 1 << oparg;
        }

        pagefault_disable();
        ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
        pagefault_enable();
        if (ret)
                return ret;

        switch (cmp) {
        case FUTEX_OP_CMP_EQ:
                return oldval == cmparg;
        case FUTEX_OP_CMP_NE:
                return oldval != cmparg;
        case FUTEX_OP_CMP_LT:
                return oldval < cmparg;
        case FUTEX_OP_CMP_GE:
                return oldval >= cmparg;
        case FUTEX_OP_CMP_LE:
                return oldval <= cmparg;
        case FUTEX_OP_CMP_GT:
                return oldval > cmparg;
        default:
                return -ENOSYS;
        }
}
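
/*
 * Illustrative sketch (an assumption about the caller, not defined in this
 * file): encoded_op as decoded above is usually built by userspace with the
 * FUTEX_OP() macro from <linux/futex.h>, which packs the operation (plus the
 * OPARG_SHIFT flag) into the top nibble, the comparison into the next nibble,
 * and the two 12-bit signed arguments below that. A possible FUTEX_WAKE_OP
 * call pairing with this code, with uaddr1/uaddr2 as hypothetical futex words:
 *
 *      // Atomically set *uaddr2 = 0; wake one waiter on uaddr1, and also
 *      // wake one waiter on uaddr2 if the old value of *uaddr2 was != 0.
 *      syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, 1, (void *)1UL, uaddr2,
 *              FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_NE, 0));
 */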

/*
 * Wake up all waiters hashed on the physical page that is mapped
 * to this virtual address:
 */
int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
                  int nr_wake, int nr_wake2, int op)
{
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        struct futex_q *this, *next;
        int ret, op_ret;
        DEFINE_WAKE_Q(wake_q);

retry:
        ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
        if (unlikely(ret != 0))
                return ret;
        ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
        if (unlikely(ret != 0))
                return ret;

retry_private:
        if (1) {
                CLASS(hb, hb1)(&key1);
                CLASS(hb, hb2)(&key2);

                double_lock_hb(hb1, hb2);
                op_ret = futex_atomic_op_inuser(op, uaddr2);
                if (unlikely(op_ret < 0)) {
                        double_unlock_hb(hb1, hb2);

                        if (!IS_ENABLED(CONFIG_MMU) ||
                            unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
                                /*
                                 * we don't get EFAULT from MMU faults if we don't have
                                 * an MMU, but we might get them from range checking
                                 */
                                ret = op_ret;
                                return ret;
                        }

                        if (op_ret == -EFAULT) {
                                ret = fault_in_user_writeable(uaddr2);
                                if (ret)
                                        return ret;
                        }

                        cond_resched();
                        if (!(flags & FLAGS_SHARED))
                                goto retry_private;
                        goto retry;
                }

                plist_for_each_entry_safe(this, next, &hb1->chain, list) {
                        if (futex_match(&this->key, &key1)) {
                                if (this->pi_state || this->rt_waiter) {
                                        ret = -EINVAL;
                                        goto out_unlock;
                                }
                                this->wake(&wake_q, this);
                                if (++ret >= nr_wake)
                                        break;
                        }
                }

                if (op_ret > 0) {
                        op_ret = 0;
                        plist_for_each_entry_safe(this, next, &hb2->chain, list) {
                                if (futex_match(&this->key, &key2)) {
                                        if (this->pi_state || this->rt_waiter) {
                                                ret = -EINVAL;
                                                goto out_unlock;
                                        }
                                        this->wake(&wake_q, this);
                                        if (++op_ret >= nr_wake2)
                                                break;
                                }
                        }
                        ret += op_ret;
                }

out_unlock:
                double_unlock_hb(hb1, hb2);
        }
        wake_up_q(&wake_q);
        return ret;
}

static long futex_wait_restart(struct restart_block *restart);

/**
 * futex_do_wait() - wait for wakeup, timeout, or signal
 * @q:       the futex_q to queue up on
 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
 */
void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout)
{
        /* Arm the timer */
        if (timeout)
                hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);

        /*
         * If we have been removed from the hash list, then another task
         * has tried to wake us, and we can skip the call to schedule().
         */
        if (likely(!plist_node_empty(&q->list))) {
                /*
                 * If the timer has already expired, current will already be
                 * flagged for rescheduling. Only call schedule if there
                 * is no timeout, or if it has yet to expire.
                 */
                if (!timeout || timeout->task)
                        schedule();
        }
        __set_current_state(TASK_RUNNING);
}

/**
 * futex_unqueue_multiple - Remove various futexes from their hash bucket
 * @v:     The list of futexes to unqueue
 * @count: Number of futexes in the list
 *
 * Helper to unqueue a list of futexes. This can't fail.
 *
 * Return:
 *  - >=0 - Index of the last futex that was awoken;
 *  - -1  - No futex was awoken
 */
int futex_unqueue_multiple(struct futex_vector *v, int count)
{
        int ret = -1, i;

        for (i = 0; i < count; i++) {
                if (!futex_unqueue(&v[i].q))
                        ret = i;
        }

        return ret;
}

/**
 * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
 * @vs:    The futex list to wait on
 * @count: The size of the list
 * @woken: Index of the last woken futex, if any. Used to notify the
 *         caller that it can return this index to userspace (return parameter)
 *
 * Prepare multiple futexes in a single step and enqueue them. This may fail if
 * the futex list is invalid or if any futex was already awoken. On success the
 * task is ready for interruptible sleep.
 *
 * Return:
 *  -  1 - One of the futexes was woken by another thread
 *  -  0 - Success
 *  - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
 */
int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
        bool retry = false;
        int ret, i;
        u32 uval;

        /*
         * Make sure to have a reference on the private_hash such that we
         * don't block on rehash after changing the task state below.
         */
        guard(private_hash)();

        /*
         * Enqueuing multiple futexes is tricky, because we need to enqueue
         * each futex on the list before dealing with the next one to avoid
         * deadlocking on the hash bucket. But, before enqueuing, we need to
         * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
         * lose any wake events, which cannot be done before the get_futex_key
         * of the next key, because it calls get_user_pages, which can sleep.
         * Thus, we fetch the list of futex keys in two steps, by first
         * pinning all the memory keys in the futex key, and only then we read
         * each key and queue the corresponding futex.
         *
         * Private futexes don't need to recalculate the hash on retry, so skip
         * get_futex_key() when retrying.
         */
retry:
        for (i = 0; i < count; i++) {
                if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
                        continue;

                ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
                                    vs[i].w.flags,
                                    &vs[i].q.key, FUTEX_READ);

                if (unlikely(ret))
                        return ret;
        }

        set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);

        for (i = 0; i < count; i++) {
                u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
                struct futex_q *q = &vs[i].q;
                u32 val = vs[i].w.val;

                if (1) {
                        CLASS(hb, hb)(&q->key);

                        futex_q_lock(q, hb);
                        ret = futex_get_value_locked(&uval, uaddr);

                        if (!ret && uval == val) {
                                /*
                                 * The bucket lock can't be held while dealing with the
                                 * next futex. Queue each futex at this moment so hb can
                                 * be unlocked.
                                 */
                                futex_queue(q, hb, current);
                                continue;
                        }

                        futex_q_unlock(hb);
                }
                __set_current_state(TASK_RUNNING);

                /*
                 * Even if something went wrong, if we find out that a futex
                 * was woken, we don't return error and return this index to
                 * userspace
                 */
                *woken = futex_unqueue_multiple(vs, i);
                if (*woken >= 0)
                        return 1;

                if (ret) {
                        /*
                         * If we need to handle a page fault, we need to do so
                         * without any lock and any enqueued futex (otherwise
                         * we could lose some wakeup). So we do it here, after
                         * undoing all the work done so far. On success, we
                         * retry all the work.
                         */
                        if (get_user(uval, uaddr))
                                return -EFAULT;

                        retry = true;
                        goto retry;
                }

                if (uval != val)
                        return -EWOULDBLOCK;
        }

        return 0;
}

/**
 * futex_sleep_multiple - Check sleeping conditions and sleep
 * @vs:    List of futexes to wait for
 * @count: Length of vs
 * @to:    Timeout
 *
 * Sleep if and only if the timeout hasn't expired and no futex on the list has
 * been woken up.
 */
static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
                                 struct hrtimer_sleeper *to)
{
        if (to && !to->task)
                return;

        for (; count; count--, vs++) {
                if (!READ_ONCE(vs->q.lock_ptr))
                        return;
        }

        schedule();
}

/**
 * futex_wait_multiple - Prepare to wait on and enqueue several futexes
 * @vs:    The list of futexes to wait on
 * @count: The number of objects
 * @to:    Timeout before giving up and returning to userspace
 *
 * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
 * sleeps on a group of futexes and returns on the first futex that was
 * woken, or after the timeout has elapsed.
 *
 * Return:
 *  - >=0 - Index of the futex that was awoken
 *  - <0  - On error
 */
int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
                        struct hrtimer_sleeper *to)
{
        int ret, hint = 0;

        if (to)
                hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

        while (1) {
                ret = futex_wait_multiple_setup(vs, count, &hint);
                if (ret) {
                        if (ret > 0) {
                                /* A futex was woken during setup */
                                ret = hint;
                        }
                        return ret;
                }

                futex_sleep_multiple(vs, count, to);

                __set_current_state(TASK_RUNNING);

                ret = futex_unqueue_multiple(vs, count);
                if (ret >= 0)
                        return ret;

                if (to && !to->task)
                        return -ETIMEDOUT;
                else if (signal_pending(current))
                        return -ERESTARTSYS;
                /*
                 * The final case is a spurious wakeup, for
                 * which just retry.
                 */
        }
}
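
/*
 * Illustrative sketch (not part of the kernel sources): this kind of vectored
 * wait is exposed to userspace through the futex_waitv(2) syscall, which takes
 * an array of struct futex_waitv entries and returns the index of the woken
 * futex. It assumes <linux/futex.h> and a kernel providing __NR_futex_waitv;
 * word_a, word_b and waiters[] are hypothetical names:
 *
 *      struct futex_waitv waiters[2] = {
 *              { .val = 0, .uaddr = (uintptr_t)&word_a, .flags = FUTEX_32 },
 *              { .val = 0, .uaddr = (uintptr_t)&word_b, .flags = FUTEX_32 },
 *      };
 *
 *      // Sleeps until one of the two words is woken (or already differs
 *      // from .val); on wakeup the return value is the index into waiters[].
 *      long idx = syscall(__NR_futex_waitv, waiters, 2, 0, NULL, CLOCK_MONOTONIC);
 */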

/**
 * futex_wait_setup() - Prepare to wait on a futex
 * @uaddr: the futex userspace address
 * @val:   the expected value
 * @flags: futex flags (FLAGS_SHARED, etc.)
 * @q:     the associated futex_q
 * @key2:  the second futex_key if used for requeue PI
 * @task:  Task queueing this futex
 *
 * Setup the futex_q and locate the hash_bucket. Get the futex value and
 * compare it with the expected value. Handle atomic faults internally.
 * Return with the hb lock held on success, and unlocked on failure.
 *
 * Return:
 *  -  0 - uaddr contains val and hb has been locked;
 *  - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not
 *         be read, does not contain the expected value or is not properly aligned.
 */
int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
                     struct futex_q *q, union futex_key *key2,
                     struct task_struct *task)
{
        u32 uval;
        int ret;

        /*
         * Access the page AFTER the hash-bucket is locked.
         * Order is important:
         *
         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
         *
         * The basic logical guarantee of a futex is that it blocks ONLY
         * if cond(var) is known to be true at the time of blocking, for
         * any cond. If we locked the hash-bucket after testing *uaddr, that
         * would open a race condition where we could block indefinitely with
         * cond(var) false, which would violate the guarantee.
         *
         * On the other hand, we insert q and release the hash-bucket only
         * after testing *uaddr. This guarantees that futex_wait() will NOT
         * absorb a wakeup if *uaddr does not match the desired values
         * while the syscall executes.
         */
retry:
        ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
        if (unlikely(ret != 0))
                return ret;

retry_private:
        if (1) {
                CLASS(hb, hb)(&q->key);

                futex_q_lock(q, hb);

                ret = futex_get_value_locked(&uval, uaddr);

                if (ret) {
                        futex_q_unlock(hb);

                        ret = get_user(uval, uaddr);
                        if (ret)
                                return ret;

                        if (!(flags & FLAGS_SHARED))
                                goto retry_private;

                        goto retry;
                }

                if (uval != val) {
                        futex_q_unlock(hb);
                        return -EWOULDBLOCK;
                }

                if (key2 && futex_match(&q->key, key2)) {
                        futex_q_unlock(hb);
                        return -EINVAL;
                }

                /*
                 * The task state is guaranteed to be set before another task can
                 * wake it. set_current_state() is implemented using smp_store_mb() and
                 * futex_queue() calls spin_unlock() upon completion, both serializing
                 * access to the hash list and forcing another memory barrier.
                 */
                if (task == current)
                        set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
                futex_queue(q, hb, task);
        }

        return ret;
}

int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
                 struct hrtimer_sleeper *to, u32 bitset)
{
        struct futex_q q = futex_q_init;
        int ret;

        if (!bitset)
                return -EINVAL;

        q.bitset = bitset;

retry:
        /*
         * Prepare to wait on uaddr. On success, it holds hb->lock and q
         * is initialized.
         */
        ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current);
        if (ret)
                return ret;

        /* futex_queue and wait for wakeup, timeout, or a signal. */
        futex_do_wait(&q, to);

        /* If we were woken (and unqueued), we succeeded, whatever. */
        if (!futex_unqueue(&q))
                return 0;

        if (to && !to->task)
                return -ETIMEDOUT;

        /*
         * We expect signal_pending(current), but we might be the
         * victim of a spurious wakeup as well.
         */
        if (!signal_pending(current))
                goto retry;

        return -ERESTARTSYS;
}

int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
{
        struct hrtimer_sleeper timeout, *to;
        struct restart_block *restart;
        int ret;

        to = futex_setup_timer(abs_time, &timeout, flags,
                               current->timer_slack_ns);

        ret = __futex_wait(uaddr, flags, val, to, bitset);

        /* No timeout, nothing to clean up. */
        if (!to)
                return ret;

        hrtimer_cancel(&to->timer);
        destroy_hrtimer_on_stack(&to->timer);

        if (ret == -ERESTARTSYS) {
                restart = &current->restart_block;
                restart->futex.uaddr = uaddr;
                restart->futex.val = val;
                restart->futex.time = *abs_time;
                restart->futex.bitset = bitset;
                restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;

                return set_restart_fn(restart, futex_wait_restart);
        }

        return ret;
}

static long futex_wait_restart(struct restart_block *restart)
{
        u32 __user *uaddr = restart->futex.uaddr;
        ktime_t t, *tp = NULL;

        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
                t = restart->futex.time;
                tp = &t;
        }
        restart->fn = do_no_restart_syscall;

        return (long)futex_wait(uaddr, restart->futex.flags,
                                restart->futex.val, tp, restart->futex.bitset);
}