GitHub Repository: torvalds/linux
Path: blob/master/kernel/futex/pi.c
// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
        struct futex_pi_state *pi_state;

        if (likely(current->pi_state_cache))
                return 0;

        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

        if (!pi_state)
                return -ENOMEM;

        INIT_LIST_HEAD(&pi_state->list);
        /* pi_mutex gets initialized later */
        pi_state->owner = NULL;
        refcount_set(&pi_state->refcount, 1);
        pi_state->key = FUTEX_KEY_INIT;

        current->pi_state_cache = pi_state;

        return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
        struct futex_pi_state *pi_state = current->pi_state_cache;

        WARN_ON(!pi_state);
        current->pi_state_cache = NULL;

        return pi_state;
}

static void pi_state_update_owner(struct futex_pi_state *pi_state,
                                  struct task_struct *new_owner)
{
        struct task_struct *old_owner = pi_state->owner;

        lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

        if (old_owner) {
                raw_spin_lock(&old_owner->pi_lock);
                WARN_ON(list_empty(&pi_state->list));
                list_del_init(&pi_state->list);
                raw_spin_unlock(&old_owner->pi_lock);
        }

        if (new_owner) {
                raw_spin_lock(&new_owner->pi_lock);
                WARN_ON(!list_empty(&pi_state->list));
                list_add(&pi_state->list, &new_owner->pi_state_list);
                pi_state->owner = new_owner;
                raw_spin_unlock(&new_owner->pi_lock);
        }
}

void get_pi_state(struct futex_pi_state *pi_state)
{
        WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
        if (!pi_state)
                return;

        if (!refcount_dec_and_test(&pi_state->refcount))
                return;

        /*
         * If pi_state->owner is NULL, the owner is most probably dying
         * and has cleaned up the pi_state already
         */
        if (pi_state->owner) {
                unsigned long flags;

                raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
                pi_state_update_owner(pi_state, NULL);
                rt_mutex_proxy_unlock(&pi_state->pi_mutex);
                raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
        }

        if (current->pi_state_cache) {
                kfree(pi_state);
        } else {
                /*
                 * pi_state->list is already empty.
                 * clear pi_state->owner.
                 * refcount is at 0 - put it back to 1.
                 */
                pi_state->owner = NULL;
                refcount_set(&pi_state->refcount, 1);
                current->pi_state_cache = pi_state;
        }
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]  Indicates that the kernel can acquire the futex atomically. We
 *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]  Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]  Invalid. The waiter is queued on a non PI futex
 *
 * [4]  Valid state after exit_robust_list(), which sets the user space
 *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]  The user space value got manipulated between exit_robust_list()
 *      and exit_pi_state_list()
 *
 * [6]  Valid state after exit_pi_state_list() which sets the new owner in
 *      the pi_state but cannot access the user space value.
 *
 * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]  Owner and user space value match
 *
 * [9]  There is no transient state which sets the user space TID to 0
 *      except exit_robust_list(), but this is indicated by the
 *      FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *      TID out of sync. Except one error case where the kernel is denied
 *      write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *      hb -> futex_q, relation
 *      futex_q -> pi_state, relation
 *
 *      (cannot be raw because hb can contain arbitrary amount
 *       of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *      {uval, pi_state}
 *
 *      (and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *      p->pi_state_list -> pi_state->list, relation
 *      pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *      pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
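
/*
 * Illustrative userspace-side sketch (not kernel code): the state table
 * above reasons about the user space futex word (uval/uTID/uODIED). For a
 * PI futex that 32-bit word holds the owner's TID in FUTEX_TID_MASK plus
 * two kernel-maintained flag bits. A minimal, hypothetical decoder,
 * assuming only the uapi constants from <linux/futex.h>:
 */
#include <linux/futex.h>        /* FUTEX_TID_MASK, FUTEX_WAITERS, FUTEX_OWNER_DIED */
#include <stdint.h>
#include <stdio.h>

static void decode_pi_futex_word(uint32_t uval)
{
        uint32_t tid = uval & FUTEX_TID_MASK;          /* user space owner, 0 == unowned */
        int waiters = !!(uval & FUTEX_WAITERS);        /* kernel: somebody blocks in futex_lock_pi() */
        int owner_died = !!(uval & FUTEX_OWNER_DIED);  /* kernel: robust exit found the owner dead */

        printf("owner tid=%u waiters=%d owner_died=%d\n",
               (unsigned int)tid, waiters, owner_died);
}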

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
                              struct futex_pi_state *pi_state,
                              struct futex_pi_state **ps)
{
        pid_t pid = uval & FUTEX_TID_MASK;
        u32 uval2;
        int ret;

        /*
         * Userspace might have messed up non-PI and PI futexes [3]
         */
        if (unlikely(!pi_state))
                return -EINVAL;

        /*
         * We get here with hb->lock held, and having found a
         * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
         * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
         * which in turn means that futex_lock_pi() still has a reference on
         * our pi_state.
         *
         * The waiter holding a reference on @pi_state also protects against
         * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
         * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
         * free pi_state before we can take a reference ourselves.
         */
        WARN_ON(!refcount_read(&pi_state->refcount));

        /*
         * Now that we have a pi_state, we can acquire wait_lock
         * and do the state validation.
         */
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

        /*
         * Since {uval, pi_state} is serialized by wait_lock, and our current
         * uval was read without holding it, it can have changed. Verify it
         * still is what we expect it to be, otherwise retry the entire
         * operation.
         */
        if (futex_get_value_locked(&uval2, uaddr))
                goto out_efault;

        if (uval != uval2)
                goto out_eagain;

        /*
         * Handle the owner died case:
         */
        if (uval & FUTEX_OWNER_DIED) {
                /*
                 * exit_pi_state_list sets owner to NULL and wakes the
                 * topmost waiter. The task which acquires the
                 * pi_state->rt_mutex will fixup owner.
                 */
                if (!pi_state->owner) {
                        /*
                         * No pi state owner, but the user space TID
                         * is not 0. Inconsistent state. [5]
                         */
                        if (pid)
                                goto out_einval;
                        /*
                         * Take a ref on the state and return success. [4]
                         */
                        goto out_attach;
                }

                /*
                 * If TID is 0, then either the dying owner has not
                 * yet executed exit_pi_state_list() or some waiter
                 * acquired the rtmutex in the pi state, but did not
                 * yet fixup the TID in user space.
                 *
                 * Take a ref on the state and return success. [6]
                 */
                if (!pid)
                        goto out_attach;
        } else {
                /*
                 * If the owner died bit is not set, then the pi_state
                 * must have an owner. [7]
                 */
                if (!pi_state->owner)
                        goto out_einval;
        }

        /*
         * Bail out if user space manipulated the futex value. If pi
         * state exists then the owner TID must be the same as the
         * user space TID. [9/10]
         */
        if (pid != task_pid_vnr(pi_state->owner))
                goto out_einval;

out_attach:
        get_pi_state(pi_state);
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        *ps = pi_state;
        return 0;

out_einval:
        ret = -EINVAL;
        goto out_error;

out_eagain:
        ret = -EAGAIN;
        goto out_error;

out_efault:
        ret = -EFAULT;
        goto out_error;

out_error:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        return ret;
}

static int handle_exit_race(u32 __user *uaddr, u32 uval,
                            struct task_struct *tsk)
{
        u32 uval2;

        /*
         * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
         * caller that the alleged owner is busy.
         */
        if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
                return -EBUSY;

        /*
         * Reread the user space value to handle the following situation:
         *
         * CPU0                             CPU1
         *
         * sys_exit()                       sys_futex()
         *  do_exit()                        futex_lock_pi()
         *                                    futex_lock_pi_atomic()
         *   exit_signals(tsk)                 No waiters:
         *    tsk->flags |= PF_EXITING;        *uaddr == 0x00000PID
         *  mm_release(tsk)                    Set waiter bit
         *   exit_robust_list(tsk) {           *uaddr = 0x80000PID;
         *      Set owner died                 attach_to_pi_owner() {
         *    *uaddr = 0xC0000000;              tsk = get_task(PID);
         *   }                                  if (!tsk->flags & PF_EXITING) {
         *   ...                                  attach();
         *   tsk->futex_state =                 } else {
         *      FUTEX_STATE_DEAD;                if (tsk->futex_state !=
         *                                           FUTEX_STATE_DEAD)
         *                                          return -EAGAIN;
         *                                        return -ESRCH; <--- FAIL
         *                                      }
         *
         * Returning ESRCH unconditionally is wrong here because the
         * user space value has been changed by the exiting task.
         *
         * The same logic applies to the case where the exiting task is
         * already gone.
         */
        if (futex_get_value_locked(&uval2, uaddr))
                return -EFAULT;

        /* If the user space value has changed, try again. */
        if (uval2 != uval)
                return -EAGAIN;

        /*
         * The exiting task did not have a robust list, the robust list was
         * corrupted or the user space value in *uaddr is simply bogus.
         * Give up and tell user space.
         */
        return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
                                 struct futex_pi_state **ps)
{
        /*
         * No existing pi state. First waiter. [2]
         *
         * This creates pi_state, we have hb->lock held, this means nothing can
         * observe this state, wait_lock is irrelevant.
         */
        struct futex_pi_state *pi_state = alloc_pi_state();

        /*
         * Initialize the pi_mutex in locked state and make @p
         * the owner of it:
         */
        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

        /* Store the key for possible exit cleanups: */
        pi_state->key = *key;

        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &p->pi_state_list);
        /*
         * Assignment without holding pi_state->pi_mutex.wait_lock is safe
         * because there is no concurrency as the object is not published yet.
         */
        pi_state->owner = p;

        *ps = pi_state;
}
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
                              struct futex_pi_state **ps,
                              struct task_struct **exiting)
{
        pid_t pid = uval & FUTEX_TID_MASK;
        struct task_struct *p;

        /*
         * We are the first waiter - try to look up the real owner and attach
         * the new pi_state to it, but bail out when TID = 0 [1]
         *
         * The !pid check is paranoid. None of the call sites should end up
         * with pid == 0, but better safe than sorry. Let the caller retry
         */
        if (!pid)
                return -EAGAIN;
        p = find_get_task_by_vpid(pid);
        if (!p)
                return handle_exit_race(uaddr, uval, NULL);

        if (unlikely(p->flags & PF_KTHREAD)) {
                put_task_struct(p);
                return -EPERM;
        }

        /*
         * We need to look at the task state to figure out whether the
         * task is exiting. To protect against the change of the task state
         * in futex_exit_release(), we do this protected by p->pi_lock:
         */
        raw_spin_lock_irq(&p->pi_lock);
        if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
                /*
                 * The task is on the way out. When the futex state is
                 * FUTEX_STATE_DEAD, we know that the task has finished
                 * the cleanup:
                 */
                int ret = handle_exit_race(uaddr, uval, p);

                raw_spin_unlock_irq(&p->pi_lock);
                /*
                 * If the owner task is between FUTEX_STATE_EXITING and
                 * FUTEX_STATE_DEAD then store the task pointer and keep
                 * the reference on the task struct. The calling code will
                 * drop all locks, wait for the task to reach
                 * FUTEX_STATE_DEAD and then drop the refcount. This is
                 * required to prevent a live lock when the current task
                 * preempted the exiting task between the two states.
                 */
                if (ret == -EBUSY)
                        *exiting = p;
                else
                        put_task_struct(p);
                return ret;
        }

        __attach_to_pi_owner(p, key, ps);
        raw_spin_unlock_irq(&p->pi_lock);

        put_task_struct(p);

        return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
        int err;
        u32 curval;

        if (unlikely(should_fail_futex(true)))
                return -EFAULT;

        err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
        if (unlikely(err))
                return err;

        /* If user space value changed, let the caller retry */
        return curval != uval ? -EAGAIN : 0;
}
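
/*
 * Illustrative userspace-side sketch (not kernel code): the helper above
 * only ever updates the user space futex word with a compare-and-exchange
 * against the value the caller last read, and reports -EAGAIN on a mismatch
 * so the caller can re-read and retry. A hypothetical userspace analogue of
 * that contract, assuming C11 <stdatomic.h>:
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>

static int update_futex_word(_Atomic uint32_t *uaddr, uint32_t uval, uint32_t newval)
{
        uint32_t expected = uval;

        /* Only install newval if the word still holds the value newval was based on. */
        if (atomic_compare_exchange_strong(uaddr, &expected, newval))
                return 0;

        return -EAGAIN; /* somebody changed the word concurrently; re-read and retry */
}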

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:       the pi futex user address
 * @hb:          the pi futex hash bucket
 * @key:         the futex key associated with uaddr and hb
 * @ps:          the pi_state pointer where we store the result of the
 *               lookup
 * @task:        the task to perform the atomic lock work for. This will
 *               be "current" except in the case of requeue pi.
 * @exiting:     Pointer to store the task pointer of the owner task
 *               which is in the middle of exiting
 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                         union futex_key *key,
                         struct futex_pi_state **ps,
                         struct task_struct *task,
                         struct task_struct **exiting,
                         int set_waiters)
{
        u32 uval, newval, vpid = task_pid_vnr(task);
        struct futex_q *top_waiter;
        int ret;

        /*
         * Read the user space value first so we can validate a few
         * things before proceeding further.
         */
        if (futex_get_value_locked(&uval, uaddr))
                return -EFAULT;

        if (unlikely(should_fail_futex(true)))
                return -EFAULT;

        /*
         * Detect deadlocks.
         */
        if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
                return -EDEADLK;

        if ((unlikely(should_fail_futex(true))))
                return -EDEADLK;

        /*
         * Lookup existing state first. If it exists, try to attach to
         * its pi_state.
         */
        top_waiter = futex_top_waiter(hb, key);
        if (top_waiter)
                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

        /*
         * No waiter and user TID is 0. We are here because the
         * waiters or the owner died bit is set or called from
         * requeue_cmp_pi or for whatever reason something took the
         * syscall.
         */
        if (!(uval & FUTEX_TID_MASK)) {
                /*
                 * We take over the futex. No other waiters and the user space
                 * TID is 0. We preserve the owner died bit.
                 */
                newval = uval & FUTEX_OWNER_DIED;
                newval |= vpid;

                /* The futex requeue_pi code can enforce the waiters bit */
                if (set_waiters)
                        newval |= FUTEX_WAITERS;

                ret = lock_pi_update_atomic(uaddr, uval, newval);
                if (ret)
                        return ret;

                /*
                 * If the waiter bit was requested the caller also needs PI
                 * state attached to the new owner of the user space futex.
                 *
                 * @task is guaranteed to be alive and it cannot be exiting
                 * because it is either sleeping or waiting in
                 * futex_requeue_pi_wakeup_sync().
                 *
                 * No need to do the full attach_to_pi_owner() exercise
                 * because @task is known and valid.
                 */
                if (set_waiters) {
                        raw_spin_lock_irq(&task->pi_lock);
                        __attach_to_pi_owner(task, key, ps);
                        raw_spin_unlock_irq(&task->pi_lock);
                }
                return 1;
        }

        /*
         * First waiter. Set the waiters bit before attaching ourself to
         * the owner. If owner tries to unlock, it will be forced into
         * the kernel and blocked on hb->lock.
         */
        newval = uval | FUTEX_WAITERS;
        ret = lock_pi_update_atomic(uaddr, uval, newval);
        if (ret)
                return ret;
        /*
         * If the update of the user space value succeeded, we try to
         * attach to the owner. If that fails, no harm done, we only
         * set the FUTEX_WAITERS bit in the user space variable.
         */
        return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
                         struct futex_pi_state *pi_state,
                         struct rt_mutex_waiter *top_waiter)
{
        struct task_struct *new_owner;
        bool postunlock = false;
        DEFINE_RT_WAKE_Q(wqh);
        u32 curval, newval;
        int ret = 0;

        new_owner = top_waiter->task;

        /*
         * We pass it to the next owner. The WAITERS bit is always kept
         * enabled while there is PI state around. We clean up the owner
         * died bit, because we are the owner.
         */
        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

        if (unlikely(should_fail_futex(true))) {
                ret = -EFAULT;
                goto out_unlock;
        }

        ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
        if (!ret && (curval != uval)) {
                /*
                 * If an unconditional UNLOCK_PI operation (user space did not
                 * try the TID->0 transition) raced with a waiter setting the
                 * FUTEX_WAITERS flag between get_user() and locking the hash
                 * bucket lock, retry the operation.
                 */
                if ((FUTEX_TID_MASK & curval) == uval)
                        ret = -EAGAIN;
                else
                        ret = -EINVAL;
        }

        if (!ret) {
                /*
                 * This is a point of no return; once we modified the uval
                 * there is no going back and subsequent operations must
                 * not fail.
                 */
                pi_state_update_owner(pi_state, new_owner);
                postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
        }

out_unlock:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

        if (postunlock)
                rt_mutex_postunlock(&wqh);

        return ret;
}

static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
                                  struct task_struct *argowner)
{
        struct futex_pi_state *pi_state = q->pi_state;
        struct task_struct *oldowner, *newowner;
        u32 uval, curval, newval, newtid;
        int err = 0;

        oldowner = pi_state->owner;

        /*
         * We are here because either:
         *
         *  - we stole the lock and pi_state->owner needs updating to reflect
         *    that (@argowner == current),
         *
         * or:
         *
         *  - someone stole our lock and we need to fix things to point to the
         *    new owner (@argowner == NULL).
         *
         * Either way, we have to replace the TID in the user space variable.
         * This must be atomic as we have to preserve the owner died bit here.
         *
         * Note: We write the user space value _before_ changing the pi_state
         * because we can fault here. Imagine swapped out pages or a fork
         * that marked all the anonymous memory readonly for cow.
         *
         * Modifying pi_state _before_ the user space value would leave the
         * pi_state in an inconsistent state when we fault here, because we
         * need to drop the locks to handle the fault. This might be observed
         * in the PID checks when attaching to the PI state.
         */
retry:
        if (!argowner) {
                if (oldowner != current) {
                        /*
                         * We raced against a concurrent self; things are
                         * already fixed up. Nothing to do.
                         */
                        return 0;
                }

                if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
                        /* We got the lock. pi_state is correct. Tell caller. */
                        return 1;
                }

                /*
                 * The trylock just failed, so either there is an owner or
                 * there is a higher priority waiter than this one.
                 */
                newowner = rt_mutex_owner(&pi_state->pi_mutex);
                /*
                 * If the higher priority waiter has not yet taken over the
                 * rtmutex then newowner is NULL. We can't return here with
                 * that state because it's inconsistent vs. the user space
                 * state. So drop the locks and try again. It's a valid
                 * situation and not any different from the other retry
                 * conditions.
                 */
                if (unlikely(!newowner)) {
                        err = -EAGAIN;
                        goto handle_err;
                }
        } else {
                WARN_ON_ONCE(argowner != current);
                if (oldowner == current) {
                        /*
                         * We raced against a concurrent self; things are
                         * already fixed up. Nothing to do.
                         */
                        return 1;
                }
                newowner = argowner;
        }

        newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        /* Owner died? */
        if (!pi_state->owner)
                newtid |= FUTEX_OWNER_DIED;

        err = futex_get_value_locked(&uval, uaddr);
        if (err)
                goto handle_err;

        for (;;) {
                newval = (uval & FUTEX_OWNER_DIED) | newtid;

                err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
                if (err)
                        goto handle_err;

                if (curval == uval)
                        break;
                uval = curval;
        }

        /*
         * We fixed up user space. Now we need to fix the pi_state
         * itself.
         */
        pi_state_update_owner(pi_state, newowner);

        return argowner == current;

        /*
         * In order to reschedule or handle a page fault, we need to drop the
         * locks here. In the case of a fault, this gives the other task
         * (either the highest priority waiter itself or the task which stole
         * the rtmutex) the chance to try the fixup of the pi_state. So once we
         * are back from handling the fault we need to check the pi_state after
         * reacquiring the locks and before trying to do another fixup. When
         * the fixup has been done already we simply return.
         *
         * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
         * drop hb->lock since the caller owns the hb -> futex_q relation.
         * Dropping the pi_mutex->wait_lock requires re-validating the state.
         */
handle_err:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(q->lock_ptr);

        switch (err) {
        case -EFAULT:
                err = fault_in_user_writeable(uaddr);
                break;

        case -EAGAIN:
                cond_resched();
                err = 0;
                break;

        default:
                WARN_ON_ONCE(1);
                break;
        }

        futex_q_lockptr_lock(q);
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

        /*
         * Check if someone else fixed it for us:
         */
        if (pi_state->owner != oldowner)
                return argowner == current;

        /* Retry if err was -EAGAIN or the fault-in succeeded */
        if (!err)
                goto retry;

        /*
         * fault_in_user_writeable() failed so user state is immutable. At
         * best we can make the kernel state consistent but user state will
         * be most likely hosed and any subsequent unlock operation will be
         * rejected due to PI futex rule [10].
         *
         * Ensure that the rtmutex owner is also the pi_state owner despite
         * the user space value claiming something different. There is no
         * point in unlocking the rtmutex if current is the owner as it
         * would need to wait until the next waiter has taken the rtmutex
         * to guarantee consistent state. Keep it simple. Userspace asked
         * for this wrecked state.
         *
         * The rtmutex has an owner - either current or some other
         * task. See the EAGAIN loop above.
         */
        pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

        return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
                                struct task_struct *argowner)
{
        struct futex_pi_state *pi_state = q->pi_state;
        int ret;

        lockdep_assert_held(q->lock_ptr);

        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        ret = __fixup_pi_state_owner(uaddr, q, argowner);
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:  user address of the futex
 * @q:      futex_q (contains pi_state and access to the rt_mutex)
 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to clean up
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
        if (locked) {
                /*
                 * Got the lock. We might not be the anticipated owner if we
                 * did a lock-steal - fix up the PI-state in that case:
                 *
                 * Speculative pi_state->owner read (we don't hold wait_lock);
                 * since we own the lock pi_state->owner == current is the
                 * stable state, anything else needs more attention.
                 */
                if (q->pi_state->owner != current)
                        return fixup_pi_state_owner(uaddr, q, current);
                return 1;
        }

        /*
         * If we didn't get the lock; check if anybody stole it from us. In
         * that case, we need to fix up the uval to point to them instead of
         * us, otherwise bad things happen. [10]
         *
         * Another speculative read; pi_state->owner == current is unstable
         * but needs our attention.
         */
        if (q->pi_state->owner == current)
                return fixup_pi_state_owner(uaddr, q, NULL);

        /*
         * Paranoia check. If we did not take the lock, then we should not be
         * the owner of the rt_mutex. Warn and establish consistent state.
         */
        if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
                return fixup_pi_state_owner(uaddr, q, current);

        return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the FUTEX_TRYLOCK_PI operation, with the corresponding
 * semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
        struct hrtimer_sleeper timeout, *to;
        struct task_struct *exiting = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct futex_q q = futex_q_init;
        DEFINE_WAKE_Q(wake_q);
        int res, ret;

        if (!IS_ENABLED(CONFIG_FUTEX_PI))
                return -ENOSYS;

        if (refill_pi_state_cache())
                return -ENOMEM;

        to = futex_setup_timer(time, &timeout, flags, 0);

retry:
        ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
        if (unlikely(ret != 0))
                goto out;

retry_private:
        if (1) {
                CLASS(hb, hb)(&q.key);

                futex_q_lock(&q, hb);

                ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
                                           &exiting, 0);
                if (unlikely(ret)) {
                        /*
                         * Atomic work succeeded and we got the lock,
                         * or failed. Either way, we do _not_ block.
                         */
                        switch (ret) {
                        case 1:
                                /* We got the lock. */
                                ret = 0;
                                goto out_unlock_put_key;
                        case -EFAULT:
                                goto uaddr_faulted;
                        case -EBUSY:
                        case -EAGAIN:
                                /*
                                 * Two reasons for this:
                                 * - EBUSY: Task is exiting and we just wait for the
                                 *   exit to complete.
                                 * - EAGAIN: The user space value changed.
                                 */
                                futex_q_unlock(hb);
                                /*
                                 * Handle the case where the owner is in the middle of
                                 * exiting. Wait for the exit to complete otherwise
                                 * this task might loop forever, aka. live lock.
                                 */
                                wait_for_owner_exiting(ret, exiting);
                                cond_resched();
                                goto retry;
                        default:
                                goto out_unlock_put_key;
                        }
                }

                WARN_ON(!q.pi_state);

                /*
                 * Only actually queue now that the atomic ops are done:
                 */
                __futex_queue(&q, hb, current);

                if (trylock) {
                        ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
                        /* Fixup the trylock return value: */
                        ret = ret ? 0 : -EWOULDBLOCK;
                        goto no_block;
                }

                /*
                 * Caution; releasing @hb in-scope. The hb->lock is still locked
                 * while the reference is dropped. The reference cannot be dropped
                 * after the unlock because if a user initiated resize is in progress
                 * then we might need to wake it. This cannot be done after the
                 * rt_mutex_pre_schedule() invocation. The hb will remain valid because
                 * the thread performing the resize will block on hb->lock during
                 * the requeue.
                 */
                futex_hash_put(no_free_ptr(hb));
                /*
                 * Must be done before we enqueue the waiter, here is unfortunately
                 * under the hb lock, but that *should* work because it does nothing.
                 */
                rt_mutex_pre_schedule();

                rt_mutex_init_waiter(&rt_waiter);

                /*
                 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
                 * hold it while doing rt_mutex_start_proxy(), because then it will
                 * include hb->lock in the blocking chain, even though we'll not in
                 * fact hold it while blocking. This will lead it to report -EDEADLK
                 * and BUG when futex_unlock_pi() interleaves with this.
                 *
                 * Therefore acquire wait_lock while holding hb->lock, but drop the
                 * latter before calling __rt_mutex_start_proxy_lock(). This
                 * interleaves with futex_unlock_pi() -- which does a similar lock
                 * handoff -- such that the latter can observe the futex_q::pi_state
                 * before __rt_mutex_start_proxy_lock() is done.
                 */
                raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
                spin_unlock(q.lock_ptr);
                /*
                 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
                 * such that futex_unlock_pi() is guaranteed to observe the waiter when
                 * it sees the futex_q::pi_state.
                 */
                ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
                raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);

                if (ret) {
                        if (ret == 1)
                                ret = 0;
                        goto cleanup;
                }

                if (unlikely(to))
                        hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

                ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
                /*
                 * If we failed to acquire the lock (deadlock/signal/timeout), we must
                 * unwind the above, however we cannot lock hb->lock because
                 * rt_mutex already has a waiter enqueued and hb->lock can itself try
                 * and enqueue an rt_waiter through rtlock.
                 *
                 * Doing the cleanup without holding hb->lock can cause inconsistent
                 * state between hb and pi_state, but only in the direction of not
                 * seeing a waiter that is leaving.
                 *
                 * See futex_unlock_pi(), it deals with this inconsistency.
                 *
                 * There be dragons here, since we must deal with the inconsistency on
                 * the way out (here), it is impossible to detect/warn about the race
                 * the other way around (missing an incoming waiter).
                 *
                 * What could possibly go wrong...
                 */
                if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
                        ret = 0;

                /*
                 * Now that the rt_waiter has been dequeued, it is safe to use
                 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
                 * the pi_state owner.
                 */
                futex_q_lockptr_lock(&q);
                /*
                 * Waiter is unqueued.
                 */
                rt_mutex_post_schedule();
no_block:
                /*
                 * Fixup the pi_state owner and possibly acquire the lock if we
                 * haven't already.
                 */
                res = fixup_pi_owner(uaddr, &q, !ret);
                /*
                 * If fixup_pi_owner() returned an error, propagate that. If it acquired
                 * the lock, clear our -ETIMEDOUT or -EINTR.
                 */
                if (res)
                        ret = (res < 0) ? res : 0;

                futex_unqueue_pi(&q);
                spin_unlock(q.lock_ptr);
                if (q.drop_hb_ref) {
                        CLASS(hb, hb)(&q.key);
                        /* Additional reference from futex_unlock_pi() */
                        futex_hash_put(hb);
                }
                goto out;

out_unlock_put_key:
                futex_q_unlock(hb);
                goto out;

uaddr_faulted:
                futex_q_unlock(hb);

                ret = fault_in_user_writeable(uaddr);
                if (ret)
                        goto out;

                if (!(flags & FLAGS_SHARED))
                        goto retry_private;

                goto retry;
        }

out:
        if (to) {
                hrtimer_cancel(&to->timer);
                destroy_hrtimer_on_stack(&to->timer);
        }
        return ret != -EINTR ? ret : -ERESTARTNOINTR;
}
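
/*
 * Illustrative userspace-side sketch (not kernel code): the comment above
 * futex_lock_pi() refers to the 0 -> TID fast path that user space tries
 * before entering the kernel. A minimal, hypothetical lock helper, assuming
 * C11 atomics, the uapi <linux/futex.h> constants and a raw FUTEX_LOCK_PI
 * syscall; my_tid must be the caller's kernel TID (gettid()):
 */
#include <errno.h>
#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pi_futex_lock(_Atomic uint32_t *uaddr, uint32_t my_tid)
{
        uint32_t expected = 0;

        /* Fast path: an unowned futex is taken with a single 0 -> TID cmpxchg. */
        if (atomic_compare_exchange_strong(uaddr, &expected, my_tid))
                return 0;

        /*
         * Slow path: the kernel queues us on the rt_mutex, boosts the owner
         * and, on success, rewrites the word so that it holds our TID (the
         * FUTEX_WAITERS and FUTEX_OWNER_DIED bits may also be set).
         */
        if (syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0) == -1)
                return -errno;

        /* A set FUTEX_OWNER_DIED bit means the previous owner died with the
         * lock held; the caller may have to recover the protected state. */
        return (atomic_load(uaddr) & FUTEX_OWNER_DIED) ? EOWNERDEAD : 0;
}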

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
        u32 curval, uval, vpid = task_pid_vnr(current);
        union futex_key key = FUTEX_KEY_INIT;
        struct futex_q *top_waiter;
        int ret;

        if (!IS_ENABLED(CONFIG_FUTEX_PI))
                return -ENOSYS;

retry:
        if (get_user(uval, uaddr))
                return -EFAULT;
        /*
         * We release only a lock we actually own:
         */
        if ((uval & FUTEX_TID_MASK) != vpid)
                return -EPERM;

        ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
        if (ret)
                return ret;

        CLASS(hb, hb)(&key);
        spin_lock(&hb->lock);
retry_hb:

        /*
         * Check waiters first. We do not trust user space values at
         * all and we at least want to know if user space fiddled
         * with the futex value instead of blindly unlocking.
         */
        top_waiter = futex_top_waiter(hb, &key);
        if (top_waiter) {
                struct futex_pi_state *pi_state = top_waiter->pi_state;
                struct rt_mutex_waiter *rt_waiter;

                ret = -EINVAL;
                if (!pi_state)
                        goto out_unlock;

                /*
                 * If current does not own the pi_state then the futex is
                 * inconsistent and user space fiddled with the futex value.
                 */
                if (pi_state->owner != current)
                        goto out_unlock;

                /*
                 * By taking wait_lock while still holding hb->lock, we ensure
                 * there is no point where we hold neither; and thereby
                 * wake_futex_pi() must observe any new waiters.
                 *
                 * Since the cleanup: case in futex_lock_pi() removes the
                 * rt_waiter without holding hb->lock, it is possible for
                 * wake_futex_pi() to not find a waiter while the above does,
                 * in this case the waiter is on the way out and it can be
                 * ignored.
                 *
                 * In particular; this forces __rt_mutex_start_proxy() to
                 * complete such that we're guaranteed to observe the
                 * rt_waiter.
                 */
                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

                /*
                 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
                 * waiters even though futex thinks there are, then the waiter
                 * is leaving. The entry needs to be removed from the list so a
                 * new futex_lock_pi() is not using this stale PI-state while
                 * the futex is available in user space again.
                 * There can be more than one task on its way out so it needs
                 * to retry.
                 */
                rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
                if (!rt_waiter) {
                        /*
                         * Acquire a reference for the leaving waiter to ensure
                         * valid futex_q::lock_ptr.
                         */
                        futex_hash_get(hb);
                        top_waiter->drop_hb_ref = true;
                        __futex_unqueue(top_waiter);
                        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
                        goto retry_hb;
                }

                get_pi_state(pi_state);
                spin_unlock(&hb->lock);

                /* drops pi_state->pi_mutex.wait_lock */
                ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);

                put_pi_state(pi_state);

                /*
                 * Success, we're done! No tricky corner cases.
                 */
                if (!ret)
                        return ret;
                /*
                 * The atomic access to the futex value generated a
                 * pagefault, so retry the user-access and the wakeup:
                 */
                if (ret == -EFAULT)
                        goto pi_faulted;
                /*
                 * An unconditional UNLOCK_PI op raced against a waiter
                 * setting the FUTEX_WAITERS bit. Try again.
                 */
                if (ret == -EAGAIN)
                        goto pi_retry;
                /*
                 * wake_futex_pi has detected invalid state. Tell user
                 * space.
                 */
                return ret;
        }
1250
1251
/*
1252
* We have no kernel internal state, i.e. no waiters in the
1253
* kernel. Waiters which are about to queue themselves are stuck
1254
* on hb->lock. So we can safely ignore them. We do neither
1255
* preserve the WAITERS bit not the OWNER_DIED one. We are the
1256
* owner.
1257
*/
1258
if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
1259
spin_unlock(&hb->lock);
1260
switch (ret) {
1261
case -EFAULT:
1262
goto pi_faulted;
1263
1264
case -EAGAIN:
1265
goto pi_retry;
1266
1267
default:
1268
WARN_ON_ONCE(1);
1269
return ret;
1270
}
1271
}
1272
1273
/*
1274
* If uval has changed, let user space handle it.
1275
*/
1276
ret = (curval == uval) ? 0 : -EAGAIN;
1277
1278
out_unlock:
1279
spin_unlock(&hb->lock);
1280
return ret;
1281
1282
pi_retry:
1283
cond_resched();
1284
goto retry;
1285
1286
pi_faulted:
1287
1288
ret = fault_in_user_writeable(uaddr);
1289
if (!ret)
1290
goto retry;
1291
1292
return ret;
1293
}
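
/*
 * Illustrative userspace-side sketch (not kernel code): the user space
 * counterpart of futex_unlock_pi() above. A minimal, hypothetical unlock
 * helper, assuming C11 atomics and a raw FUTEX_UNLOCK_PI syscall; when no
 * waiter has set FUTEX_WAITERS, the TID -> 0 transition succeeds in user
 * space and the kernel is never entered:
 */
#include <errno.h>
#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pi_futex_unlock(_Atomic uint32_t *uaddr, uint32_t my_tid)
{
        uint32_t expected = my_tid;

        /* Fast path: we are the owner and no flag bits are set. */
        if (atomic_compare_exchange_strong(uaddr, &expected, 0))
                return 0;

        /*
         * Slow path: FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set, so the
         * kernel must hand the lock to the top waiter (futex_unlock_pi()).
         */
        if (syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0) == -1)
                return -errno;

        return 0;
}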