// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

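/*
 * For orientation: the user space futex word validated below packs the
 * owner TID and two status bits (uapi values from <linux/futex.h>):
 *
 *	FUTEX_WAITERS		0x80000000 - kernel side waiters exist
 *	FUTEX_OWNER_DIED	0x40000000 - robust owner exited
 *	FUTEX_TID_MASK		0x3fffffff - TID of the owner
 *
 * Illustrative user space lock fast path (a sketch, not kernel code)
 * which produces the uTID values checked in the table below:
 *
 *	if (cmpxchg(uaddr, 0, gettid()) != 0)
 *		futex(uaddr, FUTEX_LOCK_PI, ...);
 */
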
/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex.
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list().
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match.
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4].
 *
 * [10]	There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain an arbitrary number
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

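/*
 * Sketch of the resulting canonical nesting when all three locks are
 * taken (illustrative only; most paths take a subset):
 *
 *	spin_lock(&hb->lock);
 *	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 *	raw_spin_lock(&p->pi_lock);
 *	...
 *	raw_spin_unlock(&p->pi_lock);
 *	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 *	spin_unlock(&hb->lock);
 */
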
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list() sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fix up the owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

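/*
 * Typical usage (see futex_lock_pi_atomic() below): with hb->lock held
 * and a top waiter found, the freshly read uval is validated against the
 * existing pi_state. On success *ps holds a reference which the caller
 * must drop via put_pi_state():
 *
 *	top_waiter = futex_top_waiter(hb, key);
 *	if (top_waiter)
 *		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
 */
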
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *				  futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *					...
	 *					attach();
	 *   tsk->futex_state =		     } else {
	 *	FUTEX_STATE_DEAD;	       if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates the pi_state. We have hb->lock held, which means
	 * nothing can observe this state yet, so wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

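/*
 * Both call sites of __attach_to_pi_owner() hold p->pi_lock, which
 * serializes the list_add() against exit_pi_state_list() walking
 * p->pi_state_list. The required calling convention, as used below and
 * in futex_lock_pi_atomic():
 *
 *	raw_spin_lock_irq(&p->pi_lock);
 *	__attach_to_pi_owner(p, key, ps);
 *	raw_spin_unlock_irq(&p->pi_lock);
 */
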
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

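/*
 * Example use of lock_pi_update_atomic() from the takeover path in
 * futex_lock_pi_atomic() below: claim an unowned futex while preserving
 * the owner died bit:
 *
 *	newval = (uval & FUTEX_OWNER_DIED) | vpid;
 *	ret = lock_pi_update_atomic(uaddr, uval, newval);
 */
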
/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for. This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the waiters
	 * bit or the owner died bit is set, we were called from the
	 * requeue PI path, or something else made user space take the
	 * syscall slow path.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourselves to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

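/*
 * Sketch of how futex_lock_pi() below consumes the tristate result (see
 * the switch statement there):
 *
 *	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state,
 *				   current, &exiting, 0);
 *	ret == 1      -> lock acquired, return success
 *	ret == 0      -> queue the waiter and block on the rt_mutex
 *	ret == -EBUSY -> wait_for_owner_exiting() and retry
 */
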
/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
			 struct futex_pi_state *pi_state,
			 struct rt_mutex_waiter *top_waiter)
{
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We clean up the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

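/*
 * Illustration of the uval transition performed by wake_futex_pi() on a
 * successful hand off from the unlocking owner to the top waiter:
 *
 *	before:	[FUTEX_WAITERS |] [FUTEX_OWNER_DIED |] ownerTID
 *	after:	 FUTEX_WAITERS | waiterTID
 *
 * The WAITERS bit stays set because PI state stays around; the owner
 * died bit is cleared because the new owner is alive by definition.
 */
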
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to the PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidation.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	futex_q_lockptr_lock(q);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault-in succeeded. */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

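/*
 * Return convention of the fixup above, as consumed by fixup_pi_owner():
 *
 *	 1 - pi_state owner is current (we own the lock)
 *	 0 - pi_state owner is some other task; nothing to fix for us
 *	<0 - fault_in_user_writeable() failed; the user space value is
 *	     immutable and only the kernel state was made consistent
 */
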
/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock, check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

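/*
 * Callers (futex_lock_pi() below and the requeue PI wait path) invoke
 * this right after the attempt to take the rt_mutex, e.g.:
 *
 *	res = fixup_pi_owner(uaddr, &q, !ret);
 *	if (res)
 *		ret = (res < 0) ? res : 0;
 */
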
/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the futex trylock_pi() implementation, with the
 * corresponding semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_q q = futex_q_init;
	DEFINE_WAKE_Q(wake_q);
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	if (1) {
		CLASS(hb, hb)(&q.key);

		futex_q_lock(&q, hb);

		ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
					   &exiting, 0);
		if (unlikely(ret)) {
			/*
			 * Atomic work succeeded and we got the lock,
			 * or failed. Either way, we do _not_ block.
			 */
			switch (ret) {
			case 1:
				/* We got the lock. */
				ret = 0;
				goto out_unlock_put_key;
			case -EFAULT:
				goto uaddr_faulted;
			case -EBUSY:
			case -EAGAIN:
				/*
				 * Two reasons for this:
				 * - EBUSY: Task is exiting and we just wait for the
				 *   exit to complete.
				 * - EAGAIN: The user space value changed.
				 */
				futex_q_unlock(hb);
				/*
				 * Handle the case where the owner is in the middle of
				 * exiting. Wait for the exit to complete otherwise
				 * this task might loop forever, aka live lock.
				 */
				wait_for_owner_exiting(ret, exiting);
				cond_resched();
				goto retry;
			default:
				goto out_unlock_put_key;
			}
		}

		WARN_ON(!q.pi_state);

		/*
		 * Only actually queue now that the atomic ops are done:
		 */
		__futex_queue(&q, hb, current);

		if (trylock) {
			ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
			/* Fixup the trylock return value: */
			ret = ret ? 0 : -EWOULDBLOCK;
			goto no_block;
		}

		/*
		 * Caution: releasing @hb in-scope. The hb->lock is still locked
		 * while the reference is dropped. The reference cannot be dropped
		 * after the unlock because if a user initiated resize is in progress
		 * then we might need to wake it. This cannot be done after the
		 * rt_mutex_pre_schedule() invocation. The hb will remain valid
		 * because the thread performing the resize will block on hb->lock
		 * during the requeue.
		 */
		futex_hash_put(no_free_ptr(hb));
		/*
		 * Must be done before we enqueue the waiter. Here it is
		 * unfortunately done under the hb lock, but that *should* work
		 * because it does nothing.
		 */
		rt_mutex_pre_schedule();

		rt_mutex_init_waiter(&rt_waiter);

		/*
		 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
		 * hold it while doing rt_mutex_start_proxy(), because then it will
		 * include hb->lock in the blocking chain, even though we'll not in
		 * fact hold it while blocking. This will lead it to report -EDEADLK
		 * and BUG when futex_unlock_pi() interleaves with this.
		 *
		 * Therefore acquire wait_lock while holding hb->lock, but drop the
		 * latter before calling __rt_mutex_start_proxy_lock(). This
		 * interleaves with futex_unlock_pi() -- which does a similar lock
		 * handoff -- such that the latter can observe the futex_q::pi_state
		 * before __rt_mutex_start_proxy_lock() is done.
		 */
		raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
		spin_unlock(q.lock_ptr);
		/*
		 * __rt_mutex_start_proxy_lock() unconditionally enqueues the
		 * @rt_waiter such that futex_unlock_pi() is guaranteed to observe
		 * the waiter when it sees the futex_q::pi_state.
		 */
		ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
		raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);

		if (ret) {
			if (ret == 1)
				ret = 0;
			goto cleanup;
		}

		if (unlikely(to))
			hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

		ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
		/*
		 * If we failed to acquire the lock (deadlock/signal/timeout), we must
		 * unwind the above, however we cannot lock hb->lock because
		 * rt_mutex already has a waiter enqueued and hb->lock can itself try
		 * and enqueue an rt_waiter through rtlock.
		 *
		 * Doing the cleanup without holding hb->lock can cause inconsistent
		 * state between hb and pi_state, but only in the direction of not
		 * seeing a waiter that is leaving.
		 *
		 * See futex_unlock_pi(), it deals with this inconsistency.
		 *
		 * There be dragons here, since we must deal with the inconsistency on
		 * the way out (here), it is impossible to detect/warn about the race
		 * the other way around (missing an incoming waiter).
		 *
		 * What could possibly go wrong...
		 */
		if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
			ret = 0;

		/*
		 * Now that the rt_waiter has been dequeued, it is safe to use
		 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
		 * the pi_state.
		 */
		futex_q_lockptr_lock(&q);
		/*
		 * Waiter is unqueued.
		 */
		rt_mutex_post_schedule();
no_block:
		/*
		 * Fixup the pi_state owner and possibly acquire the lock if we
		 * haven't already.
		 */
		res = fixup_pi_owner(uaddr, &q, !ret);
		/*
		 * If fixup_pi_owner() returned an error, propagate that. If it
		 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
		 */
		if (res)
			ret = (res < 0) ? res : 0;

		futex_unqueue_pi(&q);
		spin_unlock(q.lock_ptr);
		if (q.drop_hb_ref) {
			CLASS(hb, hb)(&q.key);
			/* Additional reference from futex_unlock_pi() */
			futex_hash_put(hb);
		}
		goto out;

out_unlock_put_key:
		futex_q_unlock(hb);
		goto out;

uaddr_faulted:
		futex_q_unlock(hb);

		ret = fault_in_user_writeable(uaddr);
		if (ret)
			goto out;

		if (!(flags & FLAGS_SHARED))
			goto retry_private;

		goto retry;
	}

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;
}

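/*
 * Illustrative user space unlock fast path (a sketch, not kernel code)
 * which funnels into futex_unlock_pi() below when it fails:
 *
 *	if (cmpxchg(uaddr, gettid(), 0) != gettid())
 *		futex(uaddr, FUTEX_UNLOCK_PI, ...);
 *
 * The cmpxchg fails once FUTEX_WAITERS or FUTEX_OWNER_DIED is set,
 * which forces the TID -> 0 transition into the kernel.
 */
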
/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	CLASS(hb, hb)(&key);
	spin_lock(&hb->lock);
retry_hb:

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;
		struct rt_mutex_waiter *rt_waiter;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and thereby
		 * wake_futex_pi() must observe any new waiters.
		 *
		 * Since the cleanup: case in futex_lock_pi() removes the
		 * rt_waiter without holding hb->lock, it is possible for
		 * wake_futex_pi() to not find a waiter while the above does,
		 * in this case the waiter is on the way out and it can be
		 * ignored.
		 *
		 * In particular, this forces __rt_mutex_start_proxy() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter.
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

		/*
		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
		 * waiters even though futex thinks there are, then the waiter
		 * is leaving. The entry needs to be removed from the list so a
		 * new futex_lock_pi() is not using this stale PI-state while
		 * the futex is available in user space again.
		 * There can be more than one task on its way out so it needs
		 * to retry.
		 */
		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
		if (!rt_waiter) {
			/*
			 * Acquire a reference for the leaving waiter to ensure
			 * valid futex_q::lock_ptr.
			 */
			futex_hash_get(hb);
			top_waiter->drop_hb_ref = true;
			__futex_unqueue(top_waiter);
			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
			goto retry_hb;
		}

		get_pi_state(pi_state);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi() has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

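	/*
	 * Illustration of the transition attempted below when there are
	 * no kernel side waiters (unconditional TID -> 0):
	 *
	 *	before:	[FUTEX_WAITERS |] [FUTEX_OWNER_DIED |] ourTID
	 *	after:	0
	 */
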
	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We preserve neither
	 * the WAITERS bit nor the OWNER_DIED one. We are the owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}