1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* linux/fs/exec.c
4
*
5
* Copyright (C) 1991, 1992 Linus Torvalds
6
*/
7
8
/*
9
* #!-checking implemented by tytso.
10
*/
11
/*
12
* Demand-loading implemented 01.12.91 - no need to read anything but
13
* the header into memory. The inode of the executable is put into
14
* "current->executable", and page faults do the actual loading. Clean.
15
*
16
* Once more I can proudly say that linux stood up to being changed: it
17
* was less than 2 hours work to get demand-loading completely implemented.
18
*
19
* Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
20
* current->executable is only used by the procfs. This allows a dispatch
21
* table to check for several different types of binary formats. We keep
22
* trying until we recognize the file or we run out of supported binary
23
* formats.
24
*/
25
26
#include <linux/kernel_read_file.h>
27
#include <linux/slab.h>
28
#include <linux/file.h>
29
#include <linux/fdtable.h>
30
#include <linux/mm.h>
31
#include <linux/stat.h>
32
#include <linux/fcntl.h>
33
#include <linux/swap.h>
34
#include <linux/string.h>
35
#include <linux/init.h>
36
#include <linux/sched/mm.h>
37
#include <linux/sched/coredump.h>
38
#include <linux/sched/signal.h>
39
#include <linux/sched/numa_balancing.h>
40
#include <linux/sched/task.h>
41
#include <linux/pagemap.h>
42
#include <linux/perf_event.h>
43
#include <linux/highmem.h>
44
#include <linux/spinlock.h>
45
#include <linux/key.h>
46
#include <linux/personality.h>
47
#include <linux/binfmts.h>
48
#include <linux/utsname.h>
49
#include <linux/pid_namespace.h>
50
#include <linux/module.h>
51
#include <linux/namei.h>
52
#include <linux/mount.h>
53
#include <linux/security.h>
54
#include <linux/syscalls.h>
55
#include <linux/tsacct_kern.h>
56
#include <linux/cn_proc.h>
57
#include <linux/audit.h>
58
#include <linux/kmod.h>
59
#include <linux/fsnotify.h>
60
#include <linux/fs_struct.h>
61
#include <linux/oom.h>
62
#include <linux/compat.h>
63
#include <linux/vmalloc.h>
64
#include <linux/io_uring.h>
65
#include <linux/syscall_user_dispatch.h>
66
#include <linux/coredump.h>
67
#include <linux/time_namespace.h>
68
#include <linux/user_events.h>
69
#include <linux/rseq.h>
70
#include <linux/ksm.h>
71
72
#include <linux/uaccess.h>
73
#include <asm/mmu_context.h>
74
#include <asm/tlb.h>
75
76
#include <trace/events/task.h>
77
#include "internal.h"
78
79
#include <trace/events/sched.h>
80
81
/* For vma exec functions. */
82
#include "../mm/internal.h"
83
84
static int bprm_creds_from_file(struct linux_binprm *bprm);
85
86
int suid_dumpable = 0;
87
88
static LIST_HEAD(formats);
89
static DEFINE_RWLOCK(binfmt_lock);
90
91
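/*
 * Register a binary format handler. With @insert set the format is added
 * at the head of the list so it is tried before the existing handlers.
 */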
void __register_binfmt(struct linux_binfmt * fmt, int insert)
92
{
93
write_lock(&binfmt_lock);
94
insert ? list_add(&fmt->lh, &formats) :
95
list_add_tail(&fmt->lh, &formats);
96
write_unlock(&binfmt_lock);
97
}
98
99
EXPORT_SYMBOL(__register_binfmt);
100
101
void unregister_binfmt(struct linux_binfmt * fmt)
102
{
103
write_lock(&binfmt_lock);
104
list_del(&fmt->lh);
105
write_unlock(&binfmt_lock);
106
}
107
108
EXPORT_SYMBOL(unregister_binfmt);
109
110
static inline void put_binfmt(struct linux_binfmt * fmt)
111
{
112
module_put(fmt->module);
113
}
114
115
bool path_noexec(const struct path *path)
116
{
117
/* If it's an anonymous inode make sure that we catch any shenanigans. */
118
VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) &&
119
!(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC));
120
return (path->mnt->mnt_flags & MNT_NOEXEC) ||
121
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
122
}
123
124
#ifdef CONFIG_MMU
125
/*
126
* The nascent bprm->mm is not visible until exec_mmap() but it can
127
* use a lot of memory, account these pages in current->mm temporarily
128
* for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
129
* change the counter back via acct_arg_size(0).
130
*/
131
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
132
{
133
struct mm_struct *mm = current->mm;
134
long diff = (long)(pages - bprm->vma_pages);
135
136
if (!mm || !diff)
137
return;
138
139
bprm->vma_pages = pages;
140
add_mm_counter(mm, MM_ANONPAGES, diff);
141
}
142
143
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
144
int write)
145
{
146
struct page *page;
147
struct vm_area_struct *vma = bprm->vma;
148
struct mm_struct *mm = bprm->mm;
149
int ret;
150
151
/*
152
* Avoid relying on expanding the stack down in GUP (which
153
* does not work for STACK_GROWSUP anyway), and just do it
154
* ahead of time.
155
*/
156
if (!mmap_read_lock_maybe_expand(mm, vma, pos, write))
157
return NULL;
158
159
/*
160
* We are doing an exec(). 'current' is the process
161
* doing the exec and 'mm' is the new process's mm.
162
*/
163
ret = get_user_pages_remote(mm, pos, 1,
164
write ? FOLL_WRITE : 0,
165
&page, NULL);
166
mmap_read_unlock(mm);
167
if (ret <= 0)
168
return NULL;
169
170
if (write)
171
acct_arg_size(bprm, vma_pages(vma));
172
173
return page;
174
}
175
176
static void put_arg_page(struct page *page)
177
{
178
put_page(page);
179
}
180
181
static void free_arg_pages(struct linux_binprm *bprm)
182
{
183
}
184
185
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
186
struct page *page)
187
{
188
flush_cache_page(bprm->vma, pos, page_to_pfn(page));
189
}
190
191
static bool valid_arg_len(struct linux_binprm *bprm, long len)
192
{
193
return len <= MAX_ARG_STRLEN;
194
}
195
196
#else
197
198
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
199
{
200
}
201
202
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
203
int write)
204
{
205
struct page *page;
206
207
page = bprm->page[pos / PAGE_SIZE];
208
if (!page && write) {
209
page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
210
if (!page)
211
return NULL;
212
bprm->page[pos / PAGE_SIZE] = page;
213
}
214
215
return page;
216
}
217
218
static void put_arg_page(struct page *page)
219
{
220
}
221
222
static void free_arg_page(struct linux_binprm *bprm, int i)
223
{
224
if (bprm->page[i]) {
225
__free_page(bprm->page[i]);
226
bprm->page[i] = NULL;
227
}
228
}
229
230
static void free_arg_pages(struct linux_binprm *bprm)
231
{
232
int i;
233
234
for (i = 0; i < MAX_ARG_PAGES; i++)
235
free_arg_page(bprm, i);
236
}
237
238
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
239
struct page *page)
240
{
241
}
242
243
static bool valid_arg_len(struct linux_binprm *bprm, long len)
244
{
245
return len <= bprm->p;
246
}
247
248
#endif /* CONFIG_MMU */
249
250
/*
251
* Create a new mm_struct and populate it with a temporary stack
252
* vm_area_struct. We don't have enough context at this point to set the stack
253
* flags, permissions, and offset, so we use temporary values. We'll update
254
* them later in setup_arg_pages().
255
*/
256
static int bprm_mm_init(struct linux_binprm *bprm)
257
{
258
int err;
259
struct mm_struct *mm = NULL;
260
261
bprm->mm = mm = mm_alloc();
262
err = -ENOMEM;
263
if (!mm)
264
goto err;
265
266
/* Save current stack limit for all calculations made during exec. */
267
task_lock(current->group_leader);
268
bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
269
task_unlock(current->group_leader);
270
271
#ifndef CONFIG_MMU
272
bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
273
#else
274
err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
275
if (err)
276
goto err;
277
#endif
278
279
return 0;
280
281
err:
282
if (mm) {
283
bprm->mm = NULL;
284
mmdrop(mm);
285
}
286
287
return err;
288
}
289
290
struct user_arg_ptr {
291
#ifdef CONFIG_COMPAT
292
bool is_compat;
293
#endif
294
union {
295
const char __user *const __user *native;
296
#ifdef CONFIG_COMPAT
297
const compat_uptr_t __user *compat;
298
#endif
299
} ptr;
300
};
301
302
static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
303
{
304
const char __user *native;
305
306
#ifdef CONFIG_COMPAT
307
if (unlikely(argv.is_compat)) {
308
compat_uptr_t compat;
309
310
if (get_user(compat, argv.ptr.compat + nr))
311
return ERR_PTR(-EFAULT);
312
313
return compat_ptr(compat);
314
}
315
#endif
316
317
if (get_user(native, argv.ptr.native + nr))
318
return ERR_PTR(-EFAULT);
319
320
return native;
321
}
322
323
/*
324
* count() counts the number of strings in array ARGV.
325
*/
326
static int count(struct user_arg_ptr argv, int max)
327
{
328
int i = 0;
329
330
if (argv.ptr.native != NULL) {
331
for (;;) {
332
const char __user *p = get_user_arg_ptr(argv, i);
333
334
if (!p)
335
break;
336
337
if (IS_ERR(p))
338
return -EFAULT;
339
340
if (i >= max)
341
return -E2BIG;
342
++i;
343
344
if (fatal_signal_pending(current))
345
return -ERESTARTNOHAND;
346
cond_resched();
347
}
348
}
349
return i;
350
}
351
352
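/*
 * Count the entries of a NULL-terminated array of kernel strings,
 * capped at MAX_ARG_STRINGS.
 */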
static int count_strings_kernel(const char *const *argv)
353
{
354
int i;
355
356
if (!argv)
357
return 0;
358
359
for (i = 0; argv[i]; ++i) {
360
if (i >= MAX_ARG_STRINGS)
361
return -E2BIG;
362
if (fatal_signal_pending(current))
363
return -ERESTARTNOHAND;
364
cond_resched();
365
}
366
return i;
367
}
368
369
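/*
 * Record the lowest address the argument area may grow down to
 * (bprm->argmin) and test whether copy_strings() has crossed it.
 * Without an MMU there is no argmin and the check never triggers.
 */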
static inline int bprm_set_stack_limit(struct linux_binprm *bprm,
370
unsigned long limit)
371
{
372
#ifdef CONFIG_MMU
373
/* Avoid a pathological bprm->p. */
374
if (bprm->p < limit)
375
return -E2BIG;
376
bprm->argmin = bprm->p - limit;
377
#endif
378
return 0;
379
}
380
static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm)
381
{
382
#ifdef CONFIG_MMU
383
return bprm->p < bprm->argmin;
384
#else
385
return false;
386
#endif
387
}
388
389
/*
390
* Calculate bprm->argmin from:
391
* - _STK_LIM
392
* - ARG_MAX
393
* - bprm->rlim_stack.rlim_cur
394
* - bprm->argc
395
* - bprm->envc
396
* - bprm->p
397
*/
398
static int bprm_stack_limits(struct linux_binprm *bprm)
399
{
400
unsigned long limit, ptr_size;
401
402
/*
403
* Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
404
* (whichever is smaller) for the argv+env strings.
405
* This ensures that:
406
* - the remaining binfmt code will not run out of stack space,
407
* - the program will have a reasonable amount of stack left
408
* to work from.
409
*/
410
limit = _STK_LIM / 4 * 3;
411
limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
412
/*
413
* We've historically supported up to 32 pages (ARG_MAX)
414
* of argument strings even with small stacks
415
*/
416
limit = max_t(unsigned long, limit, ARG_MAX);
417
/* Reject totally pathological counts. */
418
if (bprm->argc < 0 || bprm->envc < 0)
419
return -E2BIG;
420
/*
421
* We must account for the size of all the argv and envp pointers to
422
* the argv and envp strings, since they will also take up space in
423
* the stack. They aren't stored until much later when we can't
424
* signal to the parent that the child has run out of stack space.
425
* Instead, calculate it here so it's possible to fail gracefully.
426
*
427
* In the case of argc = 0, make sure there is space for adding an
428
* empty string (which will bump argc to 1), to ensure confused
429
* userspace programs don't start processing from argv[1], thinking
430
* argc can never be 0, to keep them from walking envp by accident.
431
* See do_execveat_common().
432
*/
433
if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) ||
434
check_mul_overflow(ptr_size, sizeof(void *), &ptr_size))
435
return -E2BIG;
436
if (limit <= ptr_size)
437
return -E2BIG;
438
limit -= ptr_size;
439
440
return bprm_set_stack_limit(bprm, limit);
441
}
442
443
/*
444
* 'copy_strings()' copies argument/environment strings from the old
445
* process's memory to the new process's stack. The call to get_user_pages()
446
* ensures the destination page is created and not swapped out.
447
*/
448
static int copy_strings(int argc, struct user_arg_ptr argv,
449
struct linux_binprm *bprm)
450
{
451
struct page *kmapped_page = NULL;
452
char *kaddr = NULL;
453
unsigned long kpos = 0;
454
int ret;
455
456
while (argc-- > 0) {
457
const char __user *str;
458
int len;
459
unsigned long pos;
460
461
ret = -EFAULT;
462
str = get_user_arg_ptr(argv, argc);
463
if (IS_ERR(str))
464
goto out;
465
466
len = strnlen_user(str, MAX_ARG_STRLEN);
467
if (!len)
468
goto out;
469
470
ret = -E2BIG;
471
if (!valid_arg_len(bprm, len))
472
goto out;
473
474
/* We're going to work our way backwards. */
475
pos = bprm->p;
476
str += len;
477
bprm->p -= len;
478
if (bprm_hit_stack_limit(bprm))
479
goto out;
480
481
while (len > 0) {
482
int offset, bytes_to_copy;
483
484
if (fatal_signal_pending(current)) {
485
ret = -ERESTARTNOHAND;
486
goto out;
487
}
488
cond_resched();
489
490
offset = pos % PAGE_SIZE;
491
if (offset == 0)
492
offset = PAGE_SIZE;
493
494
bytes_to_copy = offset;
495
if (bytes_to_copy > len)
496
bytes_to_copy = len;
497
498
offset -= bytes_to_copy;
499
pos -= bytes_to_copy;
500
str -= bytes_to_copy;
501
len -= bytes_to_copy;
502
503
if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
504
struct page *page;
505
506
page = get_arg_page(bprm, pos, 1);
507
if (!page) {
508
ret = -E2BIG;
509
goto out;
510
}
511
512
if (kmapped_page) {
513
flush_dcache_page(kmapped_page);
514
kunmap_local(kaddr);
515
put_arg_page(kmapped_page);
516
}
517
kmapped_page = page;
518
kaddr = kmap_local_page(kmapped_page);
519
kpos = pos & PAGE_MASK;
520
flush_arg_page(bprm, kpos, kmapped_page);
521
}
522
if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
523
ret = -EFAULT;
524
goto out;
525
}
526
}
527
}
528
ret = 0;
529
out:
530
if (kmapped_page) {
531
flush_dcache_page(kmapped_page);
532
kunmap_local(kaddr);
533
put_arg_page(kmapped_page);
534
}
535
return ret;
536
}
537
538
/*
539
* Copy an argument/environment string from the kernel to the process's stack.
540
*/
541
int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
542
{
543
int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
544
unsigned long pos = bprm->p;
545
546
if (len == 0)
547
return -EFAULT;
548
if (!valid_arg_len(bprm, len))
549
return -E2BIG;
550
551
/* We're going to work our way backwards. */
552
arg += len;
553
bprm->p -= len;
554
if (bprm_hit_stack_limit(bprm))
555
return -E2BIG;
556
557
while (len > 0) {
558
unsigned int bytes_to_copy = min_t(unsigned int, len,
559
min_not_zero(offset_in_page(pos), PAGE_SIZE));
560
struct page *page;
561
562
pos -= bytes_to_copy;
563
arg -= bytes_to_copy;
564
len -= bytes_to_copy;
565
566
page = get_arg_page(bprm, pos, 1);
567
if (!page)
568
return -E2BIG;
569
flush_arg_page(bprm, pos & PAGE_MASK, page);
570
memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy);
571
put_arg_page(page);
572
}
573
574
return 0;
575
}
576
EXPORT_SYMBOL(copy_string_kernel);
577
578
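/*
 * Like copy_strings(), but the pointer array and the strings themselves
 * live in kernel memory.
 */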
static int copy_strings_kernel(int argc, const char *const *argv,
579
struct linux_binprm *bprm)
580
{
581
while (argc-- > 0) {
582
int ret = copy_string_kernel(argv[argc], bprm);
583
if (ret < 0)
584
return ret;
585
if (fatal_signal_pending(current))
586
return -ERESTARTNOHAND;
587
cond_resched();
588
}
589
return 0;
590
}
591
592
#ifdef CONFIG_MMU
593
594
/*
595
* Finalizes the stack vm_area_struct. The flags and permissions are updated,
596
* the stack is optionally relocated, and some extra space is added.
597
*/
598
int setup_arg_pages(struct linux_binprm *bprm,
599
unsigned long stack_top,
600
int executable_stack)
601
{
602
int ret;
603
unsigned long stack_shift;
604
struct mm_struct *mm = current->mm;
605
struct vm_area_struct *vma = bprm->vma;
606
struct vm_area_struct *prev = NULL;
607
vm_flags_t vm_flags;
608
unsigned long stack_base;
609
unsigned long stack_size;
610
unsigned long stack_expand;
611
unsigned long rlim_stack;
612
struct mmu_gather tlb;
613
struct vma_iterator vmi;
614
615
#ifdef CONFIG_STACK_GROWSUP
616
/* Limit stack size */
617
stack_base = bprm->rlim_stack.rlim_max;
618
619
stack_base = calc_max_stack_size(stack_base);
620
621
/* Add space for stack randomization. */
622
if (current->flags & PF_RANDOMIZE)
623
stack_base += (STACK_RND_MASK << PAGE_SHIFT);
624
625
/* Make sure we didn't let the argument array grow too large. */
626
if (vma->vm_end - vma->vm_start > stack_base)
627
return -ENOMEM;
628
629
stack_base = PAGE_ALIGN(stack_top - stack_base);
630
631
stack_shift = vma->vm_start - stack_base;
632
mm->arg_start = bprm->p - stack_shift;
633
bprm->p = vma->vm_end - stack_shift;
634
#else
635
stack_top = arch_align_stack(stack_top);
636
stack_top = PAGE_ALIGN(stack_top);
637
638
if (unlikely(stack_top < mmap_min_addr) ||
639
unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
640
return -ENOMEM;
641
642
stack_shift = vma->vm_end - stack_top;
643
644
bprm->p -= stack_shift;
645
mm->arg_start = bprm->p;
646
#endif
647
648
bprm->exec -= stack_shift;
649
650
if (mmap_write_lock_killable(mm))
651
return -EINTR;
652
653
vm_flags = VM_STACK_FLAGS;
654
655
/*
656
* Adjust stack execute permissions; explicitly enable for
657
* EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
658
* (arch default) otherwise.
659
*/
660
if (unlikely(executable_stack == EXSTACK_ENABLE_X))
661
vm_flags |= VM_EXEC;
662
else if (executable_stack == EXSTACK_DISABLE_X)
663
vm_flags &= ~VM_EXEC;
664
vm_flags |= mm->def_flags;
665
vm_flags |= VM_STACK_INCOMPLETE_SETUP;
666
667
vma_iter_init(&vmi, mm, vma->vm_start);
668
669
tlb_gather_mmu(&tlb, mm);
670
ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end,
671
vm_flags);
672
tlb_finish_mmu(&tlb);
673
674
if (ret)
675
goto out_unlock;
676
BUG_ON(prev != vma);
677
678
if (unlikely(vm_flags & VM_EXEC)) {
679
pr_warn_once("process '%pD4' started with executable stack\n",
680
bprm->file);
681
}
682
683
/* Move stack pages down in memory. */
684
if (stack_shift) {
685
/*
686
* During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
687
* the binfmt code determines where the new stack should reside, we shift it to
688
* its final location.
689
*/
690
ret = relocate_vma_down(vma, stack_shift);
691
if (ret)
692
goto out_unlock;
693
}
694
695
/* mprotect_fixup is overkill to remove the temporary stack flags */
696
vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP);
697
698
stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
699
stack_size = vma->vm_end - vma->vm_start;
700
/*
701
* Align this down to a page boundary as expand_stack
702
* will align it up.
703
*/
704
rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
705
706
stack_expand = min(rlim_stack, stack_size + stack_expand);
707
708
#ifdef CONFIG_STACK_GROWSUP
709
stack_base = vma->vm_start + stack_expand;
710
#else
711
stack_base = vma->vm_end - stack_expand;
712
#endif
713
current->mm->start_stack = bprm->p;
714
ret = expand_stack_locked(vma, stack_base);
715
if (ret)
716
ret = -EFAULT;
717
718
out_unlock:
719
mmap_write_unlock(mm);
720
return ret;
721
}
722
EXPORT_SYMBOL(setup_arg_pages);
723
724
#else
725
726
/*
727
* Transfer the program arguments and environment from the holding pages
728
* onto the stack. The provided stack pointer is adjusted accordingly.
729
*/
730
int transfer_args_to_stack(struct linux_binprm *bprm,
731
unsigned long *sp_location)
732
{
733
unsigned long index, stop, sp;
734
int ret = 0;
735
736
stop = bprm->p >> PAGE_SHIFT;
737
sp = *sp_location;
738
739
for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
740
unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
741
char *src = kmap_local_page(bprm->page[index]) + offset;
742
sp -= PAGE_SIZE - offset;
743
if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
744
ret = -EFAULT;
745
kunmap_local(src);
746
if (ret)
747
goto out;
748
}
749
750
bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE;
751
*sp_location = sp;
752
753
out:
754
return ret;
755
}
756
EXPORT_SYMBOL(transfer_args_to_stack);
757
758
#endif /* CONFIG_MMU */
759
760
/*
761
* On success, caller must call do_close_execat() on the returned
762
* struct file to close it.
763
*/
764
static struct file *do_open_execat(int fd, struct filename *name, int flags)
765
{
766
int err;
767
struct file *file __free(fput) = NULL;
768
struct open_flags open_exec_flags = {
769
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
770
.acc_mode = MAY_EXEC,
771
.intent = LOOKUP_OPEN,
772
.lookup_flags = LOOKUP_FOLLOW,
773
};
774
775
if ((flags &
776
~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_EXECVE_CHECK)) != 0)
777
return ERR_PTR(-EINVAL);
778
if (flags & AT_SYMLINK_NOFOLLOW)
779
open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
780
if (flags & AT_EMPTY_PATH)
781
open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
782
783
file = do_filp_open(fd, name, &open_exec_flags);
784
if (IS_ERR(file))
785
return file;
786
787
if (path_noexec(&file->f_path))
788
return ERR_PTR(-EACCES);
789
790
/*
791
* In the past the regular type check was here. It moved to may_open() in
792
* 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
793
* an invariant that all non-regular files error out before we get here.
794
*/
795
if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)))
796
return ERR_PTR(-EACCES);
797
798
err = exe_file_deny_write_access(file);
799
if (err)
800
return ERR_PTR(err);
801
802
return no_free_ptr(file);
803
}
804
805
/**
806
* open_exec - Open a path name for execution
807
*
808
* @name: path name to open with the intent of executing it.
809
*
810
* Returns ERR_PTR on failure or allocated struct file on success.
811
*
812
* As this is a wrapper for the internal do_open_execat(), callers
813
* must call exe_file_allow_write_access() before fput() on release. Also see
814
* do_close_execat().
815
*/
816
struct file *open_exec(const char *name)
817
{
818
struct filename *filename = getname_kernel(name);
819
struct file *f = ERR_CAST(filename);
820
821
if (!IS_ERR(filename)) {
822
f = do_open_execat(AT_FDCWD, filename, 0);
823
putname(filename);
824
}
825
return f;
826
}
827
EXPORT_SYMBOL(open_exec);
828
829
#if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC)
830
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
831
{
832
ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
833
if (res > 0)
834
flush_icache_user_range(addr, addr + len);
835
return res;
836
}
837
EXPORT_SYMBOL(read_code);
838
#endif
839
840
/*
841
* Maps the mm_struct mm into the current task struct.
842
* On success, this function returns with exec_update_lock
843
* held for writing.
844
*/
845
static int exec_mmap(struct mm_struct *mm)
846
{
847
struct task_struct *tsk;
848
struct mm_struct *old_mm, *active_mm;
849
int ret;
850
851
/* Notify parent that we're no longer interested in the old VM */
852
tsk = current;
853
old_mm = current->mm;
854
exec_mm_release(tsk, old_mm);
855
856
ret = down_write_killable(&tsk->signal->exec_update_lock);
857
if (ret)
858
return ret;
859
860
if (old_mm) {
861
/*
862
* If there is a pending fatal signal, perhaps a signal
863
* whose default action is to create a coredump, get
864
* out and die instead of going through with the exec.
865
*/
866
ret = mmap_read_lock_killable(old_mm);
867
if (ret) {
868
up_write(&tsk->signal->exec_update_lock);
869
return ret;
870
}
871
}
872
873
task_lock(tsk);
874
membarrier_exec_mmap(mm);
875
876
local_irq_disable();
877
active_mm = tsk->active_mm;
878
tsk->active_mm = mm;
879
tsk->mm = mm;
880
mm_init_cid(mm, tsk);
881
/*
882
* This prevents preemption while active_mm is being loaded and
883
* it and mm are being updated, which could cause problems for
884
* lazy tlb mm refcounting when these are updated by context
885
* switches. Not all architectures can handle irqs off over
886
* activate_mm yet.
887
*/
888
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
889
local_irq_enable();
890
activate_mm(active_mm, mm);
891
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
892
local_irq_enable();
893
lru_gen_add_mm(mm);
894
task_unlock(tsk);
895
lru_gen_use_mm(mm);
896
if (old_mm) {
897
mmap_read_unlock(old_mm);
898
BUG_ON(active_mm != old_mm);
899
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
900
mm_update_next_owner(old_mm);
901
mmput(old_mm);
902
return 0;
903
}
904
mmdrop_lazy_tlb(active_mm);
905
return 0;
906
}
907
908
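/*
 * Make the exec'ing task the only thread in its thread group: ask all
 * other threads to exit and wait for them; if the caller was not the
 * group leader, take over the leader's PID and release the old leader.
 * Returns -EAGAIN if another group exit or exec is already in progress
 * or a fatal signal arrives while waiting.
 */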
static int de_thread(struct task_struct *tsk)
909
{
910
struct signal_struct *sig = tsk->signal;
911
struct sighand_struct *oldsighand = tsk->sighand;
912
spinlock_t *lock = &oldsighand->siglock;
913
914
if (thread_group_empty(tsk))
915
goto no_thread_group;
916
917
/*
918
* Kill all other threads in the thread group.
919
*/
920
spin_lock_irq(lock);
921
if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
922
/*
923
* Another group action in progress, just
924
* return so that the signal is processed.
925
*/
926
spin_unlock_irq(lock);
927
return -EAGAIN;
928
}
929
930
sig->group_exec_task = tsk;
931
sig->notify_count = zap_other_threads(tsk);
932
if (!thread_group_leader(tsk))
933
sig->notify_count--;
934
935
while (sig->notify_count) {
936
__set_current_state(TASK_KILLABLE);
937
spin_unlock_irq(lock);
938
schedule();
939
if (__fatal_signal_pending(tsk))
940
goto killed;
941
spin_lock_irq(lock);
942
}
943
spin_unlock_irq(lock);
944
945
/*
946
* At this point all other threads have exited, all we have to
947
* do is to wait for the thread group leader to become inactive,
948
* and to assume its PID:
949
*/
950
if (!thread_group_leader(tsk)) {
951
struct task_struct *leader = tsk->group_leader;
952
953
for (;;) {
954
cgroup_threadgroup_change_begin(tsk);
955
write_lock_irq(&tasklist_lock);
956
/*
957
* Do this under tasklist_lock to ensure that
958
* exit_notify() can't miss ->group_exec_task
959
*/
960
sig->notify_count = -1;
961
if (likely(leader->exit_state))
962
break;
963
__set_current_state(TASK_KILLABLE);
964
write_unlock_irq(&tasklist_lock);
965
cgroup_threadgroup_change_end(tsk);
966
schedule();
967
if (__fatal_signal_pending(tsk))
968
goto killed;
969
}
970
971
/*
972
* The only record we have of the real-time age of a
973
* process, regardless of execs it's done, is start_time.
974
* All the past CPU time is accumulated in signal_struct
975
* from sister threads now dead. But in this non-leader
976
* exec, nothing survives from the original leader thread,
977
* whose birth marks the true age of this process now.
978
* When we take on its identity by switching to its PID, we
979
* also take its birthdate (always earlier than our own).
980
*/
981
tsk->start_time = leader->start_time;
982
tsk->start_boottime = leader->start_boottime;
983
984
BUG_ON(!same_thread_group(leader, tsk));
985
/*
986
* An exec() starts a new thread group with the
987
* TGID of the previous thread group. Rehash the
988
* two threads with a switched PID, and release
989
* the former thread group leader:
990
*/
991
992
/* Become a process group leader with the old leader's pid.
993
* The old leader becomes a thread of this thread group.
994
*/
995
exchange_tids(tsk, leader);
996
transfer_pid(leader, tsk, PIDTYPE_TGID);
997
transfer_pid(leader, tsk, PIDTYPE_PGID);
998
transfer_pid(leader, tsk, PIDTYPE_SID);
999
1000
list_replace_rcu(&leader->tasks, &tsk->tasks);
1001
list_replace_init(&leader->sibling, &tsk->sibling);
1002
1003
tsk->group_leader = tsk;
1004
leader->group_leader = tsk;
1005
1006
tsk->exit_signal = SIGCHLD;
1007
leader->exit_signal = -1;
1008
1009
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
1010
leader->exit_state = EXIT_DEAD;
1011
/*
1012
* We are going to release_task()->ptrace_unlink() silently,
1013
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
1014
* the tracer won't block again waiting for this thread.
1015
*/
1016
if (unlikely(leader->ptrace))
1017
__wake_up_parent(leader, leader->parent);
1018
write_unlock_irq(&tasklist_lock);
1019
cgroup_threadgroup_change_end(tsk);
1020
1021
release_task(leader);
1022
}
1023
1024
sig->group_exec_task = NULL;
1025
sig->notify_count = 0;
1026
1027
no_thread_group:
1028
/* we have changed execution domain */
1029
tsk->exit_signal = SIGCHLD;
1030
1031
BUG_ON(!thread_group_leader(tsk));
1032
return 0;
1033
1034
killed:
1035
/* protects against exit_notify() and __exit_signal() */
1036
read_lock(&tasklist_lock);
1037
sig->group_exec_task = NULL;
1038
sig->notify_count = 0;
1039
read_unlock(&tasklist_lock);
1040
return -EAGAIN;
1041
}
1042
1043
1044
/*
1045
* This function makes sure the current process has its own signal table,
1046
* so that flush_signal_handlers can later reset the handlers without
1047
* disturbing other processes. (Other processes might share the signal
1048
* table via the CLONE_SIGHAND option to clone().)
1049
*/
1050
static int unshare_sighand(struct task_struct *me)
1051
{
1052
struct sighand_struct *oldsighand = me->sighand;
1053
1054
if (refcount_read(&oldsighand->count) != 1) {
1055
struct sighand_struct *newsighand;
1056
/*
1057
* This ->sighand is shared with the CLONE_SIGHAND
1058
* but not CLONE_THREAD task, switch to the new one.
1059
*/
1060
newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1061
if (!newsighand)
1062
return -ENOMEM;
1063
1064
refcount_set(&newsighand->count, 1);
1065
1066
write_lock_irq(&tasklist_lock);
1067
spin_lock(&oldsighand->siglock);
1068
memcpy(newsighand->action, oldsighand->action,
1069
sizeof(newsighand->action));
1070
rcu_assign_pointer(me->sighand, newsighand);
1071
spin_unlock(&oldsighand->siglock);
1072
write_unlock_irq(&tasklist_lock);
1073
1074
__cleanup_sighand(oldsighand);
1075
}
1076
return 0;
1077
}
1078
1079
/*
1080
* This is unlocked -- the string will always be NUL-terminated, but
1081
* may show overlapping contents when racing with concurrent reads.
1082
*/
1083
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
1084
{
1085
size_t len = min(strlen(buf), sizeof(tsk->comm) - 1);
1086
1087
trace_task_rename(tsk, buf);
1088
memcpy(tsk->comm, buf, len);
1089
memset(&tsk->comm[len], 0, sizeof(tsk->comm) - len);
1090
perf_event_comm(tsk, exec);
1091
}
1092
1093
/*
1094
* Calling this is the point of no return. None of the failures will be
1095
* seen by userspace since either the process is already taking a fatal
1096
* signal (via de_thread() or coredump), or will have SEGV raised
1097
* (after exec_mmap()) by search_binary_handler (see below).
1098
*/
1099
int begin_new_exec(struct linux_binprm * bprm)
1100
{
1101
struct task_struct *me = current;
1102
int retval;
1103
1104
/* Once we are committed compute the creds */
1105
retval = bprm_creds_from_file(bprm);
1106
if (retval)
1107
return retval;
1108
1109
/*
1110
* This tracepoint marks the point before flushing the old exec where
1111
* the current task is still unchanged, but errors are fatal (point of
1112
* no return). The later "sched_process_exec" tracepoint is called after
1113
* the current task has successfully switched to the new exec.
1114
*/
1115
trace_sched_prepare_exec(current, bprm);
1116
1117
/*
1118
* Ensure all future errors are fatal.
1119
*/
1120
bprm->point_of_no_return = true;
1121
1122
/* Make this the only thread in the thread group */
1123
retval = de_thread(me);
1124
if (retval)
1125
goto out;
1126
/* see the comment in check_unsafe_exec() */
1127
current->fs->in_exec = 0;
1128
/*
1129
* Cancel any io_uring activity across execve
1130
*/
1131
io_uring_task_cancel();
1132
1133
/* Ensure the files table is not shared. */
1134
retval = unshare_files();
1135
if (retval)
1136
goto out;
1137
1138
/*
1139
* Must be called _before_ exec_mmap() as bprm->mm is
1140
* not visible until then. Doing it here also ensures
1141
* we don't race against replace_mm_exe_file().
1142
*/
1143
retval = set_mm_exe_file(bprm->mm, bprm->file);
1144
if (retval)
1145
goto out;
1146
1147
/* If the binary is not readable then enforce mm->dumpable=0 */
1148
would_dump(bprm, bprm->file);
1149
if (bprm->have_execfd)
1150
would_dump(bprm, bprm->executable);
1151
1152
/*
1153
* Release all of the old mmap stuff
1154
*/
1155
acct_arg_size(bprm, 0);
1156
retval = exec_mmap(bprm->mm);
1157
if (retval)
1158
goto out;
1159
1160
bprm->mm = NULL;
1161
1162
retval = exec_task_namespaces();
1163
if (retval)
1164
goto out_unlock;
1165
1166
#ifdef CONFIG_POSIX_TIMERS
1167
spin_lock_irq(&me->sighand->siglock);
1168
posix_cpu_timers_exit(me);
1169
spin_unlock_irq(&me->sighand->siglock);
1170
exit_itimers(me);
1171
flush_itimer_signals();
1172
#endif
1173
1174
/*
1175
* Make the signal table private.
1176
*/
1177
retval = unshare_sighand(me);
1178
if (retval)
1179
goto out_unlock;
1180
1181
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
1182
PF_NOFREEZE | PF_NO_SETAFFINITY);
1183
flush_thread();
1184
me->personality &= ~bprm->per_clear;
1185
1186
clear_syscall_work_syscall_user_dispatch(me);
1187
1188
/*
1189
* We have to apply CLOEXEC before we change whether the process is
1190
* dumpable (in setup_new_exec) to avoid a race with a process in userspace
1191
* trying to access the should-be-closed file descriptors of a process
1192
* undergoing exec(2).
1193
*/
1194
do_close_on_exec(me->files);
1195
1196
if (bprm->secureexec) {
1197
/* Make sure parent cannot signal privileged process. */
1198
me->pdeath_signal = 0;
1199
1200
/*
1201
* For secureexec, reset the stack limit to sane default to
1202
* avoid bad behavior from the prior rlimits. This has to
1203
* happen before arch_pick_mmap_layout(), which examines
1204
* RLIMIT_STACK, but after the point of no return to avoid
1205
* needing to clean up the change on failure.
1206
*/
1207
if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1208
bprm->rlim_stack.rlim_cur = _STK_LIM;
1209
}
1210
1211
me->sas_ss_sp = me->sas_ss_size = 0;
1212
1213
/*
1214
* Figure out dumpability. Note that checking only current here
1215
* is wrong, but userspace depends on it. This should be testing
1216
* bprm->secureexec instead.
1217
*/
1218
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
1219
!(uid_eq(current_euid(), current_uid()) &&
1220
gid_eq(current_egid(), current_gid())))
1221
set_dumpable(current->mm, suid_dumpable);
1222
else
1223
set_dumpable(current->mm, SUID_DUMP_USER);
1224
1225
perf_event_exec();
1226
1227
/*
1228
* If the original filename was empty, alloc_bprm() made up a path
1229
* that will probably not be useful to admins running ps or similar.
1230
* Let's fix it up to be something reasonable.
1231
*/
1232
if (bprm->comm_from_dentry) {
1233
/*
1234
* Hold RCU lock to keep the name from being freed behind our back.
1235
* Use acquire semantics to make sure the terminating NUL from
1236
* __d_alloc() is seen.
1237
*
1238
* Note, we're deliberately sloppy here. We don't need to care about
1239
* detecting a concurrent rename and just want a terminated name.
1240
*/
1241
rcu_read_lock();
1242
__set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name),
1243
true);
1244
rcu_read_unlock();
1245
} else {
1246
__set_task_comm(me, kbasename(bprm->filename), true);
1247
}
1248
1249
/* An exec changes our domain. We are no longer part of the thread
1250
group */
1251
WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
1252
flush_signal_handlers(me, 0);
1253
1254
retval = set_cred_ucounts(bprm->cred);
1255
if (retval < 0)
1256
goto out_unlock;
1257
1258
/*
1259
* install the new credentials for this executable
1260
*/
1261
security_bprm_committing_creds(bprm);
1262
1263
commit_creds(bprm->cred);
1264
bprm->cred = NULL;
1265
1266
/*
1267
* Disable monitoring for regular users
1268
* when executing setuid binaries. Must
1269
* wait until new credentials are committed
1270
* by commit_creds() above
1271
*/
1272
if (get_dumpable(me->mm) != SUID_DUMP_USER)
1273
perf_event_exit_task(me);
1274
/*
1275
* cred_guard_mutex must be held at least to this point to prevent
1276
* ptrace_attach() from altering our determination of the task's
1277
* credentials; any time after this it may be unlocked.
1278
*/
1279
security_bprm_committed_creds(bprm);
1280
1281
/* Pass the opened binary to the interpreter. */
1282
if (bprm->have_execfd) {
1283
retval = get_unused_fd_flags(0);
1284
if (retval < 0)
1285
goto out_unlock;
1286
fd_install(retval, bprm->executable);
1287
bprm->executable = NULL;
1288
bprm->execfd = retval;
1289
}
1290
return 0;
1291
1292
out_unlock:
1293
up_write(&me->signal->exec_update_lock);
1294
if (!bprm->cred)
1295
mutex_unlock(&me->signal->cred_guard_mutex);
1296
1297
out:
1298
return retval;
1299
}
1300
EXPORT_SYMBOL(begin_new_exec);
1301
1302
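/*
 * If the caller cannot read the executable, record that the new mm must
 * not be dumpable and switch bprm->mm->user_ns to the nearest ancestor
 * namespace that is privileged over the file's ids.
 */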
void would_dump(struct linux_binprm *bprm, struct file *file)
1303
{
1304
struct inode *inode = file_inode(file);
1305
struct mnt_idmap *idmap = file_mnt_idmap(file);
1306
if (inode_permission(idmap, inode, MAY_READ) < 0) {
1307
struct user_namespace *old, *user_ns;
1308
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
1309
1310
/* Ensure mm->user_ns contains the executable */
1311
user_ns = old = bprm->mm->user_ns;
1312
while ((user_ns != &init_user_ns) &&
1313
!privileged_wrt_inode_uidgid(user_ns, idmap, inode))
1314
user_ns = user_ns->parent;
1315
1316
if (old != user_ns) {
1317
bprm->mm->user_ns = get_user_ns(user_ns);
1318
put_user_ns(old);
1319
}
1320
}
1321
}
1322
EXPORT_SYMBOL(would_dump);
1323
1324
void setup_new_exec(struct linux_binprm * bprm)
1325
{
1326
/* Setup things that can depend upon the personality */
1327
struct task_struct *me = current;
1328
1329
arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
1330
1331
arch_setup_new_exec();
1332
1333
/* Set the new mm task size. We have to do that late because it may
1334
* depend on TIF_32BIT which is only updated in flush_thread() on
1335
* some architectures like powerpc
1336
*/
1337
me->mm->task_size = TASK_SIZE;
1338
up_write(&me->signal->exec_update_lock);
1339
mutex_unlock(&me->signal->cred_guard_mutex);
1340
}
1341
EXPORT_SYMBOL(setup_new_exec);
1342
1343
/* Runs immediately before start_thread() takes over. */
1344
void finalize_exec(struct linux_binprm *bprm)
1345
{
1346
/* Store any stack rlimit changes before starting thread. */
1347
task_lock(current->group_leader);
1348
current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
1349
task_unlock(current->group_leader);
1350
}
1351
EXPORT_SYMBOL(finalize_exec);
1352
1353
/*
1354
* Prepare credentials and lock ->cred_guard_mutex.
1355
* setup_new_exec() commits the new creds and drops the lock.
1356
* Or, if exec fails before, free_bprm() should release ->cred
1357
* and unlock.
1358
*/
1359
static int prepare_bprm_creds(struct linux_binprm *bprm)
1360
{
1361
if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
1362
return -ERESTARTNOINTR;
1363
1364
bprm->cred = prepare_exec_creds();
1365
if (likely(bprm->cred))
1366
return 0;
1367
1368
mutex_unlock(&current->signal->cred_guard_mutex);
1369
return -ENOMEM;
1370
}
1371
1372
/* Matches do_open_execat() */
1373
static void do_close_execat(struct file *file)
1374
{
1375
if (!file)
1376
return;
1377
exe_file_allow_write_access(file);
1378
fput(file);
1379
}
1380
1381
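/*
 * Undo alloc_bprm(): drop the half-built mm, release the credentials and
 * cred_guard_mutex if exec failed before they were committed, and close
 * the executable files.
 */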
static void free_bprm(struct linux_binprm *bprm)
1382
{
1383
if (bprm->mm) {
1384
acct_arg_size(bprm, 0);
1385
mmput(bprm->mm);
1386
}
1387
free_arg_pages(bprm);
1388
if (bprm->cred) {
1389
/* in case exec fails before de_thread() succeeds */
1390
current->fs->in_exec = 0;
1391
mutex_unlock(&current->signal->cred_guard_mutex);
1392
abort_creds(bprm->cred);
1393
}
1394
do_close_execat(bprm->file);
1395
if (bprm->executable)
1396
fput(bprm->executable);
1397
/* If a binfmt changed the interp, free it. */
1398
if (bprm->interp != bprm->filename)
1399
kfree(bprm->interp);
1400
kfree(bprm->fdpath);
1401
kfree(bprm);
1402
}
1403
1404
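/*
 * Allocate a linux_binprm and perform the early setup: open the file to
 * be executed, construct bprm->filename (a /dev/fd/ path for execveat()
 * callers), and create the temporary mm and stack via bprm_mm_init().
 * The file is closed again on failure.
 */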
static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags)
1405
{
1406
struct linux_binprm *bprm;
1407
struct file *file;
1408
int retval = -ENOMEM;
1409
1410
file = do_open_execat(fd, filename, flags);
1411
if (IS_ERR(file))
1412
return ERR_CAST(file);
1413
1414
bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1415
if (!bprm) {
1416
do_close_execat(file);
1417
return ERR_PTR(-ENOMEM);
1418
}
1419
1420
bprm->file = file;
1421
1422
if (fd == AT_FDCWD || filename->name[0] == '/') {
1423
bprm->filename = filename->name;
1424
} else {
1425
if (filename->name[0] == '\0') {
1426
bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
1427
bprm->comm_from_dentry = 1;
1428
} else {
1429
bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
1430
fd, filename->name);
1431
}
1432
if (!bprm->fdpath)
1433
goto out_free;
1434
1435
/*
1436
* Record that a name derived from an O_CLOEXEC fd will be
1437
* inaccessible after exec. This allows the code in exec to
1438
* choose to fail when the executable is not mmaped into the
1439
* interpreter and an open file descriptor is not passed to
1440
* the interpreter. This makes for a better user experience
1441
* than having the interpreter start and then immediately fail
1442
* when it finds the executable is inaccessible.
1443
*/
1444
if (get_close_on_exec(fd))
1445
bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
1446
1447
bprm->filename = bprm->fdpath;
1448
}
1449
bprm->interp = bprm->filename;
1450
1451
/*
1452
* At this point, security_file_open() has already been called (with
1453
* __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will
1454
* stop just after the security_bprm_creds_for_exec() call in
1455
* bprm_execve(). Indeed, the kernel should not try to parse the
1456
* content of the file with exec_binprm() nor change the calling
1457
* thread, which means that the following security functions will not
1458
* be called:
1459
* - security_bprm_check()
1460
* - security_bprm_creds_from_file()
1461
* - security_bprm_committing_creds()
1462
* - security_bprm_committed_creds()
1463
*/
1464
bprm->is_check = !!(flags & AT_EXECVE_CHECK);
1465
1466
retval = bprm_mm_init(bprm);
1467
if (!retval)
1468
return bprm;
1469
1470
out_free:
1471
free_bprm(bprm);
1472
return ERR_PTR(retval);
1473
}
1474
1475
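/*
 * Point bprm->interp at a kernel copy of @interp, freeing a previous
 * override if one was set by an earlier binfmt.
 */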
int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
1476
{
1477
/* If a binfmt changed the interp, free it first. */
1478
if (bprm->interp != bprm->filename)
1479
kfree(bprm->interp);
1480
bprm->interp = kstrdup(interp, GFP_KERNEL);
1481
if (!bprm->interp)
1482
return -ENOMEM;
1483
return 0;
1484
}
1485
EXPORT_SYMBOL(bprm_change_interp);
1486
1487
/*
1488
* determine how safe it is to execute the proposed program
1489
* - the caller must hold ->cred_guard_mutex to protect against
1490
* PTRACE_ATTACH or seccomp thread-sync
1491
*/
1492
static void check_unsafe_exec(struct linux_binprm *bprm)
1493
{
1494
struct task_struct *p = current, *t;
1495
unsigned n_fs;
1496
1497
if (p->ptrace)
1498
bprm->unsafe |= LSM_UNSAFE_PTRACE;
1499
1500
/*
1501
* This isn't strictly necessary, but it makes it harder for LSMs to
1502
* mess up.
1503
*/
1504
if (task_no_new_privs(current))
1505
bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1506
1507
/*
1508
* If another task is sharing our fs, we cannot safely
1509
* suid exec because the differently privileged task
1510
* will be able to manipulate the current directory, etc.
1511
* It would be nice to force an unshare instead...
1512
*
1513
* Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS)
1514
* from another sub-thread until de_thread() succeeds, this
1515
* state is protected by cred_guard_mutex we hold.
1516
*/
1517
n_fs = 1;
1518
read_seqlock_excl(&p->fs->seq);
1519
rcu_read_lock();
1520
for_other_threads(p, t) {
1521
if (t->fs == p->fs)
1522
n_fs++;
1523
}
1524
rcu_read_unlock();
1525
1526
/* "users" and "in_exec" locked for copy_fs() */
1527
if (p->fs->users > n_fs)
1528
bprm->unsafe |= LSM_UNSAFE_SHARE;
1529
else
1530
p->fs->in_exec = 1;
1531
read_sequnlock_excl(&p->fs->seq);
1532
}
1533
1534
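/*
 * If the binary is setuid/setgid on a mount that allows it, and the ids
 * are mapped in the namespace of the new credentials, set the pending
 * euid/egid and request that the setid personality bits be cleared.
 * no_new_privs disables this entirely.
 */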
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
1535
{
1536
/* Handle suid and sgid on files */
1537
struct mnt_idmap *idmap;
1538
struct inode *inode = file_inode(file);
1539
unsigned int mode;
1540
vfsuid_t vfsuid;
1541
vfsgid_t vfsgid;
1542
int err;
1543
1544
if (!mnt_may_suid(file->f_path.mnt))
1545
return;
1546
1547
if (task_no_new_privs(current))
1548
return;
1549
1550
mode = READ_ONCE(inode->i_mode);
1551
if (!(mode & (S_ISUID|S_ISGID)))
1552
return;
1553
1554
idmap = file_mnt_idmap(file);
1555
1556
/* Be careful if suid/sgid is set */
1557
inode_lock(inode);
1558
1559
/* Atomically reload and check mode/uid/gid now that lock held. */
1560
mode = inode->i_mode;
1561
vfsuid = i_uid_into_vfsuid(idmap, inode);
1562
vfsgid = i_gid_into_vfsgid(idmap, inode);
1563
err = inode_permission(idmap, inode, MAY_EXEC);
1564
inode_unlock(inode);
1565
1566
/* Did the exec bit vanish out from under us? Give up. */
1567
if (err)
1568
return;
1569
1570
/* We ignore suid/sgid if there are no mappings for them in the ns */
1571
if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) ||
1572
!vfsgid_has_mapping(bprm->cred->user_ns, vfsgid))
1573
return;
1574
1575
if (mode & S_ISUID) {
1576
bprm->per_clear |= PER_CLEAR_ON_SETID;
1577
bprm->cred->euid = vfsuid_into_kuid(vfsuid);
1578
}
1579
1580
if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1581
bprm->per_clear |= PER_CLEAR_ON_SETID;
1582
bprm->cred->egid = vfsgid_into_kgid(vfsgid);
1583
}
1584
}
1585
1586
/*
1587
* Compute bprm->cred based upon the final binary.
1588
*/
1589
static int bprm_creds_from_file(struct linux_binprm *bprm)
1590
{
1591
/* Compute creds based on which file? */
1592
struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
1593
1594
bprm_fill_uid(bprm, file);
1595
return security_bprm_creds_from_file(bprm, file);
1596
}
1597
1598
/*
1599
* Fill the binprm structure from the inode.
1600
* Read the first BINPRM_BUF_SIZE bytes
1601
*
1602
* This may be called multiple times for binary chains (scripts for example).
1603
*/
1604
static int prepare_binprm(struct linux_binprm *bprm)
1605
{
1606
loff_t pos = 0;
1607
1608
memset(bprm->buf, 0, BINPRM_BUF_SIZE);
1609
return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
1610
}
1611
1612
/*
1613
* Arguments are '\0' separated strings found at the location bprm->p
1614
* points to; chop off the first by relocating bprm->p to right after
1615
* the first '\0' encountered.
1616
*/
1617
int remove_arg_zero(struct linux_binprm *bprm)
1618
{
1619
unsigned long offset;
1620
char *kaddr;
1621
struct page *page;
1622
1623
if (!bprm->argc)
1624
return 0;
1625
1626
do {
1627
offset = bprm->p & ~PAGE_MASK;
1628
page = get_arg_page(bprm, bprm->p, 0);
1629
if (!page)
1630
return -EFAULT;
1631
kaddr = kmap_local_page(page);
1632
1633
for (; offset < PAGE_SIZE && kaddr[offset];
1634
offset++, bprm->p++)
1635
;
1636
1637
kunmap_local(kaddr);
1638
put_arg_page(page);
1639
} while (offset == PAGE_SIZE);
1640
1641
bprm->p++;
1642
bprm->argc--;
1643
1644
return 0;
1645
}
1646
EXPORT_SYMBOL(remove_arg_zero);
1647
1648
/*
1649
* cycle the list of binary formats handler, until one recognizes the image
1650
*/
1651
static int search_binary_handler(struct linux_binprm *bprm)
1652
{
1653
struct linux_binfmt *fmt;
1654
int retval;
1655
1656
retval = prepare_binprm(bprm);
1657
if (retval < 0)
1658
return retval;
1659
1660
retval = security_bprm_check(bprm);
1661
if (retval)
1662
return retval;
1663
1664
read_lock(&binfmt_lock);
1665
list_for_each_entry(fmt, &formats, lh) {
1666
if (!try_module_get(fmt->module))
1667
continue;
1668
read_unlock(&binfmt_lock);
1669
1670
retval = fmt->load_binary(bprm);
1671
1672
read_lock(&binfmt_lock);
1673
put_binfmt(fmt);
1674
if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
1675
read_unlock(&binfmt_lock);
1676
return retval;
1677
}
1678
}
1679
read_unlock(&binfmt_lock);
1680
1681
return -ENOEXEC;
1682
}
1683
1684
/* binfmt handlers will call back into begin_new_exec() on success. */
1685
static int exec_binprm(struct linux_binprm *bprm)
1686
{
1687
pid_t old_pid, old_vpid;
1688
int ret, depth;
1689
1690
/* Need to fetch pid before load_binary changes it */
1691
old_pid = current->pid;
1692
rcu_read_lock();
1693
old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1694
rcu_read_unlock();
1695
1696
/* This allows 4 levels of binfmt rewrites before failing hard. */
1697
for (depth = 0;; depth++) {
1698
struct file *exec;
1699
if (depth > 5)
1700
return -ELOOP;
1701
1702
ret = search_binary_handler(bprm);
1703
if (ret < 0)
1704
return ret;
1705
if (!bprm->interpreter)
1706
break;
1707
1708
exec = bprm->file;
1709
bprm->file = bprm->interpreter;
1710
bprm->interpreter = NULL;
1711
1712
exe_file_allow_write_access(exec);
1713
if (unlikely(bprm->have_execfd)) {
1714
if (bprm->executable) {
1715
fput(exec);
1716
return -ENOEXEC;
1717
}
1718
bprm->executable = exec;
1719
} else
1720
fput(exec);
1721
}
1722
1723
audit_bprm(bprm);
1724
trace_sched_process_exec(current, old_pid, bprm);
1725
ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1726
proc_exec_connector(current);
1727
return 0;
1728
}
1729
1730
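/*
 * Run one exec attempt: prepare credentials, perform the unsafe-exec and
 * LSM checks, then hand the binary to the binfmt handlers. Once past the
 * point of no return a failure terminates the task with SIGSEGV instead
 * of returning to the old program.
 */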
static int bprm_execve(struct linux_binprm *bprm)
1731
{
1732
int retval;
1733
1734
retval = prepare_bprm_creds(bprm);
1735
if (retval)
1736
return retval;
1737
1738
/*
1739
* Check for unsafe execution states before exec_binprm(), which
1740
* will call back into begin_new_exec(), into bprm_creds_from_file(),
1741
* where setuid-ness is evaluated.
1742
*/
1743
check_unsafe_exec(bprm);
1744
current->in_execve = 1;
1745
sched_mm_cid_before_execve(current);
1746
1747
sched_exec();
1748
1749
/* Set the unchanging part of bprm->cred */
1750
retval = security_bprm_creds_for_exec(bprm);
1751
if (retval || bprm->is_check)
1752
goto out;
1753
1754
retval = exec_binprm(bprm);
1755
if (retval < 0)
1756
goto out;
1757
1758
sched_mm_cid_after_execve(current);
1759
rseq_execve(current);
1760
/* execve succeeded */
1761
current->in_execve = 0;
1762
user_events_execve(current);
1763
acct_update_integrals(current);
1764
task_numa_free(current, false);
1765
return retval;
1766
1767
out:
1768
/*
1769
* If past the point of no return ensure the code never
1770
* returns to the userspace process. Use an existing fatal
1771
* signal if present otherwise terminate the process with
1772
* SIGSEGV.
1773
*/
1774
if (bprm->point_of_no_return && !fatal_signal_pending(current))
1775
force_fatal_sig(SIGSEGV);
1776
1777
sched_mm_cid_after_execve(current);
1778
rseq_set_notify_resume(current);
1779
current->in_execve = 0;
1780
1781
return retval;
1782
}
1783
1784
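/*
 * Common code for execve() and execveat(): count the argument and
 * environment strings, copy them onto the new stack (adding an empty
 * argv[0] if argv is empty), and run bprm_execve(). Consumes @filename.
 */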
static int do_execveat_common(int fd, struct filename *filename,
1785
struct user_arg_ptr argv,
1786
struct user_arg_ptr envp,
1787
int flags)
1788
{
1789
struct linux_binprm *bprm;
1790
int retval;
1791
1792
if (IS_ERR(filename))
1793
return PTR_ERR(filename);
1794
1795
/*
1796
* We move the actual failure in case of RLIMIT_NPROC excess from
1797
* set*uid() to execve() because too many poorly written programs
1798
* don't check setuid() return code. Here we additionally recheck
1799
* whether NPROC limit is still exceeded.
1800
*/
1801
if ((current->flags & PF_NPROC_EXCEEDED) &&
1802
is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
1803
retval = -EAGAIN;
1804
goto out_ret;
1805
}
1806
1807
/* We're below the limit (still or again), so we don't want to make
1808
* further execve() calls fail. */
1809
current->flags &= ~PF_NPROC_EXCEEDED;
1810
1811
bprm = alloc_bprm(fd, filename, flags);
1812
if (IS_ERR(bprm)) {
1813
retval = PTR_ERR(bprm);
1814
goto out_ret;
1815
}
1816
1817
retval = count(argv, MAX_ARG_STRINGS);
1818
if (retval < 0)
1819
goto out_free;
1820
bprm->argc = retval;
1821
1822
retval = count(envp, MAX_ARG_STRINGS);
1823
if (retval < 0)
1824
goto out_free;
1825
bprm->envc = retval;
1826
1827
retval = bprm_stack_limits(bprm);
1828
if (retval < 0)
1829
goto out_free;
1830
1831
retval = copy_string_kernel(bprm->filename, bprm);
1832
if (retval < 0)
1833
goto out_free;
1834
bprm->exec = bprm->p;
1835
1836
retval = copy_strings(bprm->envc, envp, bprm);
1837
if (retval < 0)
1838
goto out_free;
1839
1840
retval = copy_strings(bprm->argc, argv, bprm);
1841
if (retval < 0)
1842
goto out_free;
1843
1844
/*
1845
* When argv is empty, add an empty string ("") as argv[0] to
1846
* ensure confused userspace programs that start processing
1847
* from argv[1] won't end up walking envp. See also
1848
* bprm_stack_limits().
1849
*/
1850
if (bprm->argc == 0) {
1851
retval = copy_string_kernel("", bprm);
1852
if (retval < 0)
1853
goto out_free;
1854
bprm->argc = 1;
1855
1856
pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
1857
current->comm, bprm->filename);
1858
}
1859
1860
retval = bprm_execve(bprm);
1861
out_free:
1862
free_bprm(bprm);
1863
1864
out_ret:
1865
putname(filename);
1866
return retval;
1867
}
1868
1869
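/*
 * In-kernel execve() for user-mode helpers: argv/envp are kernel
 * pointers and argv must contain at least one entry. Must not be called
 * from a kernel thread.
 */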
int kernel_execve(const char *kernel_filename,
1870
const char *const *argv, const char *const *envp)
1871
{
1872
struct filename *filename;
1873
struct linux_binprm *bprm;
1874
int fd = AT_FDCWD;
1875
int retval;
1876
1877
/* It is nonsense for kernel threads to call execve */
1878
if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
1879
return -EINVAL;
1880
1881
filename = getname_kernel(kernel_filename);
1882
if (IS_ERR(filename))
1883
return PTR_ERR(filename);
1884
1885
bprm = alloc_bprm(fd, filename, 0);
1886
if (IS_ERR(bprm)) {
1887
retval = PTR_ERR(bprm);
1888
goto out_ret;
1889
}
1890
1891
retval = count_strings_kernel(argv);
1892
if (WARN_ON_ONCE(retval == 0))
1893
retval = -EINVAL;
1894
if (retval < 0)
1895
goto out_free;
1896
bprm->argc = retval;
1897
1898
retval = count_strings_kernel(envp);
1899
if (retval < 0)
1900
goto out_free;
1901
bprm->envc = retval;
1902
1903
retval = bprm_stack_limits(bprm);
1904
if (retval < 0)
1905
goto out_free;
1906
1907
retval = copy_string_kernel(bprm->filename, bprm);
1908
if (retval < 0)
1909
goto out_free;
1910
bprm->exec = bprm->p;
1911
1912
retval = copy_strings_kernel(bprm->envc, envp, bprm);
1913
if (retval < 0)
1914
goto out_free;
1915
1916
retval = copy_strings_kernel(bprm->argc, argv, bprm);
1917
if (retval < 0)
1918
goto out_free;
1919
1920
retval = bprm_execve(bprm);
1921
out_free:
1922
free_bprm(bprm);
1923
out_ret:
1924
putname(filename);
1925
return retval;
1926
}
1927
1928
static int do_execve(struct filename *filename,
1929
const char __user *const __user *__argv,
1930
const char __user *const __user *__envp)
1931
{
1932
struct user_arg_ptr argv = { .ptr.native = __argv };
1933
struct user_arg_ptr envp = { .ptr.native = __envp };
1934
return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1935
}
1936
1937
static int do_execveat(int fd, struct filename *filename,
1938
const char __user *const __user *__argv,
1939
const char __user *const __user *__envp,
1940
int flags)
1941
{
1942
struct user_arg_ptr argv = { .ptr.native = __argv };
1943
struct user_arg_ptr envp = { .ptr.native = __envp };
1944
1945
return do_execveat_common(fd, filename, argv, envp, flags);
1946
}
1947
1948
#ifdef CONFIG_COMPAT
1949
static int compat_do_execve(struct filename *filename,
1950
const compat_uptr_t __user *__argv,
1951
const compat_uptr_t __user *__envp)
1952
{
1953
struct user_arg_ptr argv = {
1954
.is_compat = true,
1955
.ptr.compat = __argv,
1956
};
1957
struct user_arg_ptr envp = {
1958
.is_compat = true,
1959
.ptr.compat = __envp,
1960
};
1961
return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1962
}
1963
1964
static int compat_do_execveat(int fd, struct filename *filename,
1965
const compat_uptr_t __user *__argv,
1966
const compat_uptr_t __user *__envp,
1967
int flags)
1968
{
1969
struct user_arg_ptr argv = {
1970
.is_compat = true,
1971
.ptr.compat = __argv,
1972
};
1973
struct user_arg_ptr envp = {
1974
.is_compat = true,
1975
.ptr.compat = __envp,
1976
};
1977
return do_execveat_common(fd, filename, argv, envp, flags);
1978
}
1979
#endif
1980
1981
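/*
 * Record which binary format owns current->mm, moving the module
 * reference from the old format to the new one.
 */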
void set_binfmt(struct linux_binfmt *new)
1982
{
1983
struct mm_struct *mm = current->mm;
1984
1985
if (mm->binfmt)
1986
module_put(mm->binfmt->module);
1987
1988
mm->binfmt = new;
1989
if (new)
1990
__module_get(new->module);
1991
}
1992
EXPORT_SYMBOL(set_binfmt);
1993
1994
/*
1995
* set_dumpable stores three-value SUID_DUMP_* into mm->flags.
1996
*/
1997
void set_dumpable(struct mm_struct *mm, int value)
1998
{
1999
if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
2000
return;
2001
2002
__mm_flags_set_mask_dumpable(mm, value);
2003
}
2004
2005
SYSCALL_DEFINE3(execve,
2006
const char __user *, filename,
2007
const char __user *const __user *, argv,
2008
const char __user *const __user *, envp)
2009
{
2010
return do_execve(getname(filename), argv, envp);
2011
}
2012
2013
SYSCALL_DEFINE5(execveat,
2014
int, fd, const char __user *, filename,
2015
const char __user *const __user *, argv,
2016
const char __user *const __user *, envp,
2017
int, flags)
2018
{
2019
return do_execveat(fd,
2020
getname_uflags(filename, flags),
2021
argv, envp, flags);
2022
}
2023
2024
#ifdef CONFIG_COMPAT
2025
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
2026
const compat_uptr_t __user *, argv,
2027
const compat_uptr_t __user *, envp)
2028
{
2029
return compat_do_execve(getname(filename), argv, envp);
2030
}
2031
2032
COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
2033
const char __user *, filename,
2034
const compat_uptr_t __user *, argv,
2035
const compat_uptr_t __user *, envp,
2036
int, flags)
2037
{
2038
return compat_do_execveat(fd,
2039
getname_uflags(filename, flags),
2040
argv, envp, flags);
2041
}
2042
#endif
2043
2044
#ifdef CONFIG_SYSCTL
2045
2046
static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write,
2047
void *buffer, size_t *lenp, loff_t *ppos)
2048
{
2049
int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2050
2051
if (!error && !write)
2052
validate_coredump_safety();
2053
return error;
2054
}
2055
2056
static const struct ctl_table fs_exec_sysctls[] = {
2057
{
2058
.procname = "suid_dumpable",
2059
.data = &suid_dumpable,
2060
.maxlen = sizeof(int),
2061
.mode = 0644,
2062
.proc_handler = proc_dointvec_minmax_coredump,
2063
.extra1 = SYSCTL_ZERO,
2064
.extra2 = SYSCTL_TWO,
2065
},
2066
};
2067
2068
static int __init init_fs_exec_sysctls(void)
2069
{
2070
register_sysctl_init("fs", fs_exec_sysctls);
2071
return 0;
2072
}
2073
2074
fs_initcall(init_fs_exec_sysctls);
2075
#endif /* CONFIG_SYSCTL */
2076
2077
#ifdef CONFIG_EXEC_KUNIT_TEST
2078
#include "tests/exec_kunit.c"
2079
#endif
2080
2081