// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memfd.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"
#include "swap.h"

static inline void sanity_check_pinned_pages(struct page **pages,
					     unsigned long npages)
{
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return;

	/*
	 * We only pin anonymous pages if they are exclusive. Once pinned, we
	 * can no longer turn them possibly shared and PageAnonExclusive() will
	 * stick around until the page is freed.
	 *
	 * We'd like to verify that our pinned anonymous pages are still mapped
	 * exclusively. The issue with anon THP is that we don't know how
	 * they are/were mapped when pinning them. However, for anon
	 * THP we can assume that either the given page (PTE-mapped THP) or
	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
	 * neither is the case, there is certainly something wrong.
	 */
	for (; npages; npages--, pages++) {
		struct page *page = *pages;
		struct folio *folio;

		if (!page)
			continue;

		folio = page_folio(page);

		if (is_zero_page(page) ||
		    !folio_test_anon(folio))
			continue;
		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
			VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
		else
			/* Either a PTE-mapped or a PMD-mapped THP. */
			VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
					     !PageAnonExclusive(page), page);
	}
}

/*
 * Return the folio with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct folio *try_get_folio(struct page *page, int refs)
{
	struct folio *folio;

retry:
	folio = page_folio(page);
	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
		return NULL;
	if (unlikely(!folio_ref_try_add(folio, refs)))
		return NULL;

	/*
	 * At this point we have a stable reference to the folio; but it
	 * could be that between calling page_folio() and the refcount
	 * increment, the folio was split, in which case we'd end up
	 * holding a reference on a folio that has nothing to do with the page
	 * we were given anymore.
	 * So now that the folio is stable, recheck that the page still
	 * belongs to this folio.
	 */
	if (unlikely(page_folio(page) != folio)) {
		folio_put_refs(folio, refs);
		goto retry;
	}

	return folio;
}

static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
	if (flags & FOLL_PIN) {
		if (is_zero_folio(folio))
			return;
		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
		if (folio_has_pincount(folio))
			atomic_sub(refs, &folio->_pincount);
		else
			refs *= GUP_PIN_COUNTING_BIAS;
	}

	folio_put_refs(folio, refs);
}

/**
 * try_grab_folio() - add a folio's refcount by a flag-dependent amount
 * @folio:    pointer to folio to be grabbed
 * @refs:     the value to (effectively) add to the folio's refcount
 * @flags:    gup flags: these are the FOLL_* flag values
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time.
 *
 * Return: 0 for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). A negative error code for failure:
 *
 *   -ENOMEM		FOLL_GET or FOLL_PIN was set, but the folio could not
 *			be grabbed.
 *
 * It is called when we have a stable reference for the folio, typically in
 * GUP slow path.
 */
int __must_check try_grab_folio(struct folio *folio, int refs,
				unsigned int flags)
{
	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
		return -ENOMEM;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio)))
		return -EREMOTEIO;

	if (flags & FOLL_GET)
		folio_ref_add(folio, refs);
	else if (flags & FOLL_PIN) {
		/*
		 * Don't take a pin on the zero page - it's not going anywhere
		 * and it is used in a *lot* of places.
		 */
		if (is_zero_folio(folio))
			return 0;

		/*
		 * Increment the normal page refcount field at least once,
		 * so that the page really is pinned.
		 */
		if (folio_has_pincount(folio)) {
			folio_ref_add(folio, refs);
			atomic_add(refs, &folio->_pincount);
		} else {
			folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS);
		}

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
	}

	return 0;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	sanity_check_pinned_pages(&page, 1);
	gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);
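
/*
 * Illustrative sketch (not built as part of this file): the expected pairing
 * for a driver that pins user memory with pin_user_pages_fast(), lets a
 * device write into it, and then releases it. my_device_dma_write() is a
 * made-up placeholder; the pin/unpin calls are the real API described above.
 *
 *	static int example_dma_write(unsigned long uaddr, int nr_pages,
 *				     struct page **pages)
 *	{
 *		int pinned;
 *
 *		pinned = pin_user_pages_fast(uaddr, nr_pages,
 *					     FOLL_WRITE | FOLL_LONGTERM, pages);
 *		if (pinned <= 0)
 *			return pinned ? pinned : -EFAULT;
 *
 *		my_device_dma_write(pages, pinned);
 *
 *		// The device wrote to the pages: dirty and unpin them.
 *		unpin_user_pages_dirty_lock(pages, pinned, true);
 *		return 0;
 *	}
 */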

/**
 * unpin_folio() - release a dma-pinned folio
 * @folio:         pointer to folio to be released
 *
 * Folios that were pinned via memfd_pin_folios() or other similar routines
 * must be released either using unpin_folio() or unpin_folios().
 */
void unpin_folio(struct folio *folio)
{
	gup_put_folio(folio, 1, FOLL_PIN);
}
EXPORT_SYMBOL_GPL(unpin_folio);

/**
 * folio_add_pin - Try to get an additional pin on a pinned folio
 * @folio: The folio to be pinned
 *
 * Get an additional pin on a folio we already have a pin on. Makes no change
 * if the folio is a zero_page.
 */
void folio_add_pin(struct folio *folio)
{
	if (is_zero_folio(folio))
		return;

	/*
	 * Similar to try_grab_folio(): be sure to *also* increment the normal
	 * page refcount field at least once, so that the page really is
	 * pinned.
	 */
	if (folio_has_pincount(folio)) {
		WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
		folio_ref_inc(folio);
		atomic_inc(&folio->_pincount);
	} else {
		WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
		folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
	}
}

static inline struct folio *gup_folio_range_next(struct page *start,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct page *next = start + i;
	struct folio *folio = page_folio(next);
	unsigned int nr = 1;

	if (folio_test_large(folio))
		nr = min_t(unsigned int, npages - i,
			   folio_nr_pages(folio) - folio_page_idx(folio, next));

	*ntails = nr;
	return folio;
}

static inline struct folio *gup_folio_next(struct page **list,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct folio *folio = page_folio(list[i]);
	unsigned int nr;

	for (nr = i + 1; nr < npages; nr++) {
		if (page_folio(list[nr]) != folio)
			break;
	}

	*ntails = nr - i;
	return folio;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * folio_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);

/**
 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 * gup-pinned page range
 *
 * @page:  the starting page of a range maybe marked dirty, and definitely released.
 * @npages: number of consecutive pages to release.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page range" refers to a range of pages that has had one of the
 * pin_user_pages() variants called on that page.
 *
 * The page range must be truly physically contiguous: the page range
 * corresponds to a contiguous PFN range and all pages can be iterated
 * naturally.
 *
 * For the page ranges defined by [page .. page+npages], make that range (or
 * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
 * page range was previously listed as clean.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
				      bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	VM_WARN_ON_ONCE(!page_range_contiguous(page, npages));

	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_range_next(page, npages, i, &nr);
		if (make_dirty && !folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);

static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * Don't perform any sanity checks because we might have raced with
	 * fork() and some anonymous pages might now actually be shared --
	 * which is why we're unpinning after all.
	 */
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be marked dirty and released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
	 * leaving them pinned), but probably not. More likely, gup/pup returned
	 * a hard -ERRNO error to the caller, who erroneously passed it here.
	 */
	if (WARN_ON(IS_ERR_VALUE(npages)))
		return;

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		if (!pages[i]) {
			nr = 1;
			continue;
		}
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages);

/**
 * unpin_user_folio() - release pages of a folio
 * @folio:  pointer to folio to be released
 * @npages: number of pages of same folio
 *
 * Release npages of the folio
 */
void unpin_user_folio(struct folio *folio, unsigned long npages)
{
	gup_put_folio(folio, npages, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_folio);

/**
 * unpin_folios() - release an array of gup-pinned folios.
 * @folios:  array of folios to be marked dirty and released.
 * @nfolios: number of folios in the @folios array.
 *
 * For each folio in the @folios array, release the folio using gup_put_folio.
 *
 * Please see the unpin_folio() documentation for details.
 */
void unpin_folios(struct folio **folios, unsigned long nfolios)
{
	unsigned long i = 0, j;

	/*
	 * If this WARN_ON() fires, then the system *might* be leaking folios
	 * (by leaving them pinned), but probably not. More likely, gup/pup
	 * returned a hard -ERRNO error to the caller, who erroneously passed
	 * it here.
	 */
	if (WARN_ON(IS_ERR_VALUE(nfolios)))
		return;

	while (i < nfolios) {
		for (j = i + 1; j < nfolios; j++)
			if (folios[i] != folios[j])
				break;

		if (folios[i])
			gup_put_folio(folios[i], j - i, FOLL_PIN);
		i = j;
	}
}
EXPORT_SYMBOL_GPL(unpin_folios);
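
/*
 * Illustrative pairing for the folio-based pinning API above (a sketch in the
 * style of the in-tree udmabuf user; error handling is elided and the
 * authoritative memfd_pin_folios() prototype lives in <linux/memfd.h>):
 *
 *	pgoff_t offset;
 *	long nr;
 *
 *	nr = memfd_pin_folios(memfd, start, end, folios, max_folios, &offset);
 *	if (nr > 0) {
 *		// ... hand the folios to the device ...
 *		unpin_folios(folios, nr);
 *	}
 */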

/*
 * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
 * lifecycle. Avoid setting the bit unless necessary, or it might cause write
 * cache bouncing on large SMP machines for concurrent pinned gups.
 */
static inline void mm_set_has_pinned_flag(struct mm_struct *mm)
{
	if (!mm_flags_test(MMF_HAS_PINNED, mm))
		mm_flags_set(MMF_HAS_PINNED, mm);
}

#ifdef CONFIG_MMU

#ifdef CONFIG_HAVE_GUP_FAST
/**
 * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
 * @page:  pointer to page to be grabbed
 * @refs:  the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 *    @refs, and its pincount will be incremented by @refs.
 *
 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 *    @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted.
 *
 * It uses add ref unless zero to elevate the folio refcount and must be called
 * in fast path only.
 */
static struct folio *try_grab_folio_fast(struct page *page, int refs,
					 unsigned int flags)
{
	struct folio *folio;

	/* Raise warn if it is not called in fast GUP */
	VM_WARN_ON_ONCE(!irqs_disabled());

	if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
		return NULL;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
		return NULL;

	if (flags & FOLL_GET)
		return try_get_folio(page, refs);

	/* FOLL_PIN is set */

	/*
	 * Don't take a pin on the zero page - it's not going anywhere
	 * and it is used in a *lot* of places.
	 */
	if (is_zero_page(page))
		return page_folio(page);

	folio = try_get_folio(page, refs);
	if (!folio)
		return NULL;

	/*
	 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
	 * right zone, so fail and let the caller fall back to the slow
	 * path.
	 */
	if (unlikely((flags & FOLL_LONGTERM) &&
		     !folio_is_longterm_pinnable(folio))) {
		folio_put_refs(folio, refs);
		return NULL;
	}

	/*
	 * When pinning a large folio, use an exact count to track it.
	 *
	 * However, be sure to *also* increment the normal folio
	 * refcount field at least once, so that the folio really
	 * is pinned. That's why the refcount from the earlier
	 * try_get_folio() is left intact.
	 */
	if (folio_has_pincount(folio))
		atomic_add(refs, &folio->_pincount);
	else
		folio_ref_add(folio,
				refs * (GUP_PIN_COUNTING_BIAS - 1));
	/*
	 * Adjust the pincount before re-checking the PTE for changes.
	 * This is essentially a smp_mb() and is paired with a memory
	 * barrier in folio_try_share_anon_rmap_*().
	 */
	smp_mb__after_atomic();

	node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);

	return folio;
}
#endif	/* CONFIG_HAVE_GUP_FAST */
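
/*
 * Worked example of the two pin-accounting schemes used above (the numbers
 * assume the current GUP_PIN_COUNTING_BIAS of 1024 and are illustrative
 * only):
 *
 *	small folio, one FOLL_GET:		refcount += 1
 *	small folio, one FOLL_PIN:		refcount += 1024
 *	large folio with _pincount, FOLL_PIN
 *	of three pages (refs == 3):		refcount += 3, _pincount += 3
 *
 * This is why pin detection can only be fuzzy for small folios (enough
 * transient get_page() references could reach the bias), while for large
 * folios the dedicated _pincount field is exact.
 */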

/* Common code for can_follow_write_* */
static inline bool can_follow_write_common(struct page *page,
		struct vm_area_struct *vma, unsigned int flags)
{
	/* Maybe FOLL_FORCE is set to override it? */
	if (!(flags & FOLL_FORCE))
		return false;

	/* But FOLL_FORCE has no effect on shared mappings */
	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
		return false;

	/* ... or read-only private ones */
	if (!(vma->vm_flags & VM_MAYWRITE))
		return false;

	/* ... or already writable ones that just need to take a write fault */
	if (vma->vm_flags & VM_WRITE)
		return false;

	/*
	 * See can_change_pte_writable(): we broke COW and could map the page
	 * writable if we have an exclusive anonymous page ...
	 */
	return page && PageAnon(page) && PageAnonExclusive(page);
}

static struct page *no_page_table(struct vm_area_struct *vma,
				  unsigned int flags, unsigned long address)
{
	if (!(flags & FOLL_DUMP))
		return NULL;

	/*
	 * When core dumping, we don't want to allocate unnecessary pages or
	 * page tables. Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if (is_vm_hugetlb_page(vma)) {
		struct hstate *h = hstate_vma(vma);

		if (!hugetlbfs_pagecache_present(h, vma, address))
			return ERR_PTR(-EFAULT);
	} else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) {
		return ERR_PTR(-EFAULT);
	}

	return NULL;
}

#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
static inline bool can_follow_write_pud(pud_t pud, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pud is writable, we can write to the page. */
	if (pud_write(pud))
		return true;

	return can_follow_write_common(page, vma, flags);
}

static struct page *follow_huge_pud(struct vm_area_struct *vma,
				    unsigned long addr, pud_t *pudp,
				    int flags, unsigned long *page_mask)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	pud_t pud = *pudp;
	unsigned long pfn = pud_pfn(pud);
	int ret;

	assert_spin_locked(pud_lockptr(mm, pudp));

	if (!pud_present(pud))
		return NULL;

	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags))
		return NULL;

	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
	page = pfn_to_page(pfn);

	if (!pud_write(pud) && gup_must_unshare(vma, flags, page))
		return ERR_PTR(-EMLINK);

	ret = try_grab_folio(page_folio(page), 1, flags);
	if (ret)
		page = ERR_PTR(ret);
	else
		*page_mask = HPAGE_PUD_NR - 1;

	return page;
}

/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pmd is writable, we can write to the page. */
	if (pmd_write(pmd))
		return true;

	if (!can_follow_write_common(page, vma, flags))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (pmd_needs_soft_dirty_wp(vma, pmd))
		return false;
	return !userfaultfd_huge_pmd_wp(vma, pmd);
}

static struct page *follow_huge_pmd(struct vm_area_struct *vma,
				    unsigned long addr, pmd_t *pmd,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t pmdval = *pmd;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	page = pmd_page(pmdval);
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pmd(pmdval, page, vma, flags))
		return NULL;

	/* Avoid dumping huge zero page */
	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval))
		return ERR_PTR(-EFAULT);

	if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
		return NULL;

	if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
		return ERR_PTR(-EMLINK);

	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			     !PageAnonExclusive(page), page);

	ret = try_grab_folio(page_folio(page), 1, flags);
	if (ret)
		return ERR_PTR(ret);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH))
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
#endif	/* CONFIG_TRANSPARENT_HUGEPAGE */

	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
	*page_mask = HPAGE_PMD_NR - 1;

	return page;
}

#else  /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
				    unsigned long addr, pud_t *pudp,
				    int flags, unsigned long *page_mask)
{
	return NULL;
}

static struct page *follow_huge_pmd(struct vm_area_struct *vma,
				    unsigned long addr, pmd_t *pmd,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	return NULL;
}
#endif	/* CONFIG_PGTABLE_HAS_HUGE_LEAVES */

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	if (flags & FOLL_TOUCH) {
		pte_t orig_entry = ptep_get(pte);
		pte_t entry = orig_entry;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(orig_entry, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
static inline bool can_follow_write_pte(pte_t pte, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pte is writable, we can write to the page. */
	if (pte_write(pte))
		return true;

	if (!can_follow_write_common(page, vma, flags))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (pte_needs_soft_dirty_wp(vma, pte))
		return false;
	return !userfaultfd_pte_wp(vma, pte);
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags)
{
	struct mm_struct *mm = vma->vm_mm;
	struct folio *folio;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int ret;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!ptep)
		return no_page_table(vma, flags, address);
	pte = ptep_get(ptep);
	if (!pte_present(pte))
		goto no_page;
	if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
		goto no_page;

	page = vm_normal_page(vma, address, pte);

	/*
	 * We only care about anon pages in can_follow_write_pte().
	 */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pte(pte, page, vma, flags)) {
		page = NULL;
		goto out;
	}

	if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}
	folio = page_folio(page);

	if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
		page = ERR_PTR(-EMLINK);
		goto out;
	}

	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
			     !PageAnonExclusive(page), page);

	/* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
	ret = try_grab_folio(folio, 1, flags);
	if (unlikely(ret)) {
		page = ERR_PTR(ret);
		goto out;
	}

	/*
	 * We need to make the page accessible if and only if we are going
	 * to access its content (the FOLL_PIN case). Please see
	 * Documentation/core-api/pin_user_pages.rst for details.
	 */
	if (flags & FOLL_PIN) {
		ret = arch_make_folio_accessible(folio);
		if (ret) {
			unpin_user_page(page);
			page = ERR_PTR(ret);
			goto out;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !folio_test_dirty(folio))
			folio_mark_dirty(folio);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * folio_mark_accessed().
		 */
		folio_mark_accessed(folio);
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags, address);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	pmdval = pmdp_get_lockless(pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags, address);
	if (!pmd_present(pmdval))
		return no_page_table(vma, flags, address);
	if (likely(!pmd_leaf(pmdval)))
		return follow_page_pte(vma, address, pmd, flags);

	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
		return no_page_table(vma, flags, address);

	ptl = pmd_lock(mm, pmd);
	pmdval = *pmd;
	if (unlikely(!pmd_present(pmdval))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags, address);
	}
	if (unlikely(!pmd_leaf(pmdval))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags);
	}
	if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
		spin_unlock(ptl);
		split_huge_pmd(vma, pmd, address);
		/* If pmd was left empty, stuff a page table in there quickly */
		return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
			follow_page_pte(vma, address, pmd, flags);
	}
	page = follow_huge_pmd(vma, address, pmd, flags, page_mask);
	spin_unlock(ptl);
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	pud_t *pudp, pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pudp = pud_offset(p4dp, address);
	pud = READ_ONCE(*pudp);
	if (!pud_present(pud))
		return no_page_table(vma, flags, address);
	if (pud_leaf(pud)) {
		ptl = pud_lock(mm, pudp);
		page = follow_huge_pud(vma, address, pudp, flags, page_mask);
		spin_unlock(ptl);
		if (page)
			return page;
		return no_page_table(vma, flags, address);
	}
	if (unlikely(pud_bad(pud)))
		return no_page_table(vma, flags, address);

	return follow_pmd_mask(vma, address, pudp, flags, page_mask);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    unsigned long *page_mask)
{
	p4d_t *p4dp, p4d;

	p4dp = p4d_offset(pgdp, address);
	p4d = READ_ONCE(*p4dp);
	BUILD_BUG_ON(p4d_leaf(p4d));

	if (!p4d_present(p4d) || p4d_bad(p4d))
		return no_page_table(vma, flags, address);

	return follow_pud_mask(vma, address, p4dp, flags, page_mask);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @page_mask: a pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting an anonymous page and the caller has to trigger unsharing
 * of a shared anonymous page first, -EMLINK is returned. The caller should
 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
 * relevant with FOLL_PIN and !FOLL_WRITE.
 *
 * On output, @page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
				     unsigned long address, unsigned int flags,
				     unsigned long *page_mask)
{
	pgd_t *pgd;
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;

	vma_pgtable_walk_begin(vma);

	*page_mask = 0;
	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		page = no_page_table(vma, flags, address);
	else
		page = follow_p4d_mask(vma, address, pgd, flags, page_mask);

	vma_pgtable_walk_end(vma);

	return page;
}

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t entry;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	pte = pte_offset_map(pmd, address);
	if (!pte)
		return -EFAULT;
	entry = ptep_get(pte);
	if (pte_none(entry))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, entry);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
			goto unmap;
		*page = pte_page(entry);
	}
	ret = try_grab_folio(page_folio(*page), 1, gup_flags);
	if (unlikely(ret))
		goto unmap;
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not
 * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set
 * to 0 and -EBUSY returned.
 */
static int faultin_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags, bool unshare,
		int *locked)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	if (flags & FOLL_NOFAULT)
		return -EFAULT;
	if (flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (flags & FOLL_UNLOCKABLE) {
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
		/*
		 * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
		 * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
		 * That's because some callers may not be prepared to
		 * handle early exits caused by non-fatal signals.
		 */
		if (flags & FOLL_INTERRUPTIBLE)
			fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
	}
	if (flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (flags & FOLL_TRIED) {
		/*
		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
		 * can co-exist
		 */
		fault_flags |= FAULT_FLAG_TRIED;
	}
	if (unshare) {
		fault_flags |= FAULT_FLAG_UNSHARE;
		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
	}

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
		 * mmap lock in the page fault handler. Sanity check this.
		 */
		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
		*locked = 0;

		/*
		 * We should do the same as VM_FAULT_RETRY, but let's not
		 * return -EBUSY since that's not reflecting the reality of
		 * what has happened - we've just fully completed a page
		 * fault, with the mmap lock released. Use -EAGAIN to show
		 * that we want to take the mmap lock _again_.
		 */
		return -EAGAIN;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, flags);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*locked = 0;
		return -EBUSY;
	}

	return 0;
}

/*
 * Writing to file-backed mappings which require folio dirty tracking using GUP
 * is a fundamentally broken operation, as kernel write access to GUP mappings
 * do not adhere to the semantics expected by a file system.
 *
 * Consider the following scenario:-
 *
 * 1. A folio is written to via GUP which write-faults the memory, notifying
 *    the file system and dirtying the folio.
 * 2. Later, writeback is triggered, resulting in the folio being cleaned and
 *    the PTE being marked read-only.
 * 3. The GUP caller writes to the folio, as it is mapped read/write via the
 *    direct mapping.
 * 4. The GUP caller, now done with the page, unpins it and sets it dirty
 *    (though it does not have to).
 *
 * This results in both data being written to a folio without writenotify, and
 * the folio being dirtied unexpectedly (if the caller decides to do so).
 */
static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
					  unsigned long gup_flags)
{
	/*
	 * If we aren't pinning then no problematic write can occur. A long term
	 * pin is the most egregious case so this is the case we disallow.
	 */
	if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
	    (FOLL_PIN | FOLL_LONGTERM))
		return true;

	/*
	 * If the VMA does not require dirty tracking then no problematic write
	 * can occur either.
	 */
	return !vma_needs_dirty_tracking(vma);
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);
	bool vma_anon = vma_is_anonymous(vma);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if ((gup_flags & FOLL_ANON) && !vma_anon)
		return -EFAULT;

	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
		return -EOPNOTSUPP;

	if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
		return -EOPNOTSUPP;

	if (vma_is_secretmem(vma))
		return -EFAULT;

	if (write) {
		if (!vma_anon &&
		    !writable_file_mapping_allowed(vma, gup_flags))
			return -EFAULT;

		if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/*
 * This is "vma_lookup()", but with a warning if we would have
 * historically expanded the stack in the GUP code.
 */
static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm,
					     unsigned long addr)
{
#ifdef CONFIG_STACK_GROWSUP
	return vma_lookup(mm, addr);
#else
	static volatile unsigned long next_warn;
	struct vm_area_struct *vma;
	unsigned long now, next;

	vma = find_vma(mm, addr);
	if (!vma || (addr >= vma->vm_start))
		return vma;

	/* Only warn for half-way relevant accesses */
	if (!(vma->vm_flags & VM_GROWSDOWN))
		return NULL;
	if (vma->vm_start - addr > 65536)
		return NULL;

	/* Let's not warn more than once an hour.. */
	now = jiffies; next = next_warn;
	if (next && time_before(now, next))
		return NULL;
	next_warn = now + 60*60*HZ;

	/* Let people know things may have changed. */
	pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n",
		current->comm, task_pid_nr(current),
		vma->vm_start, vma->vm_end, addr);
	dump_stack();
	return NULL;
#endif
}

/**
 * __get_user_pages() - pin user pages in memory
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @locked:	whether we're still with the mmap_lock held
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 * -- 0 return value is possible when the fault would need to be retried.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * Must be called with mmap_lock held. It may be released. See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
 * be released. If this happens *@locked will be set to 0 on return.
 *
 * A caller using such a combination of @gup_flags must therefore hold the
 * mmap_lock for reading only, and recognize when it's been released. Otherwise,
 * it must be held for either reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		int *locked)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	unsigned long page_mask = 0;

	if (!nr_pages)
		return 0;

	start = untagged_addr_remote(mm, start);

	VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
			(FOLL_PIN | FOLL_GET));

	do {
		struct page *page;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			/*
			 * MADV_POPULATE_(READ|WRITE) wants to handle VMA
			 * lookups+error reporting differently.
			 */
			if (gup_flags & FOLL_MADV_POPULATE) {
				vma = vma_lookup(mm, start);
				if (!vma) {
					ret = -ENOMEM;
					goto out;
				}
				if (check_vma_flags(vma, gup_flags)) {
					ret = -EINVAL;
					goto out;
				}
				goto retry;
			}
			vma = gup_vma_lookup(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &page : NULL);
				if (ret)
					goto out;
				page_mask = 0;
				goto next_page;
			}

			if (!vma) {
				ret = -EFAULT;
				goto out;
			}
			ret = check_vma_flags(vma, gup_flags);
			if (ret)
				goto out;
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, gup_flags, &page_mask);
		if (!page || PTR_ERR(page) == -EMLINK) {
			ret = faultin_page(vma, start, gup_flags,
					   PTR_ERR(page) == -EMLINK, locked);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
			case -EAGAIN:
				ret = 0;
				fallthrough;
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page. If the caller expects **pages to be
			 * filled in, bail out now, because that can't be done
			 * for this page.
			 */
			if (pages) {
				ret = PTR_ERR(page);
				goto out;
			}
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
next_page:
		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;

		if (pages) {
			struct page *subpage;
			unsigned int j;

			/*
			 * This must be a large folio (and doesn't need to
			 * be the whole folio; it can be part of it), do
			 * the refcount work for all the subpages too.
			 *
			 * NOTE: here the page may not be the head page
			 * e.g. when start addr is not thp-size aligned.
			 * try_grab_folio() should have taken care of tail
			 * pages.
			 */
			if (page_increm > 1) {
				struct folio *folio = page_folio(page);

				/*
				 * Since we already hold refcount on the
				 * large folio, this should never fail.
				 */
				if (try_grab_folio(folio, page_increm - 1,
						   gup_flags)) {
					/*
					 * Release the 1st page ref if the
					 * folio is problematic, fail hard.
					 */
					gup_put_folio(folio, 1, gup_flags);
					ret = -EFAULT;
					goto out;
				}
			}

			for (j = 0; j < page_increm; j++) {
				subpage = page + j;
				pages[i + j] = subpage;
				flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
				flush_dcache_page(subpage);
			}
		}

		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	return i ? i : ret;
}

static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/**
 * fixup_user_fault() - manually resolve a user page fault
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags: flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_lock while retrying, maybe NULL if caller
 *		does not allow retry. If NULL, the caller must guarantee
 *		that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software. On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_lock. So it has not the
 * same semantics wrt the @mm->mmap_lock as does filemap_fault().
 */
int fixup_user_fault(struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags,
		     bool *unlocked)
{
	struct vm_area_struct *vma;
	vm_fault_t ret;

	address = untagged_addr_remote(mm, address);

	if (unlocked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

retry:
	vma = gup_vma_lookup(mm, address);
	if (!vma)
		return -EFAULT;

	if (!vma_permits_fault(vma, fault_flags))
		return -EFAULT;

	if ((fault_flags & FAULT_FLAG_KILLABLE) &&
	    fatal_signal_pending(current))
		return -EINTR;

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * NOTE: it's a pity that we need to retake the lock here
		 * to pair with the unlock() in the callers. Ideally we
		 * could tell the callers so they do not need to unlock.
		 */
		mmap_read_lock(mm);
		*unlocked = true;
		return 0;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, 0);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		mmap_read_lock(mm);
		*unlocked = true;
		fault_flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
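
/*
 * Typical fixup_user_fault() call pattern (a sketch of the futex-style usage
 * mentioned above; "uaddr" is the user address that faulted while running
 * under pagefault_disable(), and the variable names are illustrative):
 *
 *	bool unlocked = false;
 *	int ret;
 *
 *	mmap_read_lock(mm);
 *	ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE, &unlocked);
 *	mmap_read_unlock(mm);
 *
 * On success the atomic user access can simply be retried; whether the lock
 * was temporarily dropped during the fault is reported through "unlocked".
 */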

/*
 * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is
 * specified, it'll also respond to generic signals. The caller of GUP
 * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
 */
static bool gup_signal_pending(unsigned int flags)
{
	if (fatal_signal_pending(current))
		return true;

	if (!(flags & FOLL_INTERRUPTIBLE))
		return false;

	return signal_pending(current);
}

/*
 * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
 * the caller. This function may drop the mmap_lock. If it does so, then it will
 * set (*locked = 0).
 *
 * (*locked == 0) means that the caller expects this function to acquire and
 * drop the mmap_lock. Therefore, the value of *locked will still be zero when
 * the function returns, even though it may have changed temporarily during
 * function execution.
 *
 * Please note that this function, unlike __get_user_pages(), will not return 0
 * for nr_pages > 0, unless FOLL_NOWAIT is used.
 */
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
						unsigned long start,
						unsigned long nr_pages,
						struct page **pages,
						int *locked,
						unsigned int flags)
{
	long ret, pages_done;
	bool must_unlock = false;

	if (!nr_pages)
		return 0;

	/*
	 * The internal caller expects GUP to manage the lock internally and the
	 * lock must be released when this returns.
	 */
	if (!*locked) {
		if (mmap_read_lock_killable(mm))
			return -EAGAIN;
		must_unlock = true;
		*locked = 1;
	}
	else
		mmap_assert_locked(mm);

	if (flags & FOLL_PIN)
		mm_set_has_pinned_flag(mm);

	/*
	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
	 * for FOLL_GET, not for the newer FOLL_PIN.
	 *
	 * FOLL_PIN always expects pages to be non-null, but no need to assert
	 * that here, as any failures will be obvious enough.
	 */
	if (pages && !(flags & FOLL_PIN))
		flags |= FOLL_GET;

	pages_done = 0;
	for (;;) {
		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
				       locked);
		if (!(flags & FOLL_UNLOCKABLE)) {
			/* VM_FAULT_RETRY couldn't trigger, bypass */
			pages_done = ret;
			break;
		}

		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
		VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/*
			 * VM_FAULT_RETRY didn't trigger or it was a
			 * FOLL_NOWAIT.
			 */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/*
		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
		 * For the prefault case (!pages) we only update counts.
		 */
		if (likely(pages))
			pages += ret;
		start += ret << PAGE_SHIFT;

		/* The lock was temporarily dropped, so we must unlock later */
		must_unlock = true;

retry:
		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * with both FAULT_FLAG_ALLOW_RETRY and
		 * FAULT_FLAG_TRIED. Note that GUP can be interrupted
		 * by fatal signals of even common signals, depending on
		 * the caller's request. So we need to check it before we
		 * start trying again otherwise it can loop forever.
		 */
		if (gup_signal_pending(flags)) {
			if (!pages_done)
				pages_done = -EINTR;
			break;
		}

		ret = mmap_read_lock_killable(mm);
		if (ret) {
			if (!pages_done)
				pages_done = ret;
			break;
		}

		*locked = 1;
		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
				       pages, locked);
		if (!*locked) {
			/* Continue to retry until we succeeded */
			VM_WARN_ON_ONCE(ret != 0);
			goto retry;
		}
		if (ret != 1) {
			VM_WARN_ON_ONCE(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		if (likely(pages))
			pages++;
		start += PAGE_SIZE;
	}
	if (must_unlock && *locked) {
		/*
		 * We either temporarily dropped the lock, or the caller
		 * requested that we both acquire and drop the lock. Either way,
		 * we must now unlock, and notify the caller of that state.
		 */
		mmap_read_unlock(mm);
		*locked = 0;
	}

	/*
	 * Failing to pin anything implies something has gone wrong (except when
	 * FOLL_NOWAIT is specified).
	 */
	if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT)))
		return -EFAULT;

	return pages_done;
}
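
/*
 * Sketch of the *locked contract enforced above, seen from a caller that
 * starts out holding the mmap_lock (get_user_pages_remote() is one such
 * caller; the variable names are illustrative):
 *
 *	int locked = 1;
 *
 *	mmap_read_lock(mm);
 *	ret = get_user_pages_remote(mm, addr, 1, FOLL_WRITE, &page, &locked);
 *	if (locked)
 *		mmap_read_unlock(mm);
 *	// else: the lock was dropped while a fault was being retried
 */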

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @locked: whether the mmap_lock is still held
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * Return either number of pages pinned in the vma, or a negative error
 * code on error.
 *
 * vma->vm_mm->mmap_lock must be held.
 *
 * If @locked is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @locked is non-NULL, it must be held for read only and may be
 * released. If it's released, *@locked will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *locked)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int local_locked = 1;
	int gup_flags;
	long ret;

	VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
	VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
	VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma);
	VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma);
	mmap_assert_locked(mm);

	/*
	 * Rightly or wrongly, the VM_LOCKONFAULT case has never used
	 * faultin_page() to break COW, so it has no work to do here.
	 */
	if (vma->vm_flags & VM_LOCKONFAULT)
		return nr_pages;

	/* ... similarly, we've never faulted in PROT_NONE pages */
	if (!vma_is_accessible(vma))
		return -EFAULT;

	gup_flags = FOLL_TOUCH;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't COW
	 * and we would not want to dirty them for nothing.
	 *
	 * Otherwise, do a read fault, and use FOLL_FORCE in case it's not
	 * readable (ie write-only or executable).
	 */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;
	else
		gup_flags |= FOLL_FORCE;

	if (locked)
		gup_flags |= FOLL_UNLOCKABLE;

	/*
	 * We made sure addr is within a VMA, so the following will
	 * not result in a stack expansion that recurses back here.
	 */
	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
			       NULL, locked ? locked : &local_locked);
	lru_add_drain();
	return ret;
}

/*
 * faultin_page_range() - populate (prefault) page tables inside the
 *			  given range readable/writable
 *
 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
 *
 * @mm: the mm to populate page tables in
 * @start: start address
 * @end: end address
 * @write: whether to prefault readable or writable
 * @locked: whether the mmap_lock is still held
 *
 * Returns either number of processed pages in the MM, or a negative error
 * code on error (see __get_user_pages()). Note that this function reports
 * errors related to VMAs, such as incompatible mappings, as expected by
 * MADV_POPULATE_(READ|WRITE).
 *
 * The range must be page-aligned.
 *
 * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
 */
long faultin_page_range(struct mm_struct *mm, unsigned long start,
			unsigned long end, bool write, int *locked)
{
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int gup_flags;
	long ret;

	VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
	VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
	mmap_assert_locked(mm);

	/*
	 * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
	 *	       the page dirty with FOLL_WRITE -- which doesn't make a
	 *	       difference with !FOLL_FORCE, because the page is writable
	 *	       in the page table.
	 * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
	 *		  a poisoned page.
	 * !FOLL_FORCE: Require proper access permissions.
	 */
	gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
		    FOLL_MADV_POPULATE;
	if (write)
		gup_flags |= FOLL_WRITE;

	ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
				      gup_flags);
	lru_add_drain();
	return ret;
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_lock must not be held.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	unsigned long end, nstart, nend;
	struct vm_area_struct *vma = NULL;
	int locked = 0;
	long ret = 0;

	end = start + len;

	for (nstart = start; nstart < end; nstart = nend) {
		/*
		 * We want to fault in pages for [nstart; end) address range.
		 * Find first corresponding VMA.
		 */
		if (!locked) {
			locked = 1;
			mmap_read_lock(mm);
			vma = find_vma_intersection(mm, nstart, end);
		} else if (nstart >= vma->vm_end)
			vma = find_vma_intersection(mm, vma->vm_end, end);

		if (!vma)
			break;
		/*
		 * Set [nstart; nend) to intersection of desired address
		 * range with the first VMA. Also, skip undesirable VMA types.
		 */
		nend = min(end, vma->vm_end);
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			continue;
		if (nstart < vma->vm_start)
			nstart = vma->vm_start;
		/*
		 * Now fault in a range of pages. populate_vma_page_range()
populate_vma_page_range()1959* double checks the vma flags, so that it won't mlock pages1960* if the vma was already munlocked.1961*/1962ret = populate_vma_page_range(vma, nstart, nend, &locked);1963if (ret < 0) {1964if (ignore_errors) {1965ret = 0;1966continue; /* continue at next VMA */1967}1968break;1969}1970nend = nstart + ret * PAGE_SIZE;1971ret = 0;1972}1973if (locked)1974mmap_read_unlock(mm);1975return ret; /* 0 or negative error code */1976}1977#else /* CONFIG_MMU */1978static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,1979unsigned long nr_pages, struct page **pages,1980int *locked, unsigned int foll_flags)1981{1982struct vm_area_struct *vma;1983bool must_unlock = false;1984vm_flags_t vm_flags;1985long i;19861987if (!nr_pages)1988return 0;19891990/*1991* The internal caller expects GUP to manage the lock internally and the1992* lock must be released when this returns.1993*/1994if (!*locked) {1995if (mmap_read_lock_killable(mm))1996return -EAGAIN;1997must_unlock = true;1998*locked = 1;1999}20002001/* calculate required read or write permissions.2002* If FOLL_FORCE is set, we only require the "MAY" flags.2003*/2004vm_flags = (foll_flags & FOLL_WRITE) ?2005(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);2006vm_flags &= (foll_flags & FOLL_FORCE) ?2007(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);20082009for (i = 0; i < nr_pages; i++) {2010vma = find_vma(mm, start);2011if (!vma)2012break;20132014/* protect what we can, including chardevs */2015if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||2016!(vm_flags & vma->vm_flags))2017break;20182019if (pages) {2020pages[i] = virt_to_page((void *)start);2021if (pages[i])2022get_page(pages[i]);2023}20242025start = (start + PAGE_SIZE) & PAGE_MASK;2026}20272028if (must_unlock && *locked) {2029mmap_read_unlock(mm);2030*locked = 0;2031}20322033return i ? : -EFAULT;2034}2035#endif /* !CONFIG_MMU */20362037/**2038* fault_in_writeable - fault in userspace address range for writing2039* @uaddr: start of address range2040* @size: size of address range2041*2042* Returns the number of bytes not faulted in (like copy_to_user() and2043* copy_from_user()).2044*/2045size_t fault_in_writeable(char __user *uaddr, size_t size)2046{2047const unsigned long start = (unsigned long)uaddr;2048const unsigned long end = start + size;2049unsigned long cur;20502051if (unlikely(size == 0))2052return 0;2053if (!user_write_access_begin(uaddr, size))2054return size;20552056/* Stop once we overflow to 0. */2057for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))2058unsafe_put_user(0, (char __user *)cur, out);2059out:2060user_write_access_end();2061if (size > cur - start)2062return size - (cur - start);2063return 0;2064}2065EXPORT_SYMBOL(fault_in_writeable);20662067/**2068* fault_in_subpage_writeable - fault in an address range for writing2069* @uaddr: start of address range2070* @size: size of address range2071*2072* Fault in a user address range for writing while checking for permissions at2073* sub-page granularity (e.g. arm64 MTE). This function should be used when2074* the caller cannot guarantee forward progress of a copy_to_user() loop.2075*2076* Returns the number of bytes not faulted in (like copy_to_user() and2077* copy_from_user()).2078*/2079size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)2080{2081size_t faulted_in;20822083/*2084* Attempt faulting in at page granularity first for page table2085* permission checking. 
The arch-specific probe_subpage_writeable()2086* functions may not check for this.2087*/2088faulted_in = size - fault_in_writeable(uaddr, size);2089if (faulted_in)2090faulted_in -= probe_subpage_writeable(uaddr, faulted_in);20912092return size - faulted_in;2093}2094EXPORT_SYMBOL(fault_in_subpage_writeable);20952096/*2097* fault_in_safe_writeable - fault in an address range for writing2098* @uaddr: start of address range2099* @size: length of address range2100*2101* Faults in an address range for writing. This is primarily useful when we2102* already know that some or all of the pages in the address range aren't in2103* memory.2104*2105* Unlike fault_in_writeable(), this function is non-destructive.2106*2107* Note that we don't pin or otherwise hold the pages referenced that we fault2108* in. There's no guarantee that they'll stay in memory for any duration of2109* time.2110*2111* Returns the number of bytes not faulted in, like copy_to_user() and2112* copy_from_user().2113*/2114size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)2115{2116const unsigned long start = (unsigned long)uaddr;2117const unsigned long end = start + size;2118unsigned long cur;2119struct mm_struct *mm = current->mm;2120bool unlocked = false;21212122if (unlikely(size == 0))2123return 0;21242125mmap_read_lock(mm);2126/* Stop once we overflow to 0. */2127for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))2128if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked))2129break;2130mmap_read_unlock(mm);21312132if (size > cur - start)2133return size - (cur - start);2134return 0;2135}2136EXPORT_SYMBOL(fault_in_safe_writeable);21372138/**2139* fault_in_readable - fault in userspace address range for reading2140* @uaddr: start of user address range2141* @size: size of user address range2142*2143* Returns the number of bytes not faulted in (like copy_to_user() and2144* copy_from_user()).2145*/2146size_t fault_in_readable(const char __user *uaddr, size_t size)2147{2148const unsigned long start = (unsigned long)uaddr;2149const unsigned long end = start + size;2150unsigned long cur;2151volatile char c;21522153if (unlikely(size == 0))2154return 0;2155if (!user_read_access_begin(uaddr, size))2156return size;21572158/* Stop once we overflow to 0. */2159for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE))2160unsafe_get_user(c, (const char __user *)cur, out);2161out:2162user_read_access_end();2163(void)c;2164if (size > cur - start)2165return size - (cur - start);2166return 0;2167}2168EXPORT_SYMBOL(fault_in_readable);21692170/**2171* get_dump_page() - pin user page in memory while writing it to core dump2172* @addr: user address2173* @locked: a pointer to an int denoting whether the mmap sem is held2174*2175* Returns struct page pointer of user page pinned for dump,2176* to be freed afterwards by put_page().2177*2178* Returns NULL on any kind of failure - a hole must then be inserted into2179* the corefile, to preserve alignment with its headers; and also returns2180* NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -2181* allowing a hole to be left in the corefile to save disk space.2182*2183* Called without mmap_lock (takes and releases the mmap_lock by itself).2184*/2185#ifdef CONFIG_ELF_CORE2186struct page *get_dump_page(unsigned long addr, int *locked)2187{2188struct page *page;2189int ret;21902191ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,2192FOLL_FORCE | FOLL_DUMP | FOLL_GET);2193return (ret == 1) ? 
page : NULL;2194}2195#endif /* CONFIG_ELF_CORE */21962197#ifdef CONFIG_MIGRATION21982199/*2200* An array of either pages or folios ("pofs"). Although it may seem tempting to2201* avoid this complication, by simply interpreting a list of folios as a list of2202* pages, that approach won't work in the longer term, because eventually the2203* layouts of struct page and struct folio will become completely different.2204* Furthermore, this pof approach avoids excessive page_folio() calls.2205*/2206struct pages_or_folios {2207union {2208struct page **pages;2209struct folio **folios;2210void **entries;2211};2212bool has_folios;2213long nr_entries;2214};22152216static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)2217{2218if (pofs->has_folios)2219return pofs->folios[i];2220return page_folio(pofs->pages[i]);2221}22222223static void pofs_clear_entry(struct pages_or_folios *pofs, long i)2224{2225pofs->entries[i] = NULL;2226}22272228static void pofs_unpin(struct pages_or_folios *pofs)2229{2230if (pofs->has_folios)2231unpin_folios(pofs->folios, pofs->nr_entries);2232else2233unpin_user_pages(pofs->pages, pofs->nr_entries);2234}22352236static struct folio *pofs_next_folio(struct folio *folio,2237struct pages_or_folios *pofs, long *index_ptr)2238{2239long i = *index_ptr + 1;22402241if (!pofs->has_folios && folio_test_large(folio)) {2242const unsigned long start_pfn = folio_pfn(folio);2243const unsigned long end_pfn = start_pfn + folio_nr_pages(folio);22442245for (; i < pofs->nr_entries; i++) {2246unsigned long pfn = page_to_pfn(pofs->pages[i]);22472248/* Is this page part of this folio? */2249if (pfn < start_pfn || pfn >= end_pfn)2250break;2251}2252}22532254if (unlikely(i == pofs->nr_entries))2255return NULL;2256*index_ptr = i;22572258return pofs_get_folio(pofs, i);2259}22602261/*2262* Returns the number of collected folios. 
Return value is always >= 0.2263*/2264static unsigned long collect_longterm_unpinnable_folios(2265struct list_head *movable_folio_list,2266struct pages_or_folios *pofs)2267{2268unsigned long collected = 0;2269struct folio *folio;2270int drained = 0;2271long i = 0;22722273for (folio = pofs_get_folio(pofs, i); folio;2274folio = pofs_next_folio(folio, pofs, &i)) {22752276if (folio_is_longterm_pinnable(folio))2277continue;22782279collected++;22802281if (folio_is_device_coherent(folio))2282continue;22832284if (folio_test_hugetlb(folio)) {2285folio_isolate_hugetlb(folio, movable_folio_list);2286continue;2287}22882289if (drained == 0 && folio_may_be_lru_cached(folio) &&2290folio_ref_count(folio) !=2291folio_expected_ref_count(folio) + 1) {2292lru_add_drain();2293drained = 1;2294}2295if (drained == 1 && folio_may_be_lru_cached(folio) &&2296folio_ref_count(folio) !=2297folio_expected_ref_count(folio) + 1) {2298lru_add_drain_all();2299drained = 2;2300}23012302if (!folio_isolate_lru(folio))2303continue;23042305list_add_tail(&folio->lru, movable_folio_list);2306node_stat_mod_folio(folio,2307NR_ISOLATED_ANON + folio_is_file_lru(folio),2308folio_nr_pages(folio));2309}23102311return collected;2312}23132314/*2315* Unpins all folios and migrates device coherent folios and movable_folio_list.2316* Returns -EAGAIN if all folios were successfully migrated or -errno for2317* failure (or partial success).2318*/2319static int2320migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,2321struct pages_or_folios *pofs)2322{2323int ret;2324unsigned long i;23252326for (i = 0; i < pofs->nr_entries; i++) {2327struct folio *folio = pofs_get_folio(pofs, i);23282329if (folio_is_device_coherent(folio)) {2330/*2331* Migration will fail if the folio is pinned, so2332* convert the pin on the source folio to a normal2333* reference.2334*/2335pofs_clear_entry(pofs, i);2336folio_get(folio);2337gup_put_folio(folio, 1, FOLL_PIN);23382339if (migrate_device_coherent_folio(folio)) {2340ret = -EBUSY;2341goto err;2342}23432344continue;2345}23462347/*2348* We can't migrate folios with unexpected references, so drop2349* the reference obtained by __get_user_pages_locked().2350* Migrating folios have been added to movable_folio_list after2351* calling folio_isolate_lru() which takes a reference so the2352* folio won't be freed if it's migrating.2353*/2354unpin_folio(folio);2355pofs_clear_entry(pofs, i);2356}23572358if (!list_empty(movable_folio_list)) {2359struct migration_target_control mtc = {2360.nid = NUMA_NO_NODE,2361.gfp_mask = GFP_USER | __GFP_NOWARN,2362.reason = MR_LONGTERM_PIN,2363};23642365if (migrate_pages(movable_folio_list, alloc_migration_target,2366NULL, (unsigned long)&mtc, MIGRATE_SYNC,2367MR_LONGTERM_PIN, NULL)) {2368ret = -ENOMEM;2369goto err;2370}2371}23722373putback_movable_pages(movable_folio_list);23742375return -EAGAIN;23762377err:2378pofs_unpin(pofs);2379putback_movable_pages(movable_folio_list);23802381return ret;2382}23832384static long2385check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)2386{2387LIST_HEAD(movable_folio_list);2388unsigned long collected;23892390collected = collect_longterm_unpinnable_folios(&movable_folio_list,2391pofs);2392if (!collected)2393return 0;23942395return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);2396}23972398/*2399* Check whether all folios are *allowed* to be pinned indefinitely (long term).2400* Rather confusingly, all folios in the range are required to be pinned via2401* FOLL_PIN, before calling this routine.2402*2403* 
Return values:2404*2405* 0: if everything is OK and all folios in the range are allowed to be pinned,2406* then this routine leaves all folios pinned and returns zero for success.2407*2408* -EAGAIN: if any folios in the range are not allowed to be pinned, then this2409* routine will migrate those folios away, unpin all the folios in the range. If2410* migration of the entire set of folios succeeds, then -EAGAIN is returned. The2411* caller should re-pin the entire range with FOLL_PIN and then call this2412* routine again.2413*2414* -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this2415* indicates a migration failure. The caller should give up, and propagate the2416* error back up the call stack. The caller does not need to unpin any folios in2417* that case, because this routine will do the unpinning.2418*/2419static long check_and_migrate_movable_folios(unsigned long nr_folios,2420struct folio **folios)2421{2422struct pages_or_folios pofs = {2423.folios = folios,2424.has_folios = true,2425.nr_entries = nr_folios,2426};24272428return check_and_migrate_movable_pages_or_folios(&pofs);2429}24302431/*2432* Return values and behavior are the same as those for2433* check_and_migrate_movable_folios().2434*/2435static long check_and_migrate_movable_pages(unsigned long nr_pages,2436struct page **pages)2437{2438struct pages_or_folios pofs = {2439.pages = pages,2440.has_folios = false,2441.nr_entries = nr_pages,2442};24432444return check_and_migrate_movable_pages_or_folios(&pofs);2445}2446#else2447static long check_and_migrate_movable_pages(unsigned long nr_pages,2448struct page **pages)2449{2450return 0;2451}24522453static long check_and_migrate_movable_folios(unsigned long nr_folios,2454struct folio **folios)2455{2456return 0;2457}2458#endif /* CONFIG_MIGRATION */24592460/*2461* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which2462* allows us to process the FOLL_LONGTERM flag.2463*/2464static long __gup_longterm_locked(struct mm_struct *mm,2465unsigned long start,2466unsigned long nr_pages,2467struct page **pages,2468int *locked,2469unsigned int gup_flags)2470{2471unsigned int flags;2472long rc, nr_pinned_pages;24732474if (!(gup_flags & FOLL_LONGTERM))2475return __get_user_pages_locked(mm, start, nr_pages, pages,2476locked, gup_flags);24772478flags = memalloc_pin_save();2479do {2480nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,2481pages, locked,2482gup_flags);2483if (nr_pinned_pages <= 0) {2484rc = nr_pinned_pages;2485break;2486}24872488/* FOLL_LONGTERM implies FOLL_PIN */2489rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);2490} while (rc == -EAGAIN);2491memalloc_pin_restore(flags);2492return rc ? 
rc : nr_pinned_pages;2493}24942495/*2496* Check that the given flags are valid for the exported gup/pup interface, and2497* update them with the required flags that the caller must have set.2498*/2499static bool is_valid_gup_args(struct page **pages, int *locked,2500unsigned int *gup_flags_p, unsigned int to_set)2501{2502unsigned int gup_flags = *gup_flags_p;25032504/*2505* These flags not allowed to be specified externally to the gup2506* interfaces:2507* - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only2508* - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote()2509* - FOLL_UNLOCKABLE is internal only and used if locked is !NULL2510*/2511if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS))2512return false;25132514gup_flags |= to_set;2515if (locked) {2516/* At the external interface locked must be set */2517if (WARN_ON_ONCE(*locked != 1))2518return false;25192520gup_flags |= FOLL_UNLOCKABLE;2521}25222523/* FOLL_GET and FOLL_PIN are mutually exclusive. */2524if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==2525(FOLL_PIN | FOLL_GET)))2526return false;25272528/* LONGTERM can only be specified when pinning */2529if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))2530return false;25312532/* Pages input must be given if using GET/PIN */2533if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))2534return false;25352536/* We want to allow the pgmap to be hot-unplugged at all times */2537if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&2538(gup_flags & FOLL_PCI_P2PDMA)))2539return false;25402541*gup_flags_p = gup_flags;2542return true;2543}25442545#ifdef CONFIG_MMU2546/**2547* get_user_pages_remote() - pin user pages in memory2548* @mm: mm_struct of target mm2549* @start: starting user address2550* @nr_pages: number of pages from start to pin2551* @gup_flags: flags modifying lookup behaviour2552* @pages: array that receives pointers to the pages pinned.2553* Should be at least nr_pages long. Or NULL, if caller2554* only intends to ensure the pages are faulted in.2555* @locked: pointer to lock flag indicating whether lock is held and2556* subsequently whether VM_FAULT_RETRY functionality can be2557* utilised. Lock must initially be held.2558*2559* Returns either number of pages pinned (which may be less than the2560* number requested), or an error. Details about the return value:2561*2562* -- If nr_pages is 0, returns 0.2563* -- If nr_pages is >0, but no pages were pinned, returns -errno.2564* -- If nr_pages is >0, and some pages were pinned, returns the number of2565* pages pinned. Again, this may be less than nr_pages.2566*2567* The caller is responsible for releasing returned @pages, via put_page().2568*2569* Must be called with mmap_lock held for read or write.2570*2571* get_user_pages_remote walks a process's page tables and takes a reference2572* to each struct page that each user address corresponds to at a given2573* instant. That is, it takes the page that would be accessed if a user2574* thread accesses the given user virtual address at that instant.2575*2576* This does not guarantee that the page exists in the user mappings when2577* get_user_pages_remote returns, and there may even be a completely different2578* page there in some cases (eg. if mmapped pagecache has been invalidated2579* and subsequently re-faulted). However it does guarantee that the page2580* won't be freed completely. And mostly callers simply care that the page2581* contains data that was valid *at some point in time*. 
Typically, an IO2582* or similar operation cannot guarantee anything stronger anyway because2583* locks can't be held over the syscall boundary.2584*2585* If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page2586* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must2587* be called after the page is finished with, and before put_page is called.2588*2589* get_user_pages_remote is typically used for fewer-copy IO operations,2590* to get a handle on the memory by some means other than accesses2591* via the user virtual addresses. The pages may be submitted for2592* DMA to devices or accessed via their kernel linear mapping (via the2593* kmap APIs). Care should be taken to use the correct cache flushing APIs.2594*2595* See also get_user_pages_fast, for performance critical applications.2596*2597* get_user_pages_remote should be phased out in favor of2598* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing2599* should use get_user_pages_remote because it cannot pass2600* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.2601*/2602long get_user_pages_remote(struct mm_struct *mm,2603unsigned long start, unsigned long nr_pages,2604unsigned int gup_flags, struct page **pages,2605int *locked)2606{2607int local_locked = 1;26082609if (!is_valid_gup_args(pages, locked, &gup_flags,2610FOLL_TOUCH | FOLL_REMOTE))2611return -EINVAL;26122613return __get_user_pages_locked(mm, start, nr_pages, pages,2614locked ? locked : &local_locked,2615gup_flags);2616}2617EXPORT_SYMBOL(get_user_pages_remote);26182619#else /* CONFIG_MMU */2620long get_user_pages_remote(struct mm_struct *mm,2621unsigned long start, unsigned long nr_pages,2622unsigned int gup_flags, struct page **pages,2623int *locked)2624{2625return 0;2626}2627#endif /* !CONFIG_MMU */26282629/**2630* get_user_pages() - pin user pages in memory2631* @start: starting user address2632* @nr_pages: number of pages from start to pin2633* @gup_flags: flags modifying lookup behaviour2634* @pages: array that receives pointers to the pages pinned.2635* Should be at least nr_pages long. Or NULL, if caller2636* only intends to ensure the pages are faulted in.2637*2638* This is the same as get_user_pages_remote(), just with a less-flexible2639* calling convention where we assume that the mm being operated on belongs to2640* the current task, and doesn't allow passing of a locked parameter. We also2641* obviously don't pass FOLL_REMOTE in here.2642*/2643long get_user_pages(unsigned long start, unsigned long nr_pages,2644unsigned int gup_flags, struct page **pages)2645{2646int locked = 1;26472648if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))2649return -EINVAL;26502651return __get_user_pages_locked(current->mm, start, nr_pages, pages,2652&locked, gup_flags);2653}2654EXPORT_SYMBOL(get_user_pages);26552656/*2657* get_user_pages_unlocked() is suitable to replace the form:2658*2659* mmap_read_lock(mm);2660* get_user_pages(mm, ..., pages, NULL);2661* mmap_read_unlock(mm);2662*2663* with:2664*2665* get_user_pages_unlocked(mm, ..., pages);2666*2667* It is functionally equivalent to get_user_pages_fast so2668* get_user_pages_fast should be used instead if specific gup_flags2669* (e.g. 
FOLL_FORCE) are not required.2670*/2671long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,2672struct page **pages, unsigned int gup_flags)2673{2674int locked = 0;26752676if (!is_valid_gup_args(pages, NULL, &gup_flags,2677FOLL_TOUCH | FOLL_UNLOCKABLE))2678return -EINVAL;26792680return __get_user_pages_locked(current->mm, start, nr_pages, pages,2681&locked, gup_flags);2682}2683EXPORT_SYMBOL(get_user_pages_unlocked);26842685/*2686* GUP-fast2687*2688* get_user_pages_fast attempts to pin user pages by walking the page2689* tables directly and avoids taking locks. Thus the walker needs to be2690* protected from page table pages being freed from under it, and should2691* block any THP splits.2692*2693* One way to achieve this is to have the walker disable interrupts, and2694* rely on IPIs from the TLB flushing code blocking before the page table2695* pages are freed. This is unsuitable for architectures that do not need2696* to broadcast an IPI when invalidating TLBs.2697*2698* Another way to achieve this is to batch up page table containing pages2699* belonging to more than one mm_user, then rcu_sched a callback to free those2700* pages. Disabling interrupts will allow the gup_fast() walker to both block2701* the rcu_sched callback, and an IPI that we broadcast for splitting THPs2702* (which is a relatively rare event). The code below adopts this strategy.2703*2704* Before activating this code, please be aware that the following assumptions2705* are currently made:2706*2707* *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to2708* free pages containing page tables or TLB flushing requires IPI broadcast.2709*2710* *) ptes can be read atomically by the architecture.2711*2712* *) valid user addesses are below TASK_MAX_SIZE2713*2714* The last two assumptions can be relaxed by the addition of helper functions.2715*2716* This code is based heavily on the PowerPC implementation by Nick Piggin.2717*/2718#ifdef CONFIG_HAVE_GUP_FAST2719/*2720* Used in the GUP-fast path to determine whether GUP is permitted to work on2721* a specific folio.2722*2723* This call assumes the caller has pinned the folio, that the lowest page table2724* level still points to this folio, and that interrupts have been disabled.2725*2726* GUP-fast must reject all secretmem folios.2727*2728* Writing to pinned file-backed dirty tracked folios is inherently problematic2729* (see comment describing the writable_file_mapping_allowed() function). We2730* therefore try to avoid the most egregious case of a long-term mapping doing2731* so.2732*2733* This function cannot be as thorough as that one as the VMA is not available2734* in the fast path, so instead we whitelist known good cases and if in doubt,2735* fall back to the slow path.2736*/2737static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)2738{2739bool reject_file_backed = false;2740struct address_space *mapping;2741bool check_secretmem = false;2742unsigned long mapping_flags;27432744/*2745* If we aren't pinning then no problematic write can occur. A long term2746* pin is the most egregious case so this is the one we disallow.2747*/2748if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==2749(FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))2750reject_file_backed = true;27512752/* We hold a folio reference, so we can safely access folio fields. */27532754/* secretmem folios are always order-0 folios. 
*/2755if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))2756check_secretmem = true;27572758if (!reject_file_backed && !check_secretmem)2759return true;27602761if (WARN_ON_ONCE(folio_test_slab(folio)))2762return false;27632764/* hugetlb neither requires dirty-tracking nor can be secretmem. */2765if (folio_test_hugetlb(folio))2766return true;27672768/*2769* GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods2770* cannot proceed, which means no actions performed under RCU can2771* proceed either.2772*2773* inodes and thus their mappings are freed under RCU, which means the2774* mapping cannot be freed beneath us and thus we can safely dereference2775* it.2776*/2777lockdep_assert_irqs_disabled();27782779/*2780* However, there may be operations which _alter_ the mapping, so ensure2781* we read it once and only once.2782*/2783mapping = READ_ONCE(folio->mapping);27842785/*2786* The mapping may have been truncated, in any case we cannot determine2787* if this mapping is safe - fall back to slow path to determine how to2788* proceed.2789*/2790if (!mapping)2791return false;27922793/* Anonymous folios pose no problem. */2794mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;2795if (mapping_flags)2796return mapping_flags & FOLIO_MAPPING_ANON;27972798/*2799* At this point, we know the mapping is non-null and points to an2800* address_space object.2801*/2802if (check_secretmem && secretmem_mapping(mapping))2803return false;2804/* The only remaining allowed file system is shmem. */2805return !reject_file_backed || shmem_mapping(mapping);2806}28072808static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start,2809unsigned int flags, struct page **pages)2810{2811while ((*nr) - nr_start) {2812struct folio *folio = page_folio(pages[--(*nr)]);28132814folio_clear_referenced(folio);2815gup_put_folio(folio, 1, flags);2816}2817}28182819#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL2820/*2821* GUP-fast relies on pte change detection to avoid concurrent pgtable2822* operations.2823*2824* To pin the page, GUP-fast needs to do below in order:2825* (1) pin the page (by prefetching pte), then (2) check pte not changed.2826*2827* For the rest of pgtable operations where pgtable updates can be racy2828* with GUP-fast, we need to do (1) clear pte, then (2) check whether page2829* is pinned.2830*2831* Above will work for all pte-level operations, including THP split.2832*2833* For THP collapse, it's a bit more complicated because GUP-fast may be2834* walking a pgtable page that is being freed (pte is still valid but pmd2835* can be cleared already). 
To avoid race in such condition, we need to2836* also check pmd here to make sure pmd doesn't change (corresponds to2837* pmdp_collapse_flush() in the THP collapse code path).2838*/2839static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,2840unsigned long end, unsigned int flags, struct page **pages,2841int *nr)2842{2843int ret = 0;2844pte_t *ptep, *ptem;28452846ptem = ptep = pte_offset_map(&pmd, addr);2847if (!ptep)2848return 0;2849do {2850pte_t pte = ptep_get_lockless(ptep);2851struct page *page;2852struct folio *folio;28532854/*2855* Always fallback to ordinary GUP on PROT_NONE-mapped pages:2856* pte_access_permitted() better should reject these pages2857* either way: otherwise, GUP-fast might succeed in2858* cases where ordinary GUP would fail due to VMA access2859* permissions.2860*/2861if (pte_protnone(pte))2862goto pte_unmap;28632864if (!pte_access_permitted(pte, flags & FOLL_WRITE))2865goto pte_unmap;28662867if (pte_special(pte))2868goto pte_unmap;28692870/* If it's not marked as special it must have a valid memmap. */2871VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));2872page = pte_page(pte);28732874folio = try_grab_folio_fast(page, 1, flags);2875if (!folio)2876goto pte_unmap;28772878if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||2879unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {2880gup_put_folio(folio, 1, flags);2881goto pte_unmap;2882}28832884if (!gup_fast_folio_allowed(folio, flags)) {2885gup_put_folio(folio, 1, flags);2886goto pte_unmap;2887}28882889if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {2890gup_put_folio(folio, 1, flags);2891goto pte_unmap;2892}28932894/*2895* We need to make the page accessible if and only if we are2896* going to access its content (the FOLL_PIN case). Please2897* see Documentation/core-api/pin_user_pages.rst for2898* details.2899*/2900if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {2901gup_put_folio(folio, 1, flags);2902goto pte_unmap;2903}2904folio_set_referenced(folio);2905pages[*nr] = page;2906(*nr)++;2907} while (ptep++, addr += PAGE_SIZE, addr != end);29082909ret = 1;29102911pte_unmap:2912pte_unmap(ptem);2913return ret;2914}2915#else29162917/*2918* If we can't determine whether or not a pte is special, then fail immediately2919* for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not2920* to be special.2921*2922* For a futex to be placed on a THP tail page, get_futex_key requires a2923* get_user_pages_fast_only implementation that can pin pages. 
Thus it's still2924* useful to have gup_fast_pmd_leaf even if we can't operate on ptes.2925*/2926static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,2927unsigned long end, unsigned int flags, struct page **pages,2928int *nr)2929{2930return 0;2931}2932#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */29332934static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,2935unsigned long end, unsigned int flags, struct page **pages,2936int *nr)2937{2938struct page *page;2939struct folio *folio;2940int refs;29412942if (!pmd_access_permitted(orig, flags & FOLL_WRITE))2943return 0;29442945if (pmd_special(orig))2946return 0;29472948refs = (end - addr) >> PAGE_SHIFT;2949page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);29502951folio = try_grab_folio_fast(page, refs, flags);2952if (!folio)2953return 0;29542955if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {2956gup_put_folio(folio, refs, flags);2957return 0;2958}29592960if (!gup_fast_folio_allowed(folio, flags)) {2961gup_put_folio(folio, refs, flags);2962return 0;2963}2964if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {2965gup_put_folio(folio, refs, flags);2966return 0;2967}29682969pages += *nr;2970*nr += refs;2971for (; refs; refs--)2972*(pages++) = page++;2973folio_set_referenced(folio);2974return 1;2975}29762977static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,2978unsigned long end, unsigned int flags, struct page **pages,2979int *nr)2980{2981struct page *page;2982struct folio *folio;2983int refs;29842985if (!pud_access_permitted(orig, flags & FOLL_WRITE))2986return 0;29872988if (pud_special(orig))2989return 0;29902991refs = (end - addr) >> PAGE_SHIFT;2992page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);29932994folio = try_grab_folio_fast(page, refs, flags);2995if (!folio)2996return 0;29972998if (unlikely(pud_val(orig) != pud_val(*pudp))) {2999gup_put_folio(folio, refs, flags);3000return 0;3001}30023003if (!gup_fast_folio_allowed(folio, flags)) {3004gup_put_folio(folio, refs, flags);3005return 0;3006}30073008if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {3009gup_put_folio(folio, refs, flags);3010return 0;3011}30123013pages += *nr;3014*nr += refs;3015for (; refs; refs--)3016*(pages++) = page++;3017folio_set_referenced(folio);3018return 1;3019}30203021static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,3022unsigned long end, unsigned int flags, struct page **pages,3023int *nr)3024{3025unsigned long next;3026pmd_t *pmdp;30273028pmdp = pmd_offset_lockless(pudp, pud, addr);3029do {3030pmd_t pmd = pmdp_get_lockless(pmdp);30313032next = pmd_addr_end(addr, end);3033if (!pmd_present(pmd))3034return 0;30353036if (unlikely(pmd_leaf(pmd))) {3037/* See gup_fast_pte_range() */3038if (pmd_protnone(pmd))3039return 0;30403041if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags,3042pages, nr))3043return 0;30443045} else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,3046pages, nr))3047return 0;3048} while (pmdp++, addr = next, addr != end);30493050return 1;3051}30523053static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,3054unsigned long end, unsigned int flags, struct page **pages,3055int *nr)3056{3057unsigned long next;3058pud_t *pudp;30593060pudp = pud_offset_lockless(p4dp, p4d, addr);3061do {3062pud_t pud = READ_ONCE(*pudp);30633064next = pud_addr_end(addr, end);3065if (unlikely(!pud_present(pud)))3066return 0;3067if (unlikely(pud_leaf(pud))) {3068if (!gup_fast_pud_leaf(pud, pudp, addr, next, 
flags,3069pages, nr))3070return 0;3071} else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,3072pages, nr))3073return 0;3074} while (pudp++, addr = next, addr != end);30753076return 1;3077}30783079static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,3080unsigned long end, unsigned int flags, struct page **pages,3081int *nr)3082{3083unsigned long next;3084p4d_t *p4dp;30853086p4dp = p4d_offset_lockless(pgdp, pgd, addr);3087do {3088p4d_t p4d = READ_ONCE(*p4dp);30893090next = p4d_addr_end(addr, end);3091if (!p4d_present(p4d))3092return 0;3093BUILD_BUG_ON(p4d_leaf(p4d));3094if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,3095pages, nr))3096return 0;3097} while (p4dp++, addr = next, addr != end);30983099return 1;3100}31013102static void gup_fast_pgd_range(unsigned long addr, unsigned long end,3103unsigned int flags, struct page **pages, int *nr)3104{3105unsigned long next;3106pgd_t *pgdp;31073108pgdp = pgd_offset(current->mm, addr);3109do {3110pgd_t pgd = READ_ONCE(*pgdp);31113112next = pgd_addr_end(addr, end);3113if (pgd_none(pgd))3114return;3115BUILD_BUG_ON(pgd_leaf(pgd));3116if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,3117pages, nr))3118return;3119} while (pgdp++, addr = next, addr != end);3120}3121#else3122static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,3123unsigned int flags, struct page **pages, int *nr)3124{3125}3126#endif /* CONFIG_HAVE_GUP_FAST */31273128#ifndef gup_fast_permitted3129/*3130* Check if it's allowed to use get_user_pages_fast_only() for the range, or3131* we need to fall back to the slow version:3132*/3133static bool gup_fast_permitted(unsigned long start, unsigned long end)3134{3135return true;3136}3137#endif31383139static unsigned long gup_fast(unsigned long start, unsigned long end,3140unsigned int gup_flags, struct page **pages)3141{3142unsigned long flags;3143int nr_pinned = 0;3144unsigned seq;31453146if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||3147!gup_fast_permitted(start, end))3148return 0;31493150if (gup_flags & FOLL_PIN) {3151if (!raw_seqcount_try_begin(¤t->mm->write_protect_seq, seq))3152return 0;3153}31543155/*3156* Disable interrupts. The nested form is used, in order to allow full,3157* general purpose use of this routine.3158*3159* With interrupts disabled, we block page table pages from being freed3160* from under us. 
See struct mmu_table_batch comments in3161* include/asm-generic/tlb.h for more details.3162*3163* We do not adopt an rcu_read_lock() here as we also want to block IPIs3164* that come from callers of tlb_remove_table_sync_one().3165*/3166local_irq_save(flags);3167gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);3168local_irq_restore(flags);31693170/*3171* When pinning pages for DMA there could be a concurrent write protect3172* from fork() via copy_page_range(), in this case always fail GUP-fast.3173*/3174if (gup_flags & FOLL_PIN) {3175if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) {3176gup_fast_unpin_user_pages(pages, nr_pinned);3177return 0;3178} else {3179sanity_check_pinned_pages(pages, nr_pinned);3180}3181}3182return nr_pinned;3183}31843185static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,3186unsigned int gup_flags, struct page **pages)3187{3188unsigned long len, end;3189unsigned long nr_pinned;3190int locked = 0;3191int ret;31923193if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |3194FOLL_FORCE | FOLL_PIN | FOLL_GET |3195FOLL_FAST_ONLY | FOLL_NOFAULT |3196FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))3197return -EINVAL;31983199if (gup_flags & FOLL_PIN)3200mm_set_has_pinned_flag(current->mm);32013202if (!(gup_flags & FOLL_FAST_ONLY))3203might_lock_read(¤t->mm->mmap_lock);32043205start = untagged_addr(start) & PAGE_MASK;3206len = nr_pages << PAGE_SHIFT;3207if (check_add_overflow(start, len, &end))3208return -EOVERFLOW;3209if (end > TASK_SIZE_MAX)3210return -EFAULT;32113212nr_pinned = gup_fast(start, end, gup_flags, pages);3213if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)3214return nr_pinned;32153216/* Slow path: try to get the remaining pages with get_user_pages */3217start += nr_pinned << PAGE_SHIFT;3218pages += nr_pinned;3219ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,3220pages, &locked,3221gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);3222if (ret < 0) {3223/*3224* The caller has to unpin the pages we already pinned so3225* returning -errno is not an option3226*/3227if (nr_pinned)3228return nr_pinned;3229return ret;3230}3231return ret + nr_pinned;3232}32333234/**3235* get_user_pages_fast_only() - pin user pages in memory3236* @start: starting user address3237* @nr_pages: number of pages from start to pin3238* @gup_flags: flags modifying pin behaviour3239* @pages: array that receives pointers to the pages pinned.3240* Should be at least nr_pages long.3241*3242* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to3243* the regular GUP.3244*3245* If the architecture does not support this function, simply return with no3246* pages pinned.3247*3248* Careful, careful! COW breaking can go either way, so a non-write3249* access can get ambiguous page results. 
If you call this function without3250* 'write' set, you'd better be sure that you're ok with that ambiguity.3251*/3252int get_user_pages_fast_only(unsigned long start, int nr_pages,3253unsigned int gup_flags, struct page **pages)3254{3255/*3256* Internally (within mm/gup.c), gup fast variants must set FOLL_GET,3257* because gup fast is always a "pin with a +1 page refcount" request.3258*3259* FOLL_FAST_ONLY is required in order to match the API description of3260* this routine: no fall back to regular ("slow") GUP.3261*/3262if (!is_valid_gup_args(pages, NULL, &gup_flags,3263FOLL_GET | FOLL_FAST_ONLY))3264return -EINVAL;32653266return gup_fast_fallback(start, nr_pages, gup_flags, pages);3267}3268EXPORT_SYMBOL_GPL(get_user_pages_fast_only);32693270/**3271* get_user_pages_fast() - pin user pages in memory3272* @start: starting user address3273* @nr_pages: number of pages from start to pin3274* @gup_flags: flags modifying pin behaviour3275* @pages: array that receives pointers to the pages pinned.3276* Should be at least nr_pages long.3277*3278* Attempt to pin user pages in memory without taking mm->mmap_lock.3279* If not successful, it will fall back to taking the lock and3280* calling get_user_pages().3281*3282* Returns number of pages pinned. This may be fewer than the number requested.3283* If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns3284* -errno.3285*/3286int get_user_pages_fast(unsigned long start, int nr_pages,3287unsigned int gup_flags, struct page **pages)3288{3289/*3290* The caller may or may not have explicitly set FOLL_GET; either way is3291* OK. However, internally (within mm/gup.c), gup fast variants must set3292* FOLL_GET, because gup fast is always a "pin with a +1 page refcount"3293* request.3294*/3295if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))3296return -EINVAL;3297return gup_fast_fallback(start, nr_pages, gup_flags, pages);3298}3299EXPORT_SYMBOL_GPL(get_user_pages_fast);33003301/**3302* pin_user_pages_fast() - pin user pages in memory without taking locks3303*3304* @start: starting user address3305* @nr_pages: number of pages from start to pin3306* @gup_flags: flags modifying pin behaviour3307* @pages: array that receives pointers to the pages pinned.3308* Should be at least nr_pages long.3309*3310* Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See3311* get_user_pages_fast() for documentation on the function arguments, because3312* the arguments here are identical.3313*3314* FOLL_PIN means that the pages must be released via unpin_user_page(). 
Please3315* see Documentation/core-api/pin_user_pages.rst for further details.3316*3317* Note that if a zero_page is amongst the returned pages, it will not have3318* pins in it and unpin_user_page() will not remove pins from it.3319*/3320int pin_user_pages_fast(unsigned long start, int nr_pages,3321unsigned int gup_flags, struct page **pages)3322{3323if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))3324return -EINVAL;3325return gup_fast_fallback(start, nr_pages, gup_flags, pages);3326}3327EXPORT_SYMBOL_GPL(pin_user_pages_fast);33283329/**3330* pin_user_pages_remote() - pin pages of a remote process3331*3332* @mm: mm_struct of target mm3333* @start: starting user address3334* @nr_pages: number of pages from start to pin3335* @gup_flags: flags modifying lookup behaviour3336* @pages: array that receives pointers to the pages pinned.3337* Should be at least nr_pages long.3338* @locked: pointer to lock flag indicating whether lock is held and3339* subsequently whether VM_FAULT_RETRY functionality can be3340* utilised. Lock must initially be held.3341*3342* Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See3343* get_user_pages_remote() for documentation on the function arguments, because3344* the arguments here are identical.3345*3346* FOLL_PIN means that the pages must be released via unpin_user_page(). Please3347* see Documentation/core-api/pin_user_pages.rst for details.3348*3349* Note that if a zero_page is amongst the returned pages, it will not have3350* pins in it and unpin_user_page*() will not remove pins from it.3351*/3352long pin_user_pages_remote(struct mm_struct *mm,3353unsigned long start, unsigned long nr_pages,3354unsigned int gup_flags, struct page **pages,3355int *locked)3356{3357int local_locked = 1;33583359if (!is_valid_gup_args(pages, locked, &gup_flags,3360FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))3361return 0;3362return __gup_longterm_locked(mm, start, nr_pages, pages,3363locked ? locked : &local_locked,3364gup_flags);3365}3366EXPORT_SYMBOL(pin_user_pages_remote);33673368/**3369* pin_user_pages() - pin user pages in memory for use by other devices3370*3371* @start: starting user address3372* @nr_pages: number of pages from start to pin3373* @gup_flags: flags modifying lookup behaviour3374* @pages: array that receives pointers to the pages pinned.3375* Should be at least nr_pages long.3376*3377* Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and3378* FOLL_PIN is set.3379*3380* FOLL_PIN means that the pages must be released via unpin_user_page(). Please3381* see Documentation/core-api/pin_user_pages.rst for details.3382*3383* Note that if a zero_page is amongst the returned pages, it will not have3384* pins in it and unpin_user_page*() will not remove pins from it.3385*/3386long pin_user_pages(unsigned long start, unsigned long nr_pages,3387unsigned int gup_flags, struct page **pages)3388{3389int locked = 1;33903391if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))3392return 0;3393return __gup_longterm_locked(current->mm, start, nr_pages,3394pages, &locked, gup_flags);3395}3396EXPORT_SYMBOL(pin_user_pages);33973398/*3399* pin_user_pages_unlocked() is the FOLL_PIN variant of3400* get_user_pages_unlocked(). 
Behavior is the same, except that this one sets3401* FOLL_PIN and rejects FOLL_GET.3402*3403* Note that if a zero_page is amongst the returned pages, it will not have3404* pins in it and unpin_user_page*() will not remove pins from it.3405*/3406long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,3407struct page **pages, unsigned int gup_flags)3408{3409int locked = 0;34103411if (!is_valid_gup_args(pages, NULL, &gup_flags,3412FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))3413return 0;34143415return __gup_longterm_locked(current->mm, start, nr_pages, pages,3416&locked, gup_flags);3417}3418EXPORT_SYMBOL(pin_user_pages_unlocked);34193420/**3421* memfd_pin_folios() - pin folios associated with a memfd3422* @memfd: the memfd whose folios are to be pinned3423* @start: the first memfd offset3424* @end: the last memfd offset (inclusive)3425* @folios: array that receives pointers to the folios pinned3426* @max_folios: maximum number of entries in @folios3427* @offset: the offset into the first folio3428*3429* Attempt to pin folios associated with a memfd in the contiguous range3430* [start, end]. Given that a memfd is either backed by shmem or hugetlb,3431* the folios can either be found in the page cache or need to be allocated3432* if necessary. Once the folios are located, they are all pinned via3433* FOLL_PIN and @offset is populatedwith the offset into the first folio.3434* And, eventually, these pinned folios must be released either using3435* unpin_folios() or unpin_folio().3436*3437* It must be noted that the folios may be pinned for an indefinite amount3438* of time. And, in most cases, the duration of time they may stay pinned3439* would be controlled by the userspace. This behavior is effectively the3440* same as using FOLL_LONGTERM with other GUP APIs.3441*3442* Returns number of folios pinned, which could be less than @max_folios3443* as it depends on the folio sizes that cover the range [start, end].3444* If no folios were pinned, it returns -errno.3445*/3446long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,3447struct folio **folios, unsigned int max_folios,3448pgoff_t *offset)3449{3450unsigned int flags, nr_folios, nr_found;3451unsigned int i, pgshift = PAGE_SHIFT;3452pgoff_t start_idx, end_idx;3453struct folio *folio = NULL;3454struct folio_batch fbatch;3455struct hstate *h;3456long ret = -EINVAL;34573458if (start < 0 || start > end || !max_folios)3459return -EINVAL;34603461if (!memfd)3462return -EINVAL;34633464if (!shmem_file(memfd) && !is_file_hugepages(memfd))3465return -EINVAL;34663467if (end >= i_size_read(file_inode(memfd)))3468return -EINVAL;34693470if (is_file_hugepages(memfd)) {3471h = hstate_file(memfd);3472pgshift = huge_page_shift(h);3473}34743475flags = memalloc_pin_save();3476do {3477nr_folios = 0;3478start_idx = start >> pgshift;3479end_idx = end >> pgshift;3480if (is_file_hugepages(memfd)) {3481start_idx <<= huge_page_order(h);3482end_idx <<= huge_page_order(h);3483}34843485folio_batch_init(&fbatch);3486while (start_idx <= end_idx && nr_folios < max_folios) {3487/*3488* In most cases, we should be able to find the folios3489* in the page cache. 
If we cannot find them for some3490* reason, we try to allocate them and add them to the3491* page cache.3492*/3493nr_found = filemap_get_folios_contig(memfd->f_mapping,3494&start_idx,3495end_idx,3496&fbatch);3497if (folio) {3498folio_put(folio);3499folio = NULL;3500}35013502for (i = 0; i < nr_found; i++) {3503folio = fbatch.folios[i];35043505if (try_grab_folio(folio, 1, FOLL_PIN)) {3506folio_batch_release(&fbatch);3507ret = -EINVAL;3508goto err;3509}35103511if (nr_folios == 0)3512*offset = offset_in_folio(folio, start);35133514folios[nr_folios] = folio;3515if (++nr_folios == max_folios)3516break;3517}35183519folio = NULL;3520folio_batch_release(&fbatch);3521if (!nr_found) {3522folio = memfd_alloc_folio(memfd, start_idx);3523if (IS_ERR(folio)) {3524ret = PTR_ERR(folio);3525if (ret != -EEXIST)3526goto err;3527folio = NULL;3528}3529}3530}35313532ret = check_and_migrate_movable_folios(nr_folios, folios);3533} while (ret == -EAGAIN);35343535memalloc_pin_restore(flags);3536return ret ? ret : nr_folios;3537err:3538memalloc_pin_restore(flags);3539unpin_folios(folios, nr_folios);35403541return ret;3542}3543EXPORT_SYMBOL_GPL(memfd_pin_folios);35443545/**3546* folio_add_pins() - add pins to an already-pinned folio3547* @folio: the folio to add more pins to3548* @pins: number of pins to add3549*3550* Try to add more pins to an already-pinned folio. The semantics3551* of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot3552* be changed.3553*3554* This function is helpful when having obtained a pin on a large folio3555* using memfd_pin_folios(), but wanting to logically unpin parts3556* (e.g., individual pages) of the folio later, for example, using3557* unpin_user_page_range_dirty_lock().3558*3559* This is not the right interface to initially pin a folio.3560*/3561int folio_add_pins(struct folio *folio, unsigned int pins)3562{3563VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));35643565return try_grab_folio(folio, pins, FOLL_PIN);3566}3567EXPORT_SYMBOL_GPL(folio_add_pins);356835693570