// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>

#include <linux/page_pinner.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

static void hpage_pincount_add(struct page *page, int refs)
{
	VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
	VM_BUG_ON_PAGE(page != compound_head(page), page);

	atomic_add(refs, compound_pincount_ptr(page));
}

static void hpage_pincount_sub(struct page *page, int refs)
{
	VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
	VM_BUG_ON_PAGE(page != compound_head(page), page);

	atomic_sub(refs, compound_pincount_ptr(page));
}

/* Equivalent to calling put_page() @refs times. */
static void put_page_refs(struct page *page, int refs)
{
#ifdef CONFIG_DEBUG_VM
	if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
		return;
#endif

	/*
	 * Calling put_page() for each ref is unnecessarily slow. Only the last
	 * ref needs a put_page().
	 */
	if (refs > 1)
		page_ref_sub(page, refs - 1);
	put_page(page);
}

/*
 * Return the compound head page with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
	struct page *head = compound_head(page);

	if (WARN_ON_ONCE(page_ref_count(head) < 0))
		return NULL;
	if (unlikely(!page_cache_add_speculative(head, refs)))
		return NULL;

	/*
	 * At this point we have a stable reference to the head page; but it
	 * could be that between the compound_head() lookup and the refcount
	 * increment, the compound page was split, in which case we'd end up
	 * holding a reference on a page that has nothing to do with the page
	 * we were given anymore.
	 * So now that the head page is stable, recheck that the pages still
	 * belong together.
	 */
	if (unlikely(compound_head(page) != head)) {
		put_page_refs(head, refs);
		return NULL;
	}

	return head;
}

/*
 * try_grab_compound_head() - attempt to elevate a page's refcount, by a
 * flags-dependent amount.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
 *
 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: page's refcount will be incremented by 1.
 *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
 *
 * Return: head page (with refcount appropriately incremented) for success, or
 * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
 * considered failure, and furthermore, a likely bug in the caller, so a warning
 * is also emitted.
 */
static __maybe_unused struct page *try_grab_compound_head(struct page *page,
							   int refs,
							   unsigned int flags)
{
	if (flags & FOLL_GET) {
		struct page *head = try_get_compound_head(page, refs);
		if (head)
			set_page_pinner(head, compound_order(head));
		return head;
	} else if (flags & FOLL_PIN) {
		int orig_refs = refs;

		/*
		 * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
		 * path, so fail and let the caller fall back to the slow path.
		 */
		if (unlikely(flags & FOLL_LONGTERM) &&
		    is_migrate_cma_page(page))
			return NULL;

		/*
		 * CAUTION: Don't use compound_head() on the page before this
		 * point, the result won't be stable.
		 */
		page = try_get_compound_head(page, refs);
		if (!page)
			return NULL;

		/*
		 * When pinning a compound page of order > 1 (which is what
		 * hpage_pincount_available() checks for), use an exact count to
		 * track it, via hpage_pincount_add/_sub().
		 *
		 * However, be sure to *also* increment the normal page refcount
		 * field at least once, so that the page really is pinned.
		 */
		if (hpage_pincount_available(page))
			hpage_pincount_add(page, refs);
		else
			page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));

		mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
				    orig_refs);

		return page;
	}

	WARN_ON_ONCE(1);
	return NULL;
}
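
/*
 * A minimal sketch of what the two cases above do to an order-0 page whose
 * refcount happens to start at 3 (the 3 is arbitrary; GUP_PIN_COUNTING_BIAS
 * is 1024, from <linux/mm.h>):
 *
 *	FOLL_GET, refs == 1:	refcount 3 -> 4
 *	FOLL_PIN, refs == 1:	refcount 3 -> 3 + 1024 = 1027, and
 *				page_maybe_dma_pinned() now reports true
 *
 * For compound pages where hpage_pincount_available() is true, the pin count
 * is instead tracked exactly via hpage_pincount_add(), and the refcount is
 * only bumped by @refs (not by the bias).
 */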

static void put_compound_head(struct page *page, int refs, unsigned int flags)
{
	if (flags & FOLL_PIN) {
		mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
				    refs);

		if (hpage_pincount_available(page))
			hpage_pincount_sub(page, refs);
		else
			refs *= GUP_PIN_COUNTING_BIAS;
	}

	if (flags & FOLL_GET)
		reset_page_pinner(page, compound_order(page));
	put_page_refs(page, refs);
}

/**
 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
 *
 * @page:    pointer to page to be grabbed
 * @flags:   gup flags: these are the FOLL_* flag values.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time. Cases:
 *
 *    FOLL_GET: page's refcount will be incremented by 1.
 *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
 *
 * Return: true for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
 * FOLL_PIN was set, but the page could not be grabbed.
 */
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
	WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));

	if (flags & FOLL_GET) {
		bool ret = try_get_page(page);

		if (ret) {
			page = compound_head(page);
			set_page_pinner(page, compound_order(page));
		}
		return ret;
	} else if (flags & FOLL_PIN) {
		int refs = 1;

		page = compound_head(page);

		if (WARN_ON_ONCE(page_ref_count(page) <= 0))
			return false;

		if (hpage_pincount_available(page))
			hpage_pincount_add(page, 1);
		else
			refs = GUP_PIN_COUNTING_BIAS;

		/*
		 * Similar to try_grab_compound_head(): even if using the
		 * hpage_pincount_add/_sub() routines, be sure to
		 * *also* increment the normal page refcount field at least
		 * once, so that the page really is pinned.
		 */
		page_ref_add(page, refs);

		mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
	}

	return true;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:   pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	put_compound_head(compound_head(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);

/*
 * put_user_page() - release a page obtained using get_user_pages() or
 *                   follow_page(FOLL_GET)
 * @page:   pointer to page to be released
 *
 * Pages that were obtained via get_user_pages()/follow_page(FOLL_GET) must be
 * released via put_user_page.
 * note: If it's not a page from GUP or follow_page(FOLL_GET), it's harmless.
 */
void put_user_page(struct page *page)
{
	struct page *head = compound_head(page);

	reset_page_pinner(head, compound_order(head));
	put_page(page);
}
EXPORT_SYMBOL(put_user_page);

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long index;

	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	for (index = 0; index < npages; index++) {
		struct page *page = compound_head(pages[index]);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * page_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!PageDirty(page))
			set_page_dirty_lock(page);
		unpin_user_page(page);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
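
/*
 * A minimal usage sketch of the pin/unpin pairing documented above, assuming
 * a caller that lets a device DMA into a small user buffer (user_addr, the
 * buffer size and the error handling are illustrative placeholders):
 *
 *	struct page *pages[8];
 *	int nr;
 *
 *	nr = pin_user_pages_fast(user_addr, 8, FOLL_WRITE | FOLL_LONGTERM,
 *				 pages);
 *	if (nr <= 0)
 *		return nr ? nr : -EFAULT;
 *
 *	// ... program the device, wait for the DMA to complete ...
 *
 *	unpin_user_pages_dirty_lock(pages, nr, true);
 *
 * FOLL_WRITE plus make_dirty == true matches the common case of hardware
 * writing into the pages; read-only users would pass false instead.
 */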

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long index;

	/*
	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
	 * leaving them pinned), but probably not. More likely, gup/pup returned
	 * a hard -ERRNO error to the caller, who erroneously passed it here.
	 */
	if (WARN_ON(IS_ERR_VALUE(npages)))
		return;
	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */
	for (index = 0; index < npages; index++)
		unpin_user_page(pages[index]);
}
EXPORT_SYMBOL(unpin_user_pages);

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) &&
			(vma_is_anonymous(vma) || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	/* No page to get reference */
	if (flags & FOLL_GET)
		return -EFAULT;

	if (flags & FOLL_TOUCH) {
		pte_t entry = *pte;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(*pte, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/*
 * FOLL_FORCE can write to even unwritable pte's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
	return pte_write(pte) ||
		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int ret;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return ERR_PTR(-EINVAL);
retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}

	page = vm_normal_page(vma, address, pte);
	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
		/*
		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
		 * case since they are only valid while holding the pgmap
		 * reference.
		 */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (*pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
		get_page(page);
		pte_unmap_unlock(ptep, ptl);
		lock_page(page);
		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);
		if (ret)
			return ERR_PTR(ret);
		goto retry;
	}

	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
	if (unlikely(!try_grab_page(page, flags))) {
		page = ERR_PTR(-ENOMEM);
		goto out;
	}
	/*
	 * We need to make the page accessible if and only if we are going
	 * to access its content (the FOLL_PIN case).  Please see
	 * Documentation/core-api/pin_user_pages.rst for details.
	 */
	if (flags & FOLL_PIN) {
		ret = arch_make_page_accessible(page);
		if (ret) {
			unpin_user_page(page);
			page = ERR_PTR(ret);
			goto out;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Do not mlock pte-mapped THP */
		if (PageTransCompound(page))
			goto out;

		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();	/* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	/*
	 * The READ_ONCE() will stabilize the pmdval in a register or
	 * on the stack so that it will stop changing under the code.
	 */
	pmdval = READ_ONCE(*pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags);
	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pmd(mm, address, pmd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pmd_val(pmdval)), flags,
				      PMD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
retry:
	if (!pmd_present(pmdval)) {
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		VM_BUG_ON(thp_migration_supported() &&
			  !is_pmd_migration_entry(pmdval));
		if (is_pmd_migration_entry(pmdval))
			pmd_migration_entry_wait(mm, pmd);
		pmdval = READ_ONCE(*pmd);
		/*
		 * MADV_DONTNEED may convert the pmd to null because
		 * mmap_lock is held in read mode
		 */
		if (pmd_none(pmdval))
			return no_page_table(vma, flags);
		goto retry;
	}
	if (pmd_devmap(pmdval)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
		return no_page_table(vma, flags);

retry_locked:
	ptl = pmd_lock(mm, pmd);
	if (unlikely(pmd_none(*pmd))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags);
	}
	if (unlikely(!pmd_present(*pmd))) {
		spin_unlock(ptl);
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		pmd_migration_entry_wait(mm, pmd);
		goto retry_locked;
	}
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
		int ret;
		page = pmd_page(*pmd);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			ret = 0;
			split_huge_pmd(vma, pmd, address);
			if (pmd_trans_unstable(pmd))
				ret = -EBUSY;
		} else if (flags & FOLL_SPLIT) {
			if (unlikely(!try_get_page(page))) {
				spin_unlock(ptl);
				return ERR_PTR(-ENOMEM);
			}
			spin_unlock(ptl);
			lock_page(page);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (pmd_none(*pmd))
				return no_page_table(vma, flags);
		} else {	/* flags & FOLL_SPLIT_PMD */
			spin_unlock(ptl);
			split_huge_pmd(vma, pmd, address);
			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
		}

		return ret ? ERR_PTR(ret) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	ctx->page_mask = HPAGE_PMD_NR - 1;
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pud = pud_offset(p4dp, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pud(mm, address, pud, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pud_val(*pud)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pud_val(*pud)), flags,
				      PUD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	return follow_pmd_mask(vma, address, pud, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4d;
	struct page *page;

	p4d = p4d_offset(pgdp, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);

	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(p4d_val(*p4d)), flags,
				      P4D_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	return follow_pud_mask(vma, address, p4d, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	ctx->page_mask = 0;

	/* make this handle hugepd */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
		return page;
	}

	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);

	if (pgd_huge(*pgd)) {
		page = follow_huge_pgd(mm, address, pgd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pgd_val(*pgd)), flags,
				      PGDIR_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}

	return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct follow_page_context ctx = { NULL };
	struct page *page;

	page = follow_page_mask(vma, address, foll_flags, &ctx);
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return page;
}
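
/*
 * A minimal sketch of how mm-internal callers (KSM, page migration and the
 * like) typically use follow_page(); vma and addr are assumed to come from a
 * VMA walk done under mmap_read_lock(mm):
 *
 *	page = follow_page(vma, addr, FOLL_GET);
 *	if (IS_ERR_OR_NULL(page))
 *		continue;		// nothing mapped, or a special mapping
 *	// ... inspect or isolate the page ...
 *	put_user_page(page);		// drops the FOLL_GET reference
 *
 * Without FOLL_GET the returned page must not be used once the page table
 * lock has been dropped, since nothing then holds it in place.
 */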

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	pte = pte_offset_map(pmd, address);
	if (pte_none(*pte))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, *pte);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
			goto unmap;
		*page = pte_page(*pte);
	}
	if (unlikely(!try_grab_page(*page, gup_flags))) {
		ret = -ENOMEM;
		goto unmap;
	}
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_lock must be held on entry.  If @locked != NULL and *@flags
 * does not include FOLL_NOWAIT, the mmap_lock may be released.  If it
 * is, *@locked will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, int *locked)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	/* mlock all present pages, but do not fault in new pages */
	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
		return -ENOENT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (locked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		/*
		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
		 * can co-exist
		 */
		fault_flags |= FAULT_FLAG_TRIED;
	}

	ret = handle_mm_fault(vma, address, fault_flags, NULL);
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*locked = 0;
		return -EBUSY;
	}

	/*
	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
	 * can thus safely do subsequent page lookups as if they were reads.
	 * But only do so when looping for pte_write is futile: in some cases
	 * userspace may also be wanting to write to the gotten user page,
	 * which a read fault here might prevent (a readonly page might get
	 * reCOWed by userspace write).
	 */
	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
		*flags |= FOLL_COW;
	return 0;
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
		return -EFAULT;

	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
		return -EOPNOTSUPP;

	if (write) {
		if (!(vm_flags & VM_WRITE)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @locked:	whether we're still with the mmap_lock held
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 * -- 0 return value is possible when the fault would need to be retried.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * @vmas are valid only as long as mmap_lock is held.
 *
 * Must be called with mmap_lock held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
 * released by an up_read().  That can happen if @gup_flags does not
 * have FOLL_NOWAIT.
 *
 * A caller using such a combination of @locked and @gup_flags
 * must therefore hold the mmap_lock for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	start = untagged_addr(start);

	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	/*
	 * If FOLL_FORCE is set then do not force a full fault as the hinting
	 * fault information is unrelated to the reference behaviour of a task
	 * using the address space
	 */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;

	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			vma = find_extend_vma(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma) {
				ret = -EFAULT;
				goto out;
			}
			ret = check_vma_flags(vma, gup_flags);
			if (ret)
				goto out;

			if (is_vm_hugetlb_page(vma)) {
				i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags, locked);
				if (locked && *locked == 0) {
					/*
					 * We've got a VM_FAULT_RETRY
					 * and we've lost mmap_lock.
					 * We must stop here.
					 */
					BUG_ON(gup_flags & FOLL_NOWAIT);
					BUG_ON(ret != 0);
					goto out;
				}
				continue;
			}
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, foll_flags, &ctx);
		if (!page) {
			ret = faultin_page(vma, start, &foll_flags, locked);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
				ret = 0;
				fallthrough;
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			case -ENOENT:
				goto next_page;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page.
			 */
			goto next_page;
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		if (pages) {
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			ctx.page_mask = 0;
		}
next_page:
		if (vmas) {
			vmas[i] = vma;
			ctx.page_mask = 0;
		}
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;
		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return i ? i : ret;
}
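
/*
 * A minimal sketch of the usual pattern for the non-fast GUP entry points
 * that the comment above points callers at; start, nr and pages[] are
 * placeholders and the access here is read-only:
 *
 *	mmap_read_lock(current->mm);
 *	got = get_user_pages(start, nr, 0, pages, NULL);
 *	mmap_read_unlock(current->mm);
 *	if (got <= 0)
 *		return got;
 *	// ... read from the pages ...
 *	while (got--)
 *		put_page(pages[got]);
 *
 * Callers that pin pages for DMA should use the pin_user_pages*() and
 * unpin_user_page*() APIs instead, as described earlier in this file.
 */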

static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

Souptick Joarderadc8cb42020-06-01 21:48:24 -07001201/**
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001202 * fixup_user_fault() - manually resolve a user page fault
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001203 * @mm: mm_struct of target mm
1204 * @address: user address
1205 * @fault_flags:flags to pass down to handle_mm_fault()
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07001206 * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
Miles Chen548b6a12020-06-01 21:48:33 -07001207 * does not allow retry. If NULL, the caller must guarantee
1208 * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001209 *
1210 * This is meant to be called in the specific scenario where for locking reasons
1211 * we try to access user memory in atomic context (within a pagefault_disable()
1212 * section), this returns -EFAULT, and we want to resolve the user fault before
1213 * trying again.
1214 *
1215 * Typically this is meant to be used by the futex code.
1216 *
1217 * The main difference with get_user_pages() is that this function will
1218 * unconditionally call handle_mm_fault() which will in turn perform all the
1219 * necessary SW fixup of the dirty and young bits in the PTE, while
Dominik Dingel4a9e1cd2016-01-15 16:57:04 -08001220 * get_user_pages() only guarantees to update these in the struct page.
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001221 *
1222 * This is important for some architectures where those bits also gate the
1223 * access permission to the page because they are maintained in software. On
1224 * such architectures, gup() will not be enough to make a subsequent access
1225 * succeed.
1226 *
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07001227 * This function will not return with an unlocked mmap_lock. So it has not the
1228 * same semantics wrt the @mm->mmap_lock as does filemap_fault().
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001229 */
Peter Xu64019a22020-08-11 18:39:01 -07001230int fixup_user_fault(struct mm_struct *mm,
Dominik Dingel4a9e1cd2016-01-15 16:57:04 -08001231 unsigned long address, unsigned int fault_flags,
1232 bool *unlocked)
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001233{
1234 struct vm_area_struct *vma;
Souptick Joarder2b740302018-08-23 17:01:36 -07001235 vm_fault_t ret, major = 0;
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001236
Andrey Konovalovf9652592019-09-25 16:48:34 -07001237 address = untagged_addr(address);
1238
Dominik Dingel4a9e1cd2016-01-15 16:57:04 -08001239 if (unlocked)
Peter Xu71335f32020-04-01 21:08:53 -07001240 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
Dominik Dingel4a9e1cd2016-01-15 16:57:04 -08001241
1242retry:
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001243 vma = find_extend_vma(mm, address);
1244 if (!vma || address < vma->vm_start)
1245 return -EFAULT;
1246
Dave Hansend4925e02016-02-12 13:02:16 -08001247 if (!vma_permits_fault(vma, fault_flags))
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001248 return -EFAULT;
1249
Peter Xu475f4dfc2020-05-13 17:50:41 -07001250 if ((fault_flags & FAULT_FLAG_KILLABLE) &&
1251 fatal_signal_pending(current))
1252 return -EINTR;
1253
Peter Xubce617e2020-08-11 18:37:44 -07001254 ret = handle_mm_fault(vma, address, fault_flags, NULL);
Dominik Dingel4a9e1cd2016-01-15 16:57:04 -08001255 major |= ret & VM_FAULT_MAJOR;
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001256 if (ret & VM_FAULT_ERROR) {
James Morse9a291a72017-06-02 14:46:46 -07001257 int err = vm_fault_to_errno(ret, 0);
1258
1259 if (err)
1260 return err;
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001261 BUG();
1262 }
Dominik Dingel4a9e1cd2016-01-15 16:57:04 -08001263
1264 if (ret & VM_FAULT_RETRY) {
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001265 mmap_read_lock(mm);
Peter Xu475f4dfc2020-05-13 17:50:41 -07001266 *unlocked = true;
1267 fault_flags |= FAULT_FLAG_TRIED;
1268 goto retry;
Dominik Dingel4a9e1cd2016-01-15 16:57:04 -08001269 }
1270
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001271 return 0;
1272}
Paolo Bonziniadd6a0c2016-06-07 17:51:18 +02001273EXPORT_SYMBOL_GPL(fixup_user_fault);
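/*
 * Example (illustrative sketch only, not built): the typical caller pattern
 * for fixup_user_fault() described above. A futex-style user first fails a
 * pagefault_disable()d access with -EFAULT, then takes the mmap_lock and
 * resolves the fault before retrying the atomic access. The helper name
 * below is hypothetical.
 */
#if 0
static int example_fault_in_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	/* unlocked == NULL, so FAULT_FLAG_ALLOW_RETRY must not be set. */
	ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE,
			       NULL);
	mmap_read_unlock(mm);

	return ret;
}
#endif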
Kirill A. Shutemov4bbd4c72014-06-04 16:08:10 -07001274
Michal Hocko2d3a36a2020-06-03 16:03:25 -07001275/*
 1276 * Please note that this function, unlike __get_user_pages(), will not
 1277 * return 0 for nr_pages > 0 unless FOLL_NOWAIT is used.
1278 */
Peter Xu64019a22020-08-11 18:39:01 -07001279static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001280 unsigned long start,
1281 unsigned long nr_pages,
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001282 struct page **pages,
1283 struct vm_area_struct **vmas,
Al Viroe7167122017-11-19 11:32:05 -05001284 int *locked,
Andrea Arcangeli0fd71a52015-02-11 15:27:20 -08001285 unsigned int flags)
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001286{
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001287 long ret, pages_done;
1288 bool lock_dropped;
1289
1290 if (locked) {
1291 /* if VM_FAULT_RETRY can be returned, vmas become invalid */
1292 BUG_ON(vmas);
1293 /* check caller initialized locked */
1294 BUG_ON(*locked != 1);
1295 }
1296
Peter Xu008cfe42020-09-25 18:25:57 -04001297 if (flags & FOLL_PIN)
Jason A. Donenfelda4d63c32020-09-28 12:35:07 +02001298 atomic_set(&mm->has_pinned, 1);
Peter Xu008cfe42020-09-25 18:25:57 -04001299
John Hubbardeddb1c22020-01-30 22:12:54 -08001300 /*
1301 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
1302 * is to set FOLL_GET if the caller wants pages[] filled in (but has
1303 * carelessly failed to specify FOLL_GET), so keep doing that, but only
1304 * for FOLL_GET, not for the newer FOLL_PIN.
1305 *
1306 * FOLL_PIN always expects pages to be non-null, but no need to assert
1307 * that here, as any failures will be obvious enough.
1308 */
1309 if (pages && !(flags & FOLL_PIN))
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001310 flags |= FOLL_GET;
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001311
1312 pages_done = 0;
1313 lock_dropped = false;
1314 for (;;) {
Peter Xu64019a22020-08-11 18:39:01 -07001315 ret = __get_user_pages(mm, start, nr_pages, flags, pages,
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001316 vmas, locked);
1317 if (!locked)
1318 /* VM_FAULT_RETRY couldn't trigger, bypass */
1319 return ret;
1320
1321 /* VM_FAULT_RETRY cannot return errors */
1322 if (!*locked) {
1323 BUG_ON(ret < 0);
1324 BUG_ON(ret >= nr_pages);
1325 }
1326
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001327 if (ret > 0) {
1328 nr_pages -= ret;
1329 pages_done += ret;
1330 if (!nr_pages)
1331 break;
1332 }
1333 if (*locked) {
Andrea Arcangeli96312e62018-03-09 15:51:06 -08001334 /*
 1335 * VM_FAULT_RETRY didn't trigger, or this was a
 1336 * FOLL_NOWAIT request.
1337 */
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001338 if (!pages_done)
1339 pages_done = ret;
1340 break;
1341 }
Mike Rapoportdf172772019-05-31 22:30:33 -07001342 /*
1343 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
1344 * For the prefault case (!pages) we only update counts.
1345 */
1346 if (likely(pages))
1347 pages += ret;
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001348 start += ret << PAGE_SHIFT;
Peter Xu4426e942020-04-01 21:08:49 -07001349 lock_dropped = true;
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001350
Peter Xu4426e942020-04-01 21:08:49 -07001351retry:
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001352 /*
1353 * Repeat on the address that fired VM_FAULT_RETRY
Peter Xu4426e942020-04-01 21:08:49 -07001354 * with both FAULT_FLAG_ALLOW_RETRY and
1355 * FAULT_FLAG_TRIED. Note that GUP can be interrupted
1356 * by fatal signals, so we need to check it before we
 1357 * start trying again, otherwise it can loop forever.
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001358 */
Peter Xu4426e942020-04-01 21:08:49 -07001359
Hillf Dantonae46d2a2020-04-08 11:59:24 -04001360 if (fatal_signal_pending(current)) {
1361 if (!pages_done)
1362 pages_done = -EINTR;
Peter Xu4426e942020-04-01 21:08:49 -07001363 break;
Hillf Dantonae46d2a2020-04-08 11:59:24 -04001364 }
Peter Xu4426e942020-04-01 21:08:49 -07001365
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001366 ret = mmap_read_lock_killable(mm);
Peter Xu71335f32020-04-01 21:08:53 -07001367 if (ret) {
1368 BUG_ON(ret > 0);
1369 if (!pages_done)
1370 pages_done = ret;
1371 break;
1372 }
Peter Xu4426e942020-04-01 21:08:49 -07001373
Peter Xuc7b6a562020-04-07 21:40:10 -04001374 *locked = 1;
Peter Xu64019a22020-08-11 18:39:01 -07001375 ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
Peter Xu4426e942020-04-01 21:08:49 -07001376 pages, NULL, locked);
1377 if (!*locked) {
 1378 /* Continue to retry until we succeed */
1379 BUG_ON(ret != 0);
1380 goto retry;
1381 }
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001382 if (ret != 1) {
1383 BUG_ON(ret > 1);
1384 if (!pages_done)
1385 pages_done = ret;
1386 break;
1387 }
1388 nr_pages--;
1389 pages_done++;
1390 if (!nr_pages)
1391 break;
Mike Rapoportdf172772019-05-31 22:30:33 -07001392 if (likely(pages))
1393 pages++;
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001394 start += PAGE_SIZE;
1395 }
Al Viroe7167122017-11-19 11:32:05 -05001396 if (lock_dropped && *locked) {
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001397 /*
1398 * We must let the caller know we temporarily dropped the lock
1399 * and so the critical section protected by it was lost.
1400 */
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001401 mmap_read_unlock(mm);
Andrea Arcangelif0818f42015-02-11 15:27:17 -08001402 *locked = 0;
1403 }
1404 return pages_done;
1405}
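/*
 * Example (illustrative sketch only, not built): the reference type handed
 * out above must be matched on release -- put_page() for FOLL_GET
 * references, unpin_user_page()/unpin_user_pages() for FOLL_PIN references,
 * exactly as the CMA migration path below does. The helper name is
 * hypothetical.
 */
#if 0
static void example_release_user_pages(struct page **pages, long nr,
				       unsigned int gup_flags)
{
	long i;

	if (gup_flags & FOLL_PIN) {
		unpin_user_pages(pages, nr);
		return;
	}

	for (i = 0; i < nr; i++)
		put_page(pages[i]);
}
#endif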
1406
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001407/**
1408 * populate_vma_page_range() - populate a range of pages in the vma.
1409 * @vma: target vma
1410 * @start: start address
1411 * @end: end address
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07001412 * @locked: whether the mmap_lock is still held
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001413 *
1414 * This takes care of mlocking the pages too if VM_LOCKED is set.
1415 *
Tang Yizhou0a36f7f2020-08-06 23:20:01 -07001416 * Return either the number of pages pinned in the vma, or a negative error
1417 * code on error.
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001418 *
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07001419 * vma->vm_mm->mmap_lock must be held.
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001420 *
Peter Xu4f6da932020-04-01 21:07:58 -07001421 * If @locked is NULL, it may be held for read or write and will
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001422 * be unperturbed.
1423 *
Peter Xu4f6da932020-04-01 21:07:58 -07001424 * If @locked is non-NULL, it must be held for read only and may be
1425 * released. If it's released, *@locked will be set to 0.
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001426 */
1427long populate_vma_page_range(struct vm_area_struct *vma,
Peter Xu4f6da932020-04-01 21:07:58 -07001428 unsigned long start, unsigned long end, int *locked)
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001429{
1430 struct mm_struct *mm = vma->vm_mm;
1431 unsigned long nr_pages = (end - start) / PAGE_SIZE;
1432 int gup_flags;
1433
1434 VM_BUG_ON(start & ~PAGE_MASK);
1435 VM_BUG_ON(end & ~PAGE_MASK);
1436 VM_BUG_ON_VMA(start < vma->vm_start, vma);
1437 VM_BUG_ON_VMA(end > vma->vm_end, vma);
Michel Lespinasse42fc5412020-06-08 21:33:44 -07001438 mmap_assert_locked(mm);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001439
1440 gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
1441 if (vma->vm_flags & VM_LOCKONFAULT)
1442 gup_flags &= ~FOLL_POPULATE;
1443 /*
1444 * We want to touch writable mappings with a write fault in order
1445 * to break COW, except for shared mappings because these don't COW
1446 * and we would not want to dirty them for nothing.
1447 */
1448 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
1449 gup_flags |= FOLL_WRITE;
1450
1451 /*
1452 * We want mlock to succeed for regions that have any permissions
1453 * other than PROT_NONE.
1454 */
Anshuman Khandual3122e802020-04-06 20:03:47 -07001455 if (vma_is_accessible(vma))
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001456 gup_flags |= FOLL_FORCE;
1457
1458 /*
1459 * We made sure addr is within a VMA, so the following will
1460 * not result in a stack expansion that recurses back here.
1461 */
Peter Xu64019a22020-08-11 18:39:01 -07001462 return __get_user_pages(mm, start, nr_pages, gup_flags,
Peter Xu4f6da932020-04-01 21:07:58 -07001463 NULL, NULL, locked);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001464}
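/*
 * Example (illustrative sketch only, not built): a minimal caller of
 * populate_vma_page_range(). A vma's vm_start/vm_end are always page
 * aligned, and passing a NULL @locked means the mmap_lock (held by the
 * caller, as asserted) is never dropped. The helper name is hypothetical.
 */
#if 0
static long example_populate_whole_vma(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);
	return populate_vma_page_range(vma, vma->vm_start, vma->vm_end, NULL);
}
#endif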
1465
1466/*
1467 * __mm_populate - populate and/or mlock pages within a range of address space.
1468 *
1469 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
1470 * flags. VMAs must be already marked with the desired vm_flags, and
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07001471 * mmap_lock must not be held.
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001472 */
1473int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
1474{
1475 struct mm_struct *mm = current->mm;
1476 unsigned long end, nstart, nend;
1477 struct vm_area_struct *vma = NULL;
1478 int locked = 0;
1479 long ret = 0;
1480
1481 end = start + len;
1482
1483 for (nstart = start; nstart < end; nstart = nend) {
1484 /*
1485 * We want to fault in pages for [nstart; end) address range.
1486 * Find first corresponding VMA.
1487 */
1488 if (!locked) {
1489 locked = 1;
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001490 mmap_read_lock(mm);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001491 vma = find_vma(mm, nstart);
1492 } else if (nstart >= vma->vm_end)
1493 vma = vma->vm_next;
1494 if (!vma || vma->vm_start >= end)
1495 break;
1496 /*
1497 * Set [nstart; nend) to intersection of desired address
1498 * range with the first VMA. Also, skip undesirable VMA types.
1499 */
1500 nend = min(end, vma->vm_end);
1501 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1502 continue;
1503 if (nstart < vma->vm_start)
1504 nstart = vma->vm_start;
1505 /*
1506 * Now fault in a range of pages. populate_vma_page_range()
1507 * double checks the vma flags, so that it won't mlock pages
1508 * if the vma was already munlocked.
1509 */
1510 ret = populate_vma_page_range(vma, nstart, nend, &locked);
1511 if (ret < 0) {
1512 if (ignore_errors) {
1513 ret = 0;
1514 continue; /* continue at next VMA */
1515 }
1516 break;
1517 }
1518 nend = nstart + ret * PAGE_SIZE;
1519 ret = 0;
1520 }
1521 if (locked)
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001522 mmap_read_unlock(mm);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001523 return ret; /* 0 or negative error code */
1524}
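/*
 * Example (illustrative sketch only, not built): a MAP_POPULATE-style use of
 * __mm_populate(). The VMAs must already carry the desired vm_flags and the
 * mmap_lock must not be held; errors are ignored (ignore_errors == 1) to get
 * best-effort population. The wrapper name is hypothetical.
 */
#if 0
static void example_populate_range(unsigned long addr, unsigned long len)
{
	(void)__mm_populate(addr, len, 1);
}
#endif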
Christoph Hellwig050a9ad2019-07-11 20:57:21 -07001525#else /* CONFIG_MMU */
Peter Xu64019a22020-08-11 18:39:01 -07001526static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
Christoph Hellwig050a9ad2019-07-11 20:57:21 -07001527 unsigned long nr_pages, struct page **pages,
1528 struct vm_area_struct **vmas, int *locked,
1529 unsigned int foll_flags)
1530{
1531 struct vm_area_struct *vma;
1532 unsigned long vm_flags;
1533 int i;
1534
1535 /* calculate required read or write permissions.
1536 * If FOLL_FORCE is set, we only require the "MAY" flags.
1537 */
1538 vm_flags = (foll_flags & FOLL_WRITE) ?
1539 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1540 vm_flags &= (foll_flags & FOLL_FORCE) ?
1541 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1542
1543 for (i = 0; i < nr_pages; i++) {
1544 vma = find_vma(mm, start);
1545 if (!vma)
1546 goto finish_or_fault;
1547
1548 /* protect what we can, including chardevs */
1549 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1550 !(vm_flags & vma->vm_flags))
1551 goto finish_or_fault;
1552
1553 if (pages) {
1554 pages[i] = virt_to_page(start);
1555 if (pages[i])
1556 get_page(pages[i]);
1557 }
1558 if (vmas)
1559 vmas[i] = vma;
1560 start = (start + PAGE_SIZE) & PAGE_MASK;
1561 }
1562
1563 return i;
1564
1565finish_or_fault:
1566 return i ? : -EFAULT;
1567}
1568#endif /* !CONFIG_MMU */
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001569
Jann Horn8f942ee2020-10-15 20:12:40 -07001570/**
1571 * get_dump_page() - pin user page in memory while writing it to core dump
1572 * @addr: user address
1573 *
1574 * Returns struct page pointer of user page pinned for dump,
1575 * to be freed afterwards by put_page().
1576 *
1577 * Returns NULL on any kind of failure - a hole must then be inserted into
1578 * the corefile, to preserve alignment with its headers; and also returns
1579 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1580 * allowing a hole to be left in the corefile to save diskspace.
1581 *
Jann Horn7f3bfab2020-10-15 20:12:57 -07001582 * Called without mmap_lock (takes and releases the mmap_lock by itself).
Jann Horn8f942ee2020-10-15 20:12:40 -07001583 */
1584#ifdef CONFIG_ELF_CORE
1585struct page *get_dump_page(unsigned long addr)
1586{
Jann Horn7f3bfab2020-10-15 20:12:57 -07001587 struct mm_struct *mm = current->mm;
Jann Horn8f942ee2020-10-15 20:12:40 -07001588 struct page *page;
Jann Horn7f3bfab2020-10-15 20:12:57 -07001589 int locked = 1;
1590 int ret;
Jann Horn8f942ee2020-10-15 20:12:40 -07001591
Jann Horn7f3bfab2020-10-15 20:12:57 -07001592 if (mmap_read_lock_killable(mm))
Jann Horn8f942ee2020-10-15 20:12:40 -07001593 return NULL;
Jann Horn7f3bfab2020-10-15 20:12:57 -07001594 ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
1595 FOLL_FORCE | FOLL_DUMP | FOLL_GET);
1596 if (locked)
1597 mmap_read_unlock(mm);
1598 return (ret == 1) ? page : NULL;
Jann Horn8f942ee2020-10-15 20:12:40 -07001599}
1600#endif /* CONFIG_ELF_CORE */
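/*
 * Example (illustrative sketch only, not built): the coredump calling
 * convention described above. A NULL return means "emit a hole"; otherwise
 * the page is written out and released with put_page(). emit_hole() and
 * emit_page() are hypothetical placeholders for the corefile writer.
 */
#if 0
static int example_dump_one_page(unsigned long addr)
{
	struct page *page = get_dump_page(addr);
	int ret;

	if (!page)
		return emit_hole(PAGE_SIZE);

	ret = emit_page(page);
	put_page(page);
	return ret;
}
#endif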
1601
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001602#ifdef CONFIG_CMA
Peter Xu64019a22020-08-11 18:39:01 -07001603static long check_and_migrate_cma_pages(struct mm_struct *mm,
Ira Weiny932f4a62019-05-13 17:17:03 -07001604 unsigned long start,
1605 unsigned long nr_pages,
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001606 struct page **pages,
Ira Weiny932f4a62019-05-13 17:17:03 -07001607 struct vm_area_struct **vmas,
1608 unsigned int gup_flags)
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001609{
Pavel Tatashin673422b2021-05-04 18:38:49 -07001610 unsigned long i, isolation_error_count;
1611 bool drain_allow;
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001612 LIST_HEAD(cma_page_list);
zhong jiangb96cc652019-11-30 17:49:50 -08001613 long ret = nr_pages;
Pavel Tatashin7df511e2021-05-04 18:38:42 -07001614 struct page *prev_head, *head;
Joonsoo Kimed03d922020-08-11 18:37:41 -07001615 struct migration_target_control mtc = {
1616 .nid = NUMA_NO_NODE,
1617 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
1618 };
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001619
1620check_again:
Pavel Tatashin7df511e2021-05-04 18:38:42 -07001621 prev_head = NULL;
Pavel Tatashin673422b2021-05-04 18:38:49 -07001622 isolation_error_count = 0;
1623 drain_allow = true;
Pavel Tatashin7df511e2021-05-04 18:38:42 -07001624 for (i = 0; i < nr_pages; i++) {
1625 head = compound_head(pages[i]);
1626 if (head == prev_head)
1627 continue;
1628 prev_head = head;
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001629 /*
1630 * If we get a page from the CMA zone, since we are going to
1631 * be pinning these entries, we might as well move them out
1632 * of the CMA zone if possible.
1633 */
Pingfan Liuaa712392019-07-11 20:57:39 -07001634 if (is_migrate_cma_page(head)) {
Pavel Tatashin673422b2021-05-04 18:38:49 -07001635 if (PageHuge(head)) {
1636 if (!isolate_huge_page(head, &cma_page_list))
1637 isolation_error_count++;
1638 } else {
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001639 if (!PageLRU(head) && drain_allow) {
1640 lru_add_drain_all();
1641 drain_allow = false;
1642 }
1643
Pavel Tatashin673422b2021-05-04 18:38:49 -07001644 if (isolate_lru_page(head)) {
1645 isolation_error_count++;
1646 continue;
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001647 }
Pavel Tatashin673422b2021-05-04 18:38:49 -07001648 list_add_tail(&head->lru, &cma_page_list);
1649 mod_node_page_state(page_pgdat(head),
1650 NR_ISOLATED_ANON +
1651 page_is_file_lru(head),
1652 thp_nr_pages(head));
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001653 }
1654 }
1655 }
1656
Pavel Tatashin673422b2021-05-04 18:38:49 -07001657 /*
 1658 * If the list is empty and there were no isolation errors, all pages
 1659 * are already in the correct zone.
1660 */
1661 if (list_empty(&cma_page_list) && !isolation_error_count)
1662 return ret;
1663
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001664 if (!list_empty(&cma_page_list)) {
1665 /*
1666 * drop the above get_user_pages reference.
1667 */
Jason Gunthorpe96e1fac2020-11-13 22:51:56 -08001668 if (gup_flags & FOLL_PIN)
1669 unpin_user_pages(pages, nr_pages);
1670 else
1671 for (i = 0; i < nr_pages; i++)
1672 put_page(pages[i]);
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001673
Pavel Tatashin096c9482021-05-04 18:38:46 -07001674 ret = migrate_pages(&cma_page_list, alloc_migration_target,
1675 NULL, (unsigned long)&mtc, MIGRATE_SYNC,
1676 MR_CONTIG_RANGE);
1677 if (ret) {
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001678 if (!list_empty(&cma_page_list))
1679 putback_movable_pages(&cma_page_list);
Pavel Tatashin096c9482021-05-04 18:38:46 -07001680 return ret > 0 ? -ENOMEM : ret;
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001681 }
Ira Weiny932f4a62019-05-13 17:17:03 -07001682
Pavel Tatashin673422b2021-05-04 18:38:49 -07001683 /* We unpinned pages before migration, pin them again */
1684 ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
1685 NULL, gup_flags);
1686 if (ret <= 0)
1687 return ret;
1688 nr_pages = ret;
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001689 }
1690
Pavel Tatashin673422b2021-05-04 18:38:49 -07001691 /*
 1692 * Check again, because pages were unpinned and we also might have
 1693 * had isolation errors, so more pages may still need to be migrated.
1694 */
1695 goto check_again;
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001696}
1697#else
Peter Xu64019a22020-08-11 18:39:01 -07001698static long check_and_migrate_cma_pages(struct mm_struct *mm,
Ira Weiny932f4a62019-05-13 17:17:03 -07001699 unsigned long start,
1700 unsigned long nr_pages,
1701 struct page **pages,
1702 struct vm_area_struct **vmas,
1703 unsigned int gup_flags)
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001704{
1705 return nr_pages;
1706}
Christoph Hellwig050a9ad2019-07-11 20:57:21 -07001707#endif /* CONFIG_CMA */
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001708
Dan Williams2bb6d282017-11-29 16:10:35 -08001709/*
Ira Weiny932f4a62019-05-13 17:17:03 -07001710 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
1711 * allows us to process the FOLL_LONGTERM flag.
Dan Williams2bb6d282017-11-29 16:10:35 -08001712 */
Peter Xu64019a22020-08-11 18:39:01 -07001713static long __gup_longterm_locked(struct mm_struct *mm,
Ira Weiny932f4a62019-05-13 17:17:03 -07001714 unsigned long start,
1715 unsigned long nr_pages,
1716 struct page **pages,
1717 struct vm_area_struct **vmas,
1718 unsigned int gup_flags)
Dan Williams2bb6d282017-11-29 16:10:35 -08001719{
Ira Weiny932f4a62019-05-13 17:17:03 -07001720 unsigned long flags = 0;
Jason Gunthorpe78ea29e2020-12-14 19:05:48 -08001721 long rc;
Dan Williams2bb6d282017-11-29 16:10:35 -08001722
Jason Gunthorpe78ea29e2020-12-14 19:05:48 -08001723 if (gup_flags & FOLL_LONGTERM)
Ira Weiny932f4a62019-05-13 17:17:03 -07001724 flags = memalloc_nocma_save();
Dan Williams2bb6d282017-11-29 16:10:35 -08001725
Jason Gunthorpe78ea29e2020-12-14 19:05:48 -08001726 rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
1727 gup_flags);
Dan Williams2bb6d282017-11-29 16:10:35 -08001728
Ira Weiny932f4a62019-05-13 17:17:03 -07001729 if (gup_flags & FOLL_LONGTERM) {
Jason Gunthorpe78ea29e2020-12-14 19:05:48 -08001730 if (rc > 0)
1731 rc = check_and_migrate_cma_pages(mm, start, rc, pages,
1732 vmas, gup_flags);
Joonsoo Kim41b4dc12020-08-11 18:37:34 -07001733 memalloc_nocma_restore(flags);
Aneesh Kumar K.V9a4e9f32019-03-05 15:47:44 -08001734 }
Dan Williams2bb6d282017-11-29 16:10:35 -08001735 return rc;
1736}
Ira Weiny932f4a62019-05-13 17:17:03 -07001737
Barry Song447f3e42020-10-13 16:51:58 -07001738static bool is_valid_gup_flags(unsigned int gup_flags)
1739{
1740 /*
1741 * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
1742 * never directly by the caller, so enforce that with an assertion:
1743 */
1744 if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
1745 return false;
1746 /*
1747 * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
1748 * that is, FOLL_LONGTERM is a specific case, more restrictive case of
1749 * FOLL_PIN.
1750 */
1751 if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1752 return false;
1753
1754 return true;
1755}
1756
John Hubbard22bf29b2020-04-01 21:05:10 -07001757#ifdef CONFIG_MMU
Peter Xu64019a22020-08-11 18:39:01 -07001758static long __get_user_pages_remote(struct mm_struct *mm,
John Hubbard22bf29b2020-04-01 21:05:10 -07001759 unsigned long start, unsigned long nr_pages,
1760 unsigned int gup_flags, struct page **pages,
1761 struct vm_area_struct **vmas, int *locked)
1762{
1763 /*
1764 * Parts of FOLL_LONGTERM behavior are incompatible with
1765 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1766 * vmas. However, this only comes up if locked is set, and there are
1767 * callers that do request FOLL_LONGTERM, but do not set locked. So,
1768 * allow what we can.
1769 */
1770 if (gup_flags & FOLL_LONGTERM) {
1771 if (WARN_ON_ONCE(locked))
1772 return -EINVAL;
1773 /*
1774 * This will check the vmas (even if our vmas arg is NULL)
1775 * and return -ENOTSUPP if DAX isn't allowed in this case:
1776 */
Peter Xu64019a22020-08-11 18:39:01 -07001777 return __gup_longterm_locked(mm, start, nr_pages, pages,
John Hubbard22bf29b2020-04-01 21:05:10 -07001778 vmas, gup_flags | FOLL_TOUCH |
1779 FOLL_REMOTE);
1780 }
1781
Peter Xu64019a22020-08-11 18:39:01 -07001782 return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
John Hubbard22bf29b2020-04-01 21:05:10 -07001783 locked,
1784 gup_flags | FOLL_TOUCH | FOLL_REMOTE);
1785}
1786
Souptick Joarderadc8cb42020-06-01 21:48:24 -07001787/**
John Hubbardc4237f82020-01-30 22:12:36 -08001788 * get_user_pages_remote() - pin user pages in memory
John Hubbardc4237f82020-01-30 22:12:36 -08001789 * @mm: mm_struct of target mm
1790 * @start: starting user address
1791 * @nr_pages: number of pages from start to pin
1792 * @gup_flags: flags modifying lookup behaviour
1793 * @pages: array that receives pointers to the pages pinned.
1794 * Should be at least nr_pages long. Or NULL, if caller
1795 * only intends to ensure the pages are faulted in.
1796 * @vmas: array of pointers to vmas corresponding to each page.
1797 * Or NULL if the caller does not require them.
1798 * @locked: pointer to lock flag indicating whether lock is held and
1799 * subsequently whether VM_FAULT_RETRY functionality can be
1800 * utilised. Lock must initially be held.
1801 *
1802 * Returns either number of pages pinned (which may be less than the
1803 * number requested), or an error. Details about the return value:
1804 *
1805 * -- If nr_pages is 0, returns 0.
1806 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
1807 * -- If nr_pages is >0, and some pages were pinned, returns the number of
1808 * pages pinned. Again, this may be less than nr_pages.
1809 *
1810 * The caller is responsible for releasing returned @pages, via put_page().
1811 *
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07001812 * @vmas are valid only as long as mmap_lock is held.
John Hubbardc4237f82020-01-30 22:12:36 -08001813 *
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07001814 * Must be called with mmap_lock held for read or write.
John Hubbardc4237f82020-01-30 22:12:36 -08001815 *
Souptick Joarderadc8cb42020-06-01 21:48:24 -07001816 * get_user_pages_remote walks a process's page tables and takes a reference
1817 * to each struct page that each user address corresponds to at a given
John Hubbardc4237f82020-01-30 22:12:36 -08001818 * instant. That is, it takes the page that would be accessed if a user
1819 * thread accesses the given user virtual address at that instant.
1820 *
1821 * This does not guarantee that the page exists in the user mappings when
Souptick Joarderadc8cb42020-06-01 21:48:24 -07001822 * get_user_pages_remote returns, and there may even be a completely different
John Hubbardc4237f82020-01-30 22:12:36 -08001823 * page there in some cases (e.g. if mmapped pagecache has been invalidated
 1824 * and subsequently re-faulted). However, it does guarantee that the page
1825 * won't be freed completely. And mostly callers simply care that the page
1826 * contains data that was valid *at some point in time*. Typically, an IO
1827 * or similar operation cannot guarantee anything stronger anyway because
1828 * locks can't be held over the syscall boundary.
1829 *
1830 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
1831 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
1832 * be called after the page is finished with, and before put_page is called.
1833 *
Souptick Joarderadc8cb42020-06-01 21:48:24 -07001834 * get_user_pages_remote is typically used for fewer-copy IO operations,
1835 * to get a handle on the memory by some means other than accesses
1836 * via the user virtual addresses. The pages may be submitted for
1837 * DMA to devices or accessed via their kernel linear mapping (via the
1838 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
John Hubbardc4237f82020-01-30 22:12:36 -08001839 *
1840 * See also get_user_pages_fast, for performance critical applications.
1841 *
Souptick Joarderadc8cb42020-06-01 21:48:24 -07001842 * get_user_pages_remote should be phased out in favor of
John Hubbardc4237f82020-01-30 22:12:36 -08001843 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
Souptick Joarderadc8cb42020-06-01 21:48:24 -07001844 * should use get_user_pages_remote because it cannot pass
John Hubbardc4237f82020-01-30 22:12:36 -08001845 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
1846 */
Peter Xu64019a22020-08-11 18:39:01 -07001847long get_user_pages_remote(struct mm_struct *mm,
John Hubbardc4237f82020-01-30 22:12:36 -08001848 unsigned long start, unsigned long nr_pages,
1849 unsigned int gup_flags, struct page **pages,
1850 struct vm_area_struct **vmas, int *locked)
1851{
Barry Song447f3e42020-10-13 16:51:58 -07001852 if (!is_valid_gup_flags(gup_flags))
John Hubbardeddb1c22020-01-30 22:12:54 -08001853 return -EINVAL;
1854
Peter Xu64019a22020-08-11 18:39:01 -07001855 return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
John Hubbard22bf29b2020-04-01 21:05:10 -07001856 pages, vmas, locked);
John Hubbardc4237f82020-01-30 22:12:36 -08001857}
1858EXPORT_SYMBOL(get_user_pages_remote);
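/*
 * Example (illustrative sketch only, not built): a typical
 * get_user_pages_remote() caller following the rules above -- mmap_lock held
 * across the call, pages that were written to dirtied with
 * set_page_dirty_lock(), and every returned page released with put_page().
 * The helper name and single-page scope are hypothetical simplifications.
 */
#if 0
static long example_grab_remote_page(struct mm_struct *mm, unsigned long addr,
				     bool write)
{
	struct page *page;
	long ret;

	mmap_read_lock(mm);
	ret = get_user_pages_remote(mm, addr & PAGE_MASK, 1,
				    write ? FOLL_WRITE : 0, &page, NULL, NULL);
	mmap_read_unlock(mm);
	if (ret <= 0)
		return ret ? ret : -EFAULT;

	/* ... access the page contents via kmap()/kunmap() here ... */

	if (write)
		set_page_dirty_lock(page);
	put_page(page);
	return 0;
}
#endif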
1859
John Hubbardeddb1c22020-01-30 22:12:54 -08001860#else /* CONFIG_MMU */
Peter Xu64019a22020-08-11 18:39:01 -07001861long get_user_pages_remote(struct mm_struct *mm,
John Hubbardeddb1c22020-01-30 22:12:54 -08001862 unsigned long start, unsigned long nr_pages,
1863 unsigned int gup_flags, struct page **pages,
1864 struct vm_area_struct **vmas, int *locked)
1865{
1866 return 0;
1867}
John Hubbard3faa52c2020-04-01 21:05:29 -07001868
Peter Xu64019a22020-08-11 18:39:01 -07001869static long __get_user_pages_remote(struct mm_struct *mm,
John Hubbard3faa52c2020-04-01 21:05:29 -07001870 unsigned long start, unsigned long nr_pages,
1871 unsigned int gup_flags, struct page **pages,
1872 struct vm_area_struct **vmas, int *locked)
1873{
1874 return 0;
1875}
John Hubbardeddb1c22020-01-30 22:12:54 -08001876#endif /* !CONFIG_MMU */
1877
Souptick Joarderadc8cb42020-06-01 21:48:24 -07001878/**
1879 * get_user_pages() - pin user pages in memory
1880 * @start: starting user address
1881 * @nr_pages: number of pages from start to pin
1882 * @gup_flags: flags modifying lookup behaviour
1883 * @pages: array that receives pointers to the pages pinned.
1884 * Should be at least nr_pages long. Or NULL, if caller
1885 * only intends to ensure the pages are faulted in.
1886 * @vmas: array of pointers to vmas corresponding to each page.
1887 * Or NULL if the caller does not require them.
1888 *
Peter Xu64019a22020-08-11 18:39:01 -07001889 * This is the same as get_user_pages_remote(), just with a less-flexible
1890 * calling convention where we assume that the mm being operated on belongs to
1891 * the current task, and doesn't allow passing of a locked parameter. We also
1892 * obviously don't pass FOLL_REMOTE in here.
Ira Weiny932f4a62019-05-13 17:17:03 -07001893 */
1894long get_user_pages(unsigned long start, unsigned long nr_pages,
1895 unsigned int gup_flags, struct page **pages,
1896 struct vm_area_struct **vmas)
1897{
Barry Song447f3e42020-10-13 16:51:58 -07001898 if (!is_valid_gup_flags(gup_flags))
John Hubbardeddb1c22020-01-30 22:12:54 -08001899 return -EINVAL;
1900
Peter Xu64019a22020-08-11 18:39:01 -07001901 return __gup_longterm_locked(current->mm, start, nr_pages,
Ira Weiny932f4a62019-05-13 17:17:03 -07001902 pages, vmas, gup_flags | FOLL_TOUCH);
1903}
1904EXPORT_SYMBOL(get_user_pages);
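/*
 * Example (illustrative sketch only, not built): get_user_pages() may pin
 * fewer pages than requested, so only the pages actually returned are used
 * and released. The helper name is hypothetical.
 */
#if 0
static long example_pin_current_range(unsigned long start,
				      unsigned long nr_pages,
				      struct page **pages)
{
	long pinned, i;

	mmap_read_lock(current->mm);
	pinned = get_user_pages(start, nr_pages, FOLL_WRITE, pages, NULL);
	mmap_read_unlock(current->mm);
	if (pinned <= 0)
		return pinned;

	/* ... use pages[0..pinned-1] ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	return pinned;
}
#endif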
Dan Williams2bb6d282017-11-29 16:10:35 -08001905
Souptick Joarderadc8cb42020-06-01 21:48:24 -07001906/**
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001907 * get_user_pages_locked() is suitable to replace the form:
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001908 *
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07001909 * mmap_read_lock(mm);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001910 * do_something()
Peter Xu64019a22020-08-11 18:39:01 -07001911 * get_user_pages(mm, ..., pages, NULL);
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07001912 * mmap_read_unlock(mm);
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001913 *
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001914 * to:
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001915 *
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001916 * int locked = 1;
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07001917 * mmap_read_lock(mm);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001918 * do_something()
Peter Xu64019a22020-08-11 18:39:01 -07001919 * get_user_pages_locked(mm, ..., pages, &locked);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001920 * if (locked)
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07001921 * mmap_read_unlock(mm);
Souptick Joarderadc8cb42020-06-01 21:48:24 -07001922 *
1923 * @start: starting user address
1924 * @nr_pages: number of pages from start to pin
1925 * @gup_flags: flags modifying lookup behaviour
1926 * @pages: array that receives pointers to the pages pinned.
1927 * Should be at least nr_pages long. Or NULL, if caller
1928 * only intends to ensure the pages are faulted in.
1929 * @locked: pointer to lock flag indicating whether lock is held and
1930 * subsequently whether VM_FAULT_RETRY functionality can be
1931 * utilised. Lock must initially be held.
1932 *
1933 * We can leverage the VM_FAULT_RETRY functionality in the page fault
1934 * paths better by using either get_user_pages_locked() or
1935 * get_user_pages_unlocked().
1936 *
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001937 */
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001938long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1939 unsigned int gup_flags, struct page **pages,
1940 int *locked)
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001941{
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001942 /*
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001943 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1944 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1945 * vmas. As there are no users of this flag in this call we simply
1946 * disallow this option for now.
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001947 */
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001948 if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1949 return -EINVAL;
John Hubbard420c2092020-06-07 21:41:02 -07001950 /*
1951 * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
1952 * never directly by the caller, so enforce that:
1953 */
1954 if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
1955 return -EINVAL;
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001956
Peter Xu64019a22020-08-11 18:39:01 -07001957 return __get_user_pages_locked(current->mm, start, nr_pages,
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001958 pages, NULL, locked,
1959 gup_flags | FOLL_TOUCH);
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001960}
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001961EXPORT_SYMBOL(get_user_pages_locked);
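/*
 * Example (illustrative sketch only, not built): the replacement pattern
 * from the comment above written out as a helper. If the fault path had to
 * drop the mmap_lock, *locked is cleared and the caller must not unlock
 * again. The helper name is hypothetical.
 */
#if 0
static long example_gup_locked(unsigned long start, unsigned long nr_pages,
			       struct page **pages)
{
	int locked = 1;
	long ret;

	mmap_read_lock(current->mm);
	/* ... do_something() that needs the mmap_lock ... */
	ret = get_user_pages_locked(start, nr_pages, FOLL_WRITE, pages,
				    &locked);
	if (locked)
		mmap_read_unlock(current->mm);
	return ret;
}
#endif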
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001962
1963/*
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001964 * get_user_pages_unlocked() is suitable to replace the form:
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001965 *
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07001966 * mmap_read_lock(mm);
Peter Xu64019a22020-08-11 18:39:01 -07001967 * get_user_pages(mm, ..., pages, NULL);
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07001968 * mmap_read_unlock(mm);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001969 *
1970 * with:
1971 *
Peter Xu64019a22020-08-11 18:39:01 -07001972 * get_user_pages_unlocked(mm, ..., pages);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001973 *
1974 * It is functionally equivalent to get_user_pages_fast so
1975 * get_user_pages_fast should be used instead if specific gup_flags
1976 * (e.g. FOLL_FORCE) are not required.
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001977 */
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001978long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1979 struct page **pages, unsigned int gup_flags)
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001980{
1981 struct mm_struct *mm = current->mm;
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001982 int locked = 1;
1983 long ret;
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001984
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001985 /*
1986 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
1987 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
1988 * vmas. As there are no users of this flag in this call we simply
1989 * disallow this option for now.
1990 */
1991 if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
1992 return -EINVAL;
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001993
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001994 mmap_read_lock(mm);
Peter Xu64019a22020-08-11 18:39:01 -07001995 ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001996 &locked, gup_flags | FOLL_TOUCH);
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07001997 if (locked)
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001998 mmap_read_unlock(mm);
Christoph Hellwigd3649f62019-07-11 20:57:18 -07001999 return ret;
Kirill A. Shutemovacc3c8d2015-04-14 15:44:45 -07002000}
Christoph Hellwigd3649f62019-07-11 20:57:18 -07002001EXPORT_SYMBOL(get_user_pages_unlocked);
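/*
 * Example (illustrative sketch only, not built): get_user_pages_unlocked()
 * takes and drops the mmap_lock by itself, so the caller holds no mmap_lock
 * around the call. FOLL_FORCE is an example of the "specific gup_flags" for
 * which this helper, rather than get_user_pages_fast(), is appropriate. The
 * helper name is hypothetical.
 */
#if 0
static long example_gup_unlocked(unsigned long start, unsigned long nr_pages,
				 struct page **pages)
{
	return get_user_pages_unlocked(start, nr_pages, pages,
				       FOLL_WRITE | FOLL_FORCE);
}
#endif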
Steve Capper2667f502014-10-09 15:29:14 -07002002
2003/*
Christoph Hellwig67a929e2019-07-11 20:57:14 -07002004 * Fast GUP
Steve Capper2667f502014-10-09 15:29:14 -07002005 *
2006 * get_user_pages_fast attempts to pin user pages by walking the page
2007 * tables directly and avoids taking locks. Thus the walker needs to be
2008 * protected from page table pages being freed from under it, and should
2009 * block any THP splits.
2010 *
2011 * One way to achieve this is to have the walker disable interrupts, and
2012 * rely on IPIs from the TLB flushing code blocking before the page table
2013 * pages are freed. This is unsuitable for architectures that do not need
2014 * to broadcast an IPI when invalidating TLBs.
2015 *
 2016 * Another way to achieve this is to batch up the page-table-containing pages
2017 * belonging to more than one mm_user, then rcu_sched a callback to free those
2018 * pages. Disabling interrupts will allow the fast_gup walker to both block
2019 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
2020 * (which is a relatively rare event). The code below adopts this strategy.
2021 *
2022 * Before activating this code, please be aware that the following assumptions
2023 * are currently made:
2024 *
Peter Zijlstraff2e6d722020-02-03 17:37:02 -08002025 * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
Kirill A. Shutemove5855132017-06-06 14:31:20 +03002026 * free pages containing page tables or TLB flushing requires IPI broadcast.
Steve Capper2667f502014-10-09 15:29:14 -07002027 *
Steve Capper2667f502014-10-09 15:29:14 -07002028 * *) ptes can be read atomically by the architecture.
2029 *
2030 * *) access_ok is sufficient to validate userspace address ranges.
2031 *
2032 * The last two assumptions can be relaxed by the addition of helper functions.
2033 *
2034 * This code is based heavily on the PowerPC implementation by Nick Piggin.
2035 */
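/*
 * Example (illustrative sketch only, not built): the lockless walk below is
 * reached through get_user_pages_fast(), which needs no mmap_lock from the
 * caller. The signature used here is the one declared in linux/mm.h; the
 * wrapper name is hypothetical.
 */
#if 0
static int example_gup_fast(unsigned long start, int nr_pages,
			    struct page **pages)
{
	int pinned = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);

	if (pinned < 0)
		return pinned;

	/* ... use and then release each pinned page ... */
	while (pinned)
		put_page(pages[--pinned]);
	return 0;
}
#endif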
Christoph Hellwig67a929e2019-07-11 20:57:14 -07002036#ifdef CONFIG_HAVE_FAST_GUP
Christoph Hellwig39656e82019-07-11 20:56:49 -07002037#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
John Hubbard3faa52c2020-04-01 21:05:29 -07002038
Kirill A. Shutemov0005d202017-03-16 18:26:51 +03002039/*
Christoph Hellwig39656e82019-07-11 20:56:49 -07002040 * WARNING: only to be used in the get_user_pages_fast() implementation.
2041 *
2042 * With get_user_pages_fast(), we walk down the pagetables without taking any
2043 * locks. For this we would like to load the pointers atomically, but sometimes
2044 * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
2045 * we do have is the guarantee that a PTE will only either go from not present
2046 * to present, or present to not present or both -- it will not switch to a
2047 * completely different present page without a TLB flush in between; something
2048 * that we are blocking by holding interrupts off.
2049 *
2050 * Setting ptes from not present to present goes:
2051 *
2052 * ptep->pte_high = h;
2053 * smp_wmb();
2054 * ptep->pte_low = l;
2055 *
2056 * And present to not present goes:
2057 *
2058 * ptep->pte_low = 0;
2059 * smp_wmb();
2060 * ptep->pte_high = 0;
2061 *
2062 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
2063 * We load pte_high *after* loading pte_low, which ensures we don't see an older
2064 * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
2065 * picked up a changed pte high. We might have gotten rubbish values from
2066 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
2067 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
2068 * operates on present ptes we're safe.
2069 */
2070static inline pte_t gup_get_pte(pte_t *ptep)
2071{
2072 pte_t pte;
2073
2074 do {
2075 pte.pte_low = ptep->pte_low;
2076 smp_rmb();
2077 pte.pte_high = ptep->pte_high;
2078 smp_rmb();
2079 } while (unlikely(pte.pte_low != ptep->pte_low));
2080
2081 return pte;
2082}
2083#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
2084/*
2085 * We require that the PTE can be read atomically.
Kirill A. Shutemov0005d202017-03-16 18:26:51 +03002086 */
2087static inline pte_t gup_get_pte(pte_t *ptep)
2088{
Christophe Leroy481e9802020-06-15 12:57:58 +00002089 return ptep_get(ptep);
Kirill A. Shutemov0005d202017-03-16 18:26:51 +03002090}
Christoph Hellwig39656e82019-07-11 20:56:49 -07002091#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
Kirill A. Shutemov0005d202017-03-16 18:26:51 +03002092
Guenter Roeck790c7362019-07-11 20:57:46 -07002093static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
John Hubbard3b78d832020-04-01 21:05:22 -07002094 unsigned int flags,
Guenter Roeck790c7362019-07-11 20:57:46 -07002095 struct page **pages)
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002096{
2097 while ((*nr) - nr_start) {
2098 struct page *page = pages[--(*nr)];
2099
2100 ClearPageReferenced(page);
John Hubbard3faa52c2020-04-01 21:05:29 -07002101 if (flags & FOLL_PIN)
2102 unpin_user_page(page);
2103 else
2104 put_page(page);
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002105 }
2106}
2107
Laurent Dufour3010a5e2018-06-07 17:06:08 -07002108#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
Steve Capper2667f502014-10-09 15:29:14 -07002109static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
Ira Weinyb798bec2019-05-13 17:17:07 -07002110 unsigned int flags, struct page **pages, int *nr)
Steve Capper2667f502014-10-09 15:29:14 -07002111{
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002112 struct dev_pagemap *pgmap = NULL;
2113 int nr_start = *nr, ret = 0;
Steve Capper2667f502014-10-09 15:29:14 -07002114 pte_t *ptep, *ptem;
Steve Capper2667f502014-10-09 15:29:14 -07002115
2116 ptem = ptep = pte_offset_map(&pmd, addr);
2117 do {
Kirill A. Shutemov0005d202017-03-16 18:26:51 +03002118 pte_t pte = gup_get_pte(ptep);
Kirill A. Shutemov7aef4172016-01-15 16:52:32 -08002119 struct page *head, *page;
Steve Capper2667f502014-10-09 15:29:14 -07002120
2121 /*
2122 * Similar to the PMD case below, NUMA hinting must take slow
Mel Gorman8a0516e2015-02-12 14:58:22 -08002123 * path using the pte_protnone check.
Steve Capper2667f502014-10-09 15:29:14 -07002124 */
Kirill A. Shutemove7884f82017-03-16 18:26:50 +03002125 if (pte_protnone(pte))
2126 goto pte_unmap;
2127
Ira Weinyb798bec2019-05-13 17:17:07 -07002128 if (!pte_access_permitted(pte, flags & FOLL_WRITE))
Kirill A. Shutemove7884f82017-03-16 18:26:50 +03002129 goto pte_unmap;
2130
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002131 if (pte_devmap(pte)) {
Ira Weiny7af75562019-05-13 17:17:14 -07002132 if (unlikely(flags & FOLL_LONGTERM))
2133 goto pte_unmap;
2134
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002135 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
2136 if (unlikely(!pgmap)) {
John Hubbard3b78d832020-04-01 21:05:22 -07002137 undo_dev_pagemap(nr, nr_start, flags, pages);
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002138 goto pte_unmap;
2139 }
2140 } else if (pte_special(pte))
Steve Capper2667f502014-10-09 15:29:14 -07002141 goto pte_unmap;
2142
2143 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2144 page = pte_page(pte);
2145
John Hubbard3faa52c2020-04-01 21:05:29 -07002146 head = try_grab_compound_head(page, 1, flags);
Linus Torvalds8fde12c2019-04-11 10:49:19 -07002147 if (!head)
Steve Capper2667f502014-10-09 15:29:14 -07002148 goto pte_unmap;
2149
2150 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
John Hubbard3faa52c2020-04-01 21:05:29 -07002151 put_compound_head(head, 1, flags);
Steve Capper2667f502014-10-09 15:29:14 -07002152 goto pte_unmap;
2153 }
2154
Kirill A. Shutemov7aef4172016-01-15 16:52:32 -08002155 VM_BUG_ON_PAGE(compound_head(page) != head, page);
Kirill A. Shutemove9348052017-03-16 18:26:52 +03002156
Claudio Imbrendaf28d4362020-04-01 21:05:56 -07002157 /*
2158 * We need to make the page accessible if and only if we are
2159 * going to access its content (the FOLL_PIN case). Please
2160 * see Documentation/core-api/pin_user_pages.rst for
2161 * details.
2162 */
2163 if (flags & FOLL_PIN) {
2164 ret = arch_make_page_accessible(page);
2165 if (ret) {
2166 unpin_user_page(page);
2167 goto pte_unmap;
2168 }
2169 }
Kirill A. Shutemove9348052017-03-16 18:26:52 +03002170 SetPageReferenced(page);
Steve Capper2667f502014-10-09 15:29:14 -07002171 pages[*nr] = page;
2172 (*nr)++;
2173
2174 } while (ptep++, addr += PAGE_SIZE, addr != end);
2175
2176 ret = 1;
2177
2178pte_unmap:
Christoph Hellwig832d7aa2017-12-29 08:54:01 +01002179 if (pgmap)
2180 put_dev_pagemap(pgmap);
Steve Capper2667f502014-10-09 15:29:14 -07002181 pte_unmap(ptem);
2182 return ret;
2183}
2184#else
2185
2186/*
2187 * If we can't determine whether or not a pte is special, then fail immediately
2188 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
2189 * to be special.
2190 *
2191 * For a futex to be placed on a THP tail page, get_futex_key requires a
Souptick Joarderdadbb612020-06-07 21:40:55 -07002192 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
Steve Capper2667f502014-10-09 15:29:14 -07002193 * useful to have gup_huge_pmd even if we can't operate on ptes.
2194 */
2195static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
Ira Weinyb798bec2019-05-13 17:17:07 -07002196 unsigned int flags, struct page **pages, int *nr)
Steve Capper2667f502014-10-09 15:29:14 -07002197{
2198 return 0;
2199}
Laurent Dufour3010a5e2018-06-07 17:06:08 -07002200#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
Steve Capper2667f502014-10-09 15:29:14 -07002201
Robin Murphy17596732019-07-16 16:30:47 -07002202#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002203static int __gup_device_huge(unsigned long pfn, unsigned long addr,
John Hubbard86dfbed2020-04-01 21:05:14 -07002204 unsigned long end, unsigned int flags,
2205 struct page **pages, int *nr)
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002206{
2207 int nr_start = *nr;
2208 struct dev_pagemap *pgmap = NULL;
2209
2210 do {
2211 struct page *page = pfn_to_page(pfn);
2212
2213 pgmap = get_dev_pagemap(pfn, pgmap);
2214 if (unlikely(!pgmap)) {
John Hubbard3b78d832020-04-01 21:05:22 -07002215 undo_dev_pagemap(nr, nr_start, flags, pages);
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002216 return 0;
2217 }
2218 SetPageReferenced(page);
2219 pages[*nr] = page;
John Hubbard3faa52c2020-04-01 21:05:29 -07002220 if (unlikely(!try_grab_page(page, flags))) {
2221 undo_dev_pagemap(nr, nr_start, flags, pages);
2222 return 0;
2223 }
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002224 (*nr)++;
2225 pfn++;
2226 } while (addr += PAGE_SIZE, addr != end);
Christoph Hellwig832d7aa2017-12-29 08:54:01 +01002227
2228 if (pgmap)
2229 put_dev_pagemap(pgmap);
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002230 return 1;
2231}
2232
Dan Williamsa9b6de72018-04-19 21:32:19 -07002233static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
John Hubbard86dfbed2020-04-01 21:05:14 -07002234 unsigned long end, unsigned int flags,
2235 struct page **pages, int *nr)
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002236{
2237 unsigned long fault_pfn;
Dan Williamsa9b6de72018-04-19 21:32:19 -07002238 int nr_start = *nr;
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002239
Dan Williamsa9b6de72018-04-19 21:32:19 -07002240 fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
John Hubbard86dfbed2020-04-01 21:05:14 -07002241 if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
Dan Williamsa9b6de72018-04-19 21:32:19 -07002242 return 0;
2243
2244 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
John Hubbard3b78d832020-04-01 21:05:22 -07002245 undo_dev_pagemap(nr, nr_start, flags, pages);
Dan Williamsa9b6de72018-04-19 21:32:19 -07002246 return 0;
2247 }
2248 return 1;
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002249}
2250
Dan Williamsa9b6de72018-04-19 21:32:19 -07002251static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
John Hubbard86dfbed2020-04-01 21:05:14 -07002252 unsigned long end, unsigned int flags,
2253 struct page **pages, int *nr)
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002254{
2255 unsigned long fault_pfn;
Dan Williamsa9b6de72018-04-19 21:32:19 -07002256 int nr_start = *nr;
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002257
Dan Williamsa9b6de72018-04-19 21:32:19 -07002258 fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
John Hubbard86dfbed2020-04-01 21:05:14 -07002259 if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
Dan Williamsa9b6de72018-04-19 21:32:19 -07002260 return 0;
2261
2262 if (unlikely(pud_val(orig) != pud_val(*pudp))) {
John Hubbard3b78d832020-04-01 21:05:22 -07002263 undo_dev_pagemap(nr, nr_start, flags, pages);
Dan Williamsa9b6de72018-04-19 21:32:19 -07002264 return 0;
2265 }
2266 return 1;
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002267}
2268#else
Dan Williamsa9b6de72018-04-19 21:32:19 -07002269static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
John Hubbard86dfbed2020-04-01 21:05:14 -07002270 unsigned long end, unsigned int flags,
2271 struct page **pages, int *nr)
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002272{
2273 BUILD_BUG();
2274 return 0;
2275}
2276
Dan Williamsa9b6de72018-04-19 21:32:19 -07002277static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
John Hubbard86dfbed2020-04-01 21:05:14 -07002278 unsigned long end, unsigned int flags,
2279 struct page **pages, int *nr)
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002280{
2281 BUILD_BUG();
2282 return 0;
2283}
2284#endif
2285
John Hubbarda43e9822020-01-30 22:12:17 -08002286static int record_subpages(struct page *page, unsigned long addr,
2287 unsigned long end, struct page **pages)
2288{
2289 int nr;
2290
2291 for (nr = 0; addr != end; addr += PAGE_SIZE)
2292 pages[nr++] = page++;
2293
2294 return nr;
2295}
2296
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002297#ifdef CONFIG_ARCH_HAS_HUGEPD
2298static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
2299 unsigned long sz)
2300{
2301 unsigned long __boundary = (addr + sz) & ~(sz-1);
2302 return (__boundary - 1 < end - 1) ? __boundary : end;
2303}
2304
2305static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
John Hubbard0cd22af2019-10-18 20:19:53 -07002306 unsigned long end, unsigned int flags,
2307 struct page **pages, int *nr)
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002308{
2309 unsigned long pte_end;
2310 struct page *head, *page;
2311 pte_t pte;
2312 int refs;
2313
2314 pte_end = (addr + sz) & ~(sz-1);
2315 if (pte_end < end)
2316 end = pte_end;
2317
Christophe Leroy55ca2262020-06-15 12:57:57 +00002318 pte = huge_ptep_get(ptep);
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002319
John Hubbard0cd22af2019-10-18 20:19:53 -07002320 if (!pte_access_permitted(pte, flags & FOLL_WRITE))
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002321 return 0;
2322
2323 /* hugepages are never "special" */
2324 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
2325
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002326 head = pte_page(pte);
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002327 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
John Hubbarda43e9822020-01-30 22:12:17 -08002328 refs = record_subpages(page, addr, end, pages + *nr);
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002329
John Hubbard3faa52c2020-04-01 21:05:29 -07002330 head = try_grab_compound_head(head, refs, flags);
John Hubbarda43e9822020-01-30 22:12:17 -08002331 if (!head)
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002332 return 0;
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002333
2334 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
John Hubbard3b78d832020-04-01 21:05:22 -07002335 put_compound_head(head, refs, flags);
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002336 return 0;
2337 }
2338
John Hubbarda43e9822020-01-30 22:12:17 -08002339 *nr += refs;
Christoph Hellwig520b4a42019-07-11 20:57:36 -07002340 SetPageReferenced(head);
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002341 return 1;
2342}
2343
2344static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
John Hubbard0cd22af2019-10-18 20:19:53 -07002345 unsigned int pdshift, unsigned long end, unsigned int flags,
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002346 struct page **pages, int *nr)
2347{
2348 pte_t *ptep;
2349 unsigned long sz = 1UL << hugepd_shift(hugepd);
2350 unsigned long next;
2351
2352 ptep = hugepte_offset(hugepd, addr, pdshift);
2353 do {
2354 next = hugepte_addr_end(addr, end, sz);
John Hubbard0cd22af2019-10-18 20:19:53 -07002355 if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002356 return 0;
2357 } while (ptep++, addr = next, addr != end);
2358
2359 return 1;
2360}
2361#else
2362static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
John Hubbard0cd22af2019-10-18 20:19:53 -07002363 unsigned int pdshift, unsigned long end, unsigned int flags,
Christoph Hellwigcbd34da2019-07-11 20:57:28 -07002364 struct page **pages, int *nr)
2365{
2366 return 0;
2367}
2368#endif /* CONFIG_ARCH_HAS_HUGEPD */
2369
Steve Capper2667f502014-10-09 15:29:14 -07002370static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
John Hubbard0cd22af2019-10-18 20:19:53 -07002371 unsigned long end, unsigned int flags,
2372 struct page **pages, int *nr)
Steve Capper2667f502014-10-09 15:29:14 -07002373{
Kirill A. Shutemovddc58f22016-01-15 16:52:56 -08002374 struct page *head, *page;
Steve Capper2667f502014-10-09 15:29:14 -07002375 int refs;
2376
Ira Weinyb798bec2019-05-13 17:17:07 -07002377 if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
Steve Capper2667f502014-10-09 15:29:14 -07002378 return 0;
2379
Ira Weiny7af75562019-05-13 17:17:14 -07002380 if (pmd_devmap(orig)) {
2381 if (unlikely(flags & FOLL_LONGTERM))
2382 return 0;
John Hubbard86dfbed2020-04-01 21:05:14 -07002383 return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
2384 pages, nr);
Ira Weiny7af75562019-05-13 17:17:14 -07002385 }
Kirill A. Shutemovb59f65f2017-03-16 18:26:53 +03002386
Punit Agrawald63206e2017-07-06 15:39:39 -07002387 page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
John Hubbarda43e9822020-01-30 22:12:17 -08002388 refs = record_subpages(page, addr, end, pages + *nr);
Steve Capper2667f502014-10-09 15:29:14 -07002389
John Hubbard3faa52c2020-04-01 21:05:29 -07002390 head = try_grab_compound_head(pmd_page(orig), refs, flags);
John Hubbarda43e9822020-01-30 22:12:17 -08002391 if (!head)
Steve Capper2667f502014-10-09 15:29:14 -07002392 return 0;
Steve Capper2667f502014-10-09 15:29:14 -07002393
2394 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
John Hubbard3b78d832020-04-01 21:05:22 -07002395 put_compound_head(head, refs, flags);
Steve Capper2667f502014-10-09 15:29:14 -07002396 return 0;
2397 }
2398
John Hubbarda43e9822020-01-30 22:12:17 -08002399 *nr += refs;
Kirill A. Shutemove9348052017-03-16 18:26:52 +03002400 SetPageReferenced(head);
Steve Capper2667f502014-10-09 15:29:14 -07002401 return 1;
2402}
2403
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
			unsigned long end, unsigned int flags,
			struct page **pages, int *nr)
{
	struct page *head, *page;
	int refs;

	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pud_devmap(orig)) {
		if (unlikely(flags & FOLL_LONGTERM))
			return 0;
		return __gup_device_huge_pud(orig, pudp, addr, end, flags,
					     pages, nr);
	}

	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	refs = record_subpages(page, addr, end, pages + *nr);

	head = try_grab_compound_head(pud_page(orig), refs, flags);
	if (!head)
		return 0;

	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		put_compound_head(head, refs, flags);
		return 0;
	}

	*nr += refs;
	SetPageReferenced(head);
	return 1;
}

static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
			unsigned long end, unsigned int flags,
			struct page **pages, int *nr)
{
	int refs;
	struct page *head, *page;

	if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	BUILD_BUG_ON(pgd_devmap(orig));

	page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
	refs = record_subpages(page, addr, end, pages + *nr);

	head = try_grab_compound_head(pgd_page(orig), refs, flags);
	if (!head)
		return 0;

	if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
		put_compound_head(head, refs, flags);
		return 0;
	}

	*nr += refs;
	SetPageReferenced(head);
	return 1;
}

static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
			 unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset_lockless(pudp, pud, addr);
	do {
		pmd_t pmd = READ_ONCE(*pmdp);

		next = pmd_addr_end(addr, end);
		if (!pmd_present(pmd))
			return 0;

		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
			     pmd_devmap(pmd))) {
			/*
			 * NUMA hinting faults need to be handled in the GUP
			 * slowpath for accounting purposes and so that they
			 * can be serialised against THP migration.
			 */
			if (pmd_protnone(pmd))
				return 0;

			if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
					  pages, nr))
				return 0;

		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
			/*
			 * Architectures can use a different format for the
			 * hugetlbfs pmd than for the THP pmd.
			 */
			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
					 PMD_SHIFT, next, flags, pages, nr))
				return 0;
		} else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
			return 0;
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
			 unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset_lockless(p4dp, p4d, addr);
	do {
		pud_t pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);
		if (unlikely(!pud_present(pud)))
			return 0;
		if (unlikely(pud_huge(pud))) {
			if (!gup_huge_pud(pud, pudp, addr, next, flags,
					  pages, nr))
				return 0;
		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
					 PUD_SHIFT, next, flags, pages, nr))
				return 0;
		} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}

static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
			 unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	p4d_t *p4dp;

	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
	do {
		p4d_t p4d = READ_ONCE(*p4dp);

		next = p4d_addr_end(addr, end);
		if (p4d_none(p4d))
			return 0;
		BUILD_BUG_ON(p4d_huge(p4d));
		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
					 P4D_SHIFT, next, flags, pages, nr))
				return 0;
		} else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
			return 0;
	} while (p4dp++, addr = next, addr != end);

	return 1;
}

static void gup_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pgd_t *pgdp;

	pgdp = pgd_offset(current->mm, addr);
	do {
		pgd_t pgd = READ_ONCE(*pgdp);

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			return;
		if (unlikely(pgd_huge(pgd))) {
			if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
					  pages, nr))
				return;
		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
					 PGDIR_SHIFT, next, flags, pages, nr))
				return;
		} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
			return;
	} while (pgdp++, addr = next, addr != end);
}
#else
static inline void gup_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
}
#endif /* CONFIG_HAVE_FAST_GUP */

#ifndef gup_fast_permitted
/*
 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
 * we need to fall back to the slow version:
 */
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
	return true;
}
#endif
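
/*
 * Illustrative only (not part of this file): an architecture that needs to
 * restrict the fast-GUP range supplies its own gup_fast_permitted() in its
 * pgtable headers and #defines the symbol so the fallback above is skipped.
 * A minimal sketch, assuming the arch merely wants to reject ranges that
 * reach past the user address space (roughly what x86 does with
 * TASK_SIZE_MAX), might look like:
 *
 *	#define gup_fast_permitted gup_fast_permitted
 *	static inline bool gup_fast_permitted(unsigned long start,
 *					      unsigned long end)
 *	{
 *		return end <= TASK_SIZE_MAX;
 *	}
 */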

static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
				   unsigned int gup_flags, struct page **pages)
{
	int ret;

	/*
	 * FIXME: FOLL_LONGTERM does not work with
	 * get_user_pages_unlocked() (see comments in that function)
	 */
	if (gup_flags & FOLL_LONGTERM) {
		mmap_read_lock(current->mm);
		ret = __gup_longterm_locked(current->mm,
					    start, nr_pages,
					    pages, NULL, gup_flags);
		mmap_read_unlock(current->mm);
	} else {
		ret = get_user_pages_unlocked(start, nr_pages,
					      pages, gup_flags);
	}

	return ret;
}

static unsigned long lockless_pages_from_mm(unsigned long start,
					    unsigned long end,
					    unsigned int gup_flags,
					    struct page **pages)
{
	unsigned long flags;
	int nr_pinned = 0;
	unsigned seq;

	if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
	    !gup_fast_permitted(start, end))
		return 0;

	if (gup_flags & FOLL_PIN) {
		seq = raw_read_seqcount(&current->mm->write_protect_seq);
		if (seq & 1)
			return 0;
	}

	/*
	 * Disable interrupts. The nested form is used, in order to allow full,
	 * general purpose use of this routine.
	 *
	 * With interrupts disabled, we block page table pages from being freed
	 * from under us. See struct mmu_table_batch comments in
	 * include/asm-generic/tlb.h for more details.
	 *
	 * We do not adopt an rcu_read_lock() here as we also want to block IPIs
	 * that come from THPs splitting.
	 */
	local_irq_save(flags);
	gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
	local_irq_restore(flags);

	/*
	 * When pinning pages for DMA there could be a concurrent write protect
	 * from fork() via copy_page_range(), in this case always fail fast GUP.
	 */
	if (gup_flags & FOLL_PIN) {
		if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
			unpin_user_pages(pages, nr_pinned);
			return 0;
		}
	}
	return nr_pinned;
}

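/*
 * Common core of the *_fast() entry points below: validate the flag set,
 * attempt the IRQ-disabled lockless walk first, and then, unless
 * FOLL_FAST_ONLY was passed, fall back to the regular mmap_lock-taking
 * slow path for whatever remains unpinned.
 */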
static int internal_get_user_pages_fast(unsigned long start,
					unsigned long nr_pages,
					unsigned int gup_flags,
					struct page **pages)
{
	unsigned long len, end;
	unsigned long nr_pinned;
	int ret;

	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
				       FOLL_FAST_ONLY)))
		return -EINVAL;

	if (gup_flags & FOLL_PIN)
		atomic_set(&current->mm->has_pinned, 1);

	if (!(gup_flags & FOLL_FAST_ONLY))
		might_lock_read(&current->mm->mmap_lock);

	start = untagged_addr(start) & PAGE_MASK;
	len = nr_pages << PAGE_SHIFT;
	if (check_add_overflow(start, len, &end))
		return 0;
	if (unlikely(!access_ok((void __user *)start, len)))
		return -EFAULT;

	nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
	if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
		return nr_pinned;

	/* Slow path: try to get the remaining pages with get_user_pages */
	start += nr_pinned << PAGE_SHIFT;
	pages += nr_pinned;
	ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
				      pages);
	if (ret < 0) {
		/*
		 * The caller has to unpin the pages we already pinned, so
		 * returning -errno is not an option.
		 */
		if (nr_pinned)
			return nr_pinned;
		return ret;
	}
	return ret + nr_pinned;
}

/**
 * get_user_pages_fast_only() - pin user pages in memory
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Like get_user_pages_fast(), except that it is IRQ-safe: it will not fall
 * back to the regular ("slow") GUP.
 * Note one difference from get_user_pages_fast(): this function always
 * returns the number of pages pinned, and 0 if no pages were pinned.
 *
 * If the architecture does not support this function, it simply returns with
 * no pages pinned.
 *
 * Careful, careful! COW breaking can go either way, so a non-write
 * access can get ambiguous page results. If you call this function without
 * 'write' set, you'd better be sure that you're ok with that ambiguity.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
			     unsigned int gup_flags, struct page **pages)
{
	int nr_pinned;
	/*
	 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
	 * because gup fast is always a "pin with a +1 page refcount" request.
	 *
	 * FOLL_FAST_ONLY is required in order to match the API description of
	 * this routine: no fall back to regular ("slow") GUP.
	 */
	gup_flags |= FOLL_GET | FOLL_FAST_ONLY;

	nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
						 pages);

	/*
	 * As specified in the API description above, this routine is not
	 * allowed to return negative values. However, the common core
	 * routine internal_get_user_pages_fast() *can* return -errno.
	 * Therefore, correct for that here:
	 */
	if (nr_pinned < 0)
		nr_pinned = 0;

	return nr_pinned;
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
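
/*
 * A minimal usage sketch (hypothetical caller, not part of this file): the
 * opportunistic variant is typically tried from a context that cannot sleep,
 * with a sleepable retry path kept in reserve if it comes up short.
 * "uaddr" is a placeholder user address.
 *
 *	struct page *page;
 *
 *	if (get_user_pages_fast_only(uaddr, 1, FOLL_WRITE, &page) == 1) {
 *		... use the page, then release the reference ...
 *		put_page(page);
 *	} else {
 *		... defer to process context and use get_user_pages_fast() ...
 *	}
 */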

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_lock.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number requested.
 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
 * -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	if (!is_valid_gup_flags(gup_flags))
		return -EINVAL;

	/*
	 * The caller may or may not have explicitly set FOLL_GET; either way is
	 * OK. However, internally (within mm/gup.c), gup fast variants must set
	 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
	 * request.
	 */
	gup_flags |= FOLL_GET;
	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
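
/*
 * A minimal usage sketch (hypothetical caller): pin a user buffer, touch the
 * pages through a kernel mapping, then drop the FOLL_GET references with
 * put_page(). "uaddr" and "NR" are placeholders.
 *
 *	struct page *pages[NR];
 *	int i, got;
 *
 *	got = get_user_pages_fast(uaddr, NR, FOLL_WRITE, pages);
 *	if (got < 0)
 *		return got;
 *	for (i = 0; i < got; i++) {
 *		void *kaddr = kmap(pages[i]);
 *		... read or fill the page via kaddr ...
 *		kunmap(pages[i]);
 *		put_page(pages[i]);
 *	}
 */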

/**
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 *
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
 * get_user_pages_fast() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for further details.
 */
int pin_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return -EINVAL;

	gup_flags |= FOLL_PIN;
	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
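
/*
 * A minimal usage sketch (hypothetical caller): FOLL_PIN pages are released
 * with the unpin_user_page*() family, never put_page(). "uaddr" and "NR" are
 * placeholders.
 *
 *	struct page *pages[NR];
 *	int pinned;
 *
 *	pinned = pin_user_pages_fast(uaddr, NR, FOLL_WRITE, pages);
 *	if (pinned < 0)
 *		return pinned;
 *	... hand the pages to DMA and wait for completion ...
 *	unpin_user_pages_dirty_lock(pages, pinned, true);
 */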

/*
 * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
 * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
 *
 * The API rules are the same, too: no negative values may be returned.
 */
int pin_user_pages_fast_only(unsigned long start, int nr_pages,
			     unsigned int gup_flags, struct page **pages)
{
	int nr_pinned;

	/*
	 * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
	 * rules require returning 0, rather than -errno:
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return 0;
	/*
	 * FOLL_FAST_ONLY is required in order to match the API description of
	 * this routine: no fall back to regular ("slow") GUP.
	 */
	gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
	nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
						 pages);
	/*
	 * This routine is not allowed to return negative values. However,
	 * internal_get_user_pages_fast() *can* return -errno. Therefore,
	 * correct for that here:
	 */
	if (nr_pinned < 0)
		nr_pinned = 0;

	return nr_pinned;
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
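
/*
 * Sketch of the intended calling pattern (hypothetical caller): because this
 * variant never sleeps and never returns -errno, a shortfall simply means
 * "retry from a sleepable context". "uaddr" and "npages" are placeholders.
 *
 *	pinned = pin_user_pages_fast_only(uaddr, npages, FOLL_WRITE, pages);
 *	if (pinned != npages) {
 *		unpin_user_pages(pages, pinned);
 *		... fall back to pin_user_pages_fast() from process context ...
 *	}
 */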

/**
 * pin_user_pages_remote() - pin pages of a remote process
 *
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 * get_user_pages_remote() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 */
long pin_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   struct vm_area_struct **vmas, int *locked)
{
	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return -EINVAL;

	gup_flags |= FOLL_PIN;
	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
				       pages, vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);
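
/*
 * A minimal usage sketch (hypothetical caller): the target mm's mmap_lock
 * must already be held in read mode on entry, and @locked reports whether it
 * is still held on return. "uaddr" and "npages" are placeholders.
 *
 *	int locked = 1;
 *	long pinned;
 *
 *	mmap_read_lock(mm);
 *	pinned = pin_user_pages_remote(mm, uaddr, npages, FOLL_WRITE,
 *				       pages, NULL, &locked);
 *	if (locked)
 *		mmap_read_unlock(mm);
 *	... use the pages, then ...
 *	if (pinned > 0)
 *		unpin_user_pages(pages, pinned);
 */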

/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 *
 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 * FOLL_PIN is set.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages,
		    struct vm_area_struct **vmas)
{
	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return -EINVAL;

	gup_flags |= FOLL_PIN;
	return __gup_longterm_locked(current->mm, start, nr_pages,
				     pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
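
/*
 * A minimal usage sketch (hypothetical caller, assuming the caller is
 * responsible for mmap_lock as with get_user_pages()): a long-lived device
 * DMA registration adds FOLL_LONGTERM to mark the pin as indefinite.
 * "uaddr" and "npages" are placeholders.
 *
 *	mmap_read_lock(current->mm);
 *	pinned = pin_user_pages(uaddr, npages, FOLL_WRITE | FOLL_LONGTERM,
 *				pages, NULL);
 *	mmap_read_unlock(current->mm);
 *	... program the device, and much later ...
 *	if (pinned > 0)
 *		unpin_user_pages(pages, pinned);
 */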

/*
 * pin_user_pages_unlocked() is the FOLL_PIN variant of
 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
 * FOLL_PIN and rejects FOLL_GET.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return -EINVAL;

	gup_flags |= FOLL_PIN;
	return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
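
/*
 * Sketch (hypothetical caller): unlike the variants above, no mmap_lock needs
 * to be held on entry; the helper takes and drops it internally.
 * "uaddr" and "npages" are placeholders.
 *
 *	pinned = pin_user_pages_unlocked(uaddr, npages, pages, FOLL_WRITE);
 *	... use the pages, then ...
 *	if (pinned > 0)
 *		unpin_user_pages(pages, pinned);
 */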

/*
 * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
 * Behavior is the same, except that this one sets FOLL_PIN and rejects
 * FOLL_GET.
 */
long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	/*
	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas. As there are no users of this flag in this call we simply
	 * disallow this option for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return -EINVAL;

	gup_flags |= FOLL_PIN;
	return __get_user_pages_locked(current->mm, start, nr_pages,
				       pages, NULL, locked,
				       gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(pin_user_pages_locked);
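
/*
 * Sketch (hypothetical caller): the "locked" variant lets the core retry a
 * fault after temporarily dropping mmap_lock, so the caller must check
 * whether the lock is still held afterwards. "uaddr" and "npages" are
 * placeholders.
 *
 *	int locked = 1;
 *
 *	mmap_read_lock(current->mm);
 *	pinned = pin_user_pages_locked(uaddr, npages, FOLL_WRITE, pages,
 *				       &locked);
 *	if (locked)
 *		mmap_read_unlock(current->mm);
 *	... use the pages, then ...
 *	if (pinned > 0)
 *		unpin_user_pages(pages, pinned);
 */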