// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

struct hmm_vma_walk {
	struct hmm_range	*range;
	struct dev_pagemap	*pgmap;
	unsigned long		last;
	unsigned int		flags;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	if (!vma)
		goto err;

	if (write_fault)
		flags |= FAULT_FLAG_WRITE;

	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_ERROR)
		goto err;

	return -EBUSY;

err:
	*pfn = range->values[HMM_PFN_ERROR];
	return -EFAULT;
}

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
		struct hmm_range *range, enum hmm_pfn_value_e value)
{
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[value];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not?
 * @write_fault: write fault?
 * @walk: mm_walk structure
 * Return: 0 on success, -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;

	if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
		return -EPERM;

	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EBUSY)
				return ret;
		}
	}

	return (fault || write_fault) ? -EBUSY : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
		return;

	/*
	 * We consider not only the individual per-page request but also the
	 * default flags requested for the whole range. The API can be used in
	 * two ways: either the HMM user coalesces multiple page faults into
	 * one request and sets per-pfn flags for those faults, or the HMM
	 * user wants to pre-fault a range with specific flags. For the latter
	 * it would be a waste to have the user pre-fill the pfn array with a
	 * default flags value. Both usages are sketched in the comment after
	 * this function.
	 */
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If the CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}
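
/*
 * Caller-side sketch of the two usages described in hmm_pte_need_fault()
 * above. This is illustrative only and is not used by this file; it assumes
 * a driver that has already allocated range->pfns[] and picked an index i:
 *
 *	// 1) Pre-fault the whole range as readable, ignoring per-pfn flags:
 *	range->default_flags = range->flags[HMM_PFN_VALID];
 *	range->pfn_flags_mask = 0;
 *
 *	// 2) Same, but additionally request write access for one page i:
 *	range->default_flags = range->flags[HMM_PFN_VALID];
 *	range->pfn_flags_mask = range->flags[HMM_PFN_WRITE];
 *	range->pfns[i] = range->flags[HMM_PFN_WRITE];
 */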

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
		*fault = *write_fault = false;
		return;
	}

	*fault = *write_fault = false;
	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     __always_unused int depth, struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
		if (pmd_devmap(pmd)) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
		}
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	}
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	fault = write_fault = false;

	if (pte_none(pte)) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		/*
		 * This is a special swap entry: handle device private
		 * entries here, wait on migration entries below, and
		 * report anything else as an error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_device_entry_from_pfn(range,
					    swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, &fault,
				   &write_fault);
		if (!fault && !write_fault)
			return 0;

		if (!non_swap_entry(entry))
			goto fault;

		if (is_migration_entry(entry)) {
			pte_unmap(ptep);
			hmm_vma_walk->last = addr;
			migration_entry_wait(walk->mm, pmdp, addr);
			return -EBUSY;
		}

		/* Report error for everything else */
		pte_unmap(ptep);
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, &fault,
			   &write_fault);
	if (fault || write_fault)
		goto fault;

	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
					      hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap)) {
			pte_unmap(ptep);
			return -EBUSY;
		}
	}

	/*
	 * Since each architecture defines a struct page for the zero page, just
	 * fall through and treat it like a normal page.
	 */
	if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, &fault,
				   &write_fault);
		if (fault || write_fault) {
			pte_unmap(ptep);
			return -EFAULT;
		}
		*pfn = range->values[HMM_PFN_SPECIAL];
		return 0;
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long addr = start;
	bool fault, write_fault;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
	}

	if (!pmd_present(pmd)) {
		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
				     &write_fault);
		if (fault || write_fault)
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take the pmd_lock here: even if some other
		 * thread is splitting the huge pmd, we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value again and check whether it is
		 * still a transparent huge or device mapping, and compute
		 * the corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point either it is a
	 * valid pmd entry pointing to a pte directory or it is a bad pmd that
	 * will not recover.
	 */
	if (pmd_bad(pmd)) {
		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
				     &write_fault);
		if (fault || write_fault)
			return -EFAULT;
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
	}

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
		if (r) {
			/* hmm_vma_handle_pte() did pte_unmap() */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	if (hmm_vma_walk->pgmap) {
		/*
		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
		 * so that we can leverage the get_dev_pagemap() optimization
		 * which will not re-take a reference on a pgmap if we already
		 * have one.
		 */
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
		struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	int ret = 0;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		uint64_t *pfns, cpu_flags;
		bool fault, write_fault;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     cpu_flags, &fault, &write_fault);
		if (fault || write_fault) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole_(addr, end, fault, write_fault,
						  walk);
		}

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap)) {
				ret = -EBUSY;
				goto out_unlock;
			}
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		}
		if (hmm_vma_walk->pgmap) {
			put_dev_pagemap(hmm_vma_walk->pgmap);
			hmm_vma_walk->pgmap = NULL;
		}
		hmm_vma_walk->last = end;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return ret;
}
#else
#define hmm_vma_walk_pud	NULL
#endif

#ifdef CONFIG_HUGETLB_PAGE
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	unsigned long addr = start, i, pfn;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t orig_pfn, cpu_flags;
	bool fault, write_fault;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> PAGE_SHIFT;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	fault = write_fault = false;
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
	}

	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;
	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static int hmm_vma_walk_test(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;

	/*
	 * Skip vma ranges that don't have struct page backing them or map I/O
	 * devices directly.
	 *
	 * If the vma does not allow read access, then assume that it does not
	 * allow write access either. HMM does not support architectures that
	 * allow write without read.
	 */
	if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) ||
	    !(vma->vm_flags & VM_READ)) {
		bool fault, write_fault;

		/*
		 * Check to see if a fault is requested for any page in the
		 * range.
		 */
		hmm_range_need_fault(hmm_vma_walk, range->pfns +
					((start - range->start) >> PAGE_SHIFT),
					(end - start) >> PAGE_SHIFT,
					0, &fault, &write_fault);
		if (fault || write_fault)
			return -EFAULT;

		hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
		hmm_vma_walk->last = end;

		/* Skip this vma and continue processing the next vma. */
		return 1;
	}

	return 0;
}

static const struct mm_walk_ops hmm_walk_ops = {
	.pud_entry	= hmm_vma_walk_pud,
	.pmd_entry	= hmm_vma_walk_pmd,
	.pte_hole	= hmm_vma_walk_hole,
	.hugetlb_entry	= hmm_vma_walk_hugetlb_entry,
	.test_walk	= hmm_vma_walk_test,
};

/**
 * hmm_range_fault - try to fault some address in a virtual address range
 * @range: range being faulted
 * @flags: HMM_FAULT_* flags
 *
 * Return: the number of valid pages in range->pfns[] (from range start
 * address), which may be zero. On error one of the following status codes
 * can be returned:
 *
 * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
 *		(e.g., device file vma).
 * -ENOMEM:	Out of memory.
 * -EPERM:	Invalid permission (e.g., asking for write and range is read
 *		only).
 * -EBUSY:	The range has been invalidated and the caller needs to wait for
 *		the invalidation to finish.
 * -EFAULT:	The range is invalid (i.e., either there is no valid vma or it
 *		is illegal to access that range).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and the caller does not ask for migration.
 *
 * On error, the function marks the HMM pfn entry corresponding to the faulting
 * virtual address with an error flag.
 *
 * See the usage sketch at the end of this file for the expected calling
 * sequence.
 */
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
		.flags = flags,
	};
	struct mm_struct *mm = range->notifier->mm;
	int ret;

	lockdep_assert_held(&mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (mmu_interval_check_retry(range->notifier,
					     range->notifier_seq))
			return -EBUSY;
		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
				      &hmm_walk_ops, &hmm_vma_walk);
	} while (ret == -EBUSY);

	if (ret)
		return ret;
	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
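
/*
 * Rough sketch of the expected calling sequence for hmm_range_fault(),
 * following the pattern described in Documentation/vm/hmm.rst. The names mni,
 * driver_lock_page_tables() and driver_unlock_page_tables() stand for
 * hypothetical driver state (a registered struct mmu_interval_notifier and a
 * lock serializing device page table updates); only the HMM and
 * mmu_interval_notifier calls are real:
 *
 *	again:
 *		range.notifier_seq = mmu_interval_read_begin(&mni);
 *		down_read(&mm->mmap_sem);
 *		ret = hmm_range_fault(&range, 0);
 *		up_read(&mm->mmap_sem);
 *		if (ret < 0) {
 *			if (ret == -EBUSY)
 *				goto again;
 *			return ret;
 *		}
 *
 *		driver_lock_page_tables();
 *		if (mmu_interval_read_retry(range.notifier,
 *					    range.notifier_seq)) {
 *			driver_unlock_page_tables();
 *			goto again;
 *		}
 *		// Use range.pfns[] to program the device page tables.
 *		driver_unlock_page_tables();
 */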