Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include <linux/pagewalk.h> |
| 3 | #include <linux/hugetlb.h> |
| 4 | #include <linux/bitops.h> |
| 5 | #include <linux/mmu_notifier.h> |
Arnd Bergmann | 36090de | 2022-01-14 14:06:10 -0800 | [diff] [blame^] | 6 | #include <linux/mm_inline.h> |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 7 | #include <asm/cacheflush.h> |
| 8 | #include <asm/tlbflush.h> |
| 9 | |
/**
 * struct wp_walk - Private state handed to the pagetable walk callbacks
 * @range: mmu notifier range; initialised in the pre_vma callback and
 *         closed again in the post_vma callback
 * @tlbflush_start: Lowest virtual address of a pte modified in the
 *                  current vma
 * @tlbflush_end: Virtual address of the highest modified pte in the
 *                current vma, plus PAGE_SIZE
 * @total: Running count of ptes modified by the walk
 *
 * The pre_vma callback seeds @tlbflush_start / @tlbflush_end with an empty
 * (start > end) interval; the pte callbacks then widen it with min()/max().
 */
struct wp_walk {
	struct mmu_notifier_range range;
	unsigned long tlbflush_start;
	unsigned long tlbflush_end;
	unsigned long total;
};
| 23 | |
| 24 | /** |
| 25 | * wp_pte - Write-protect a pte |
| 26 | * @pte: Pointer to the pte |
Alex Shi | f5b7e73 | 2020-12-14 19:07:48 -0800 | [diff] [blame] | 27 | * @addr: The start of protecting virtual address |
| 28 | * @end: The end of protecting virtual address |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 29 | * @walk: pagetable walk callback argument |
| 30 | * |
| 31 | * The function write-protects a pte and records the range in |
| 32 | * virtual address space of touched ptes for efficient range TLB flushes. |
| 33 | */ |
| 34 | static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end, |
| 35 | struct mm_walk *walk) |
| 36 | { |
| 37 | struct wp_walk *wpwalk = walk->private; |
| 38 | pte_t ptent = *pte; |
| 39 | |
| 40 | if (pte_write(ptent)) { |
| 41 | pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); |
| 42 | |
| 43 | ptent = pte_wrprotect(old_pte); |
| 44 | ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent); |
| 45 | wpwalk->total++; |
| 46 | wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr); |
| 47 | wpwalk->tlbflush_end = max(wpwalk->tlbflush_end, |
| 48 | addr + PAGE_SIZE); |
| 49 | } |
| 50 | |
| 51 | return 0; |
| 52 | } |
| 53 | |
/**
 * struct clean_walk - Private struct for the clean_record_pte function.
 * @base: struct wp_walk we derive from; this is what walk->private points
 *        at, and to_clean_walk() recovers the enclosing struct from it
 * @bitmap_pgoff: Address_space page offset represented by bit 0 of @bitmap
 * @bitmap: Bitmap with one bit for each page offset in the address_space
 *          range covered.
 * @start: Lowest bit number set so far, i.e. the address_space page offset
 *         of the first cleaned pte relative to @bitmap_pgoff
 * @end: Highest bit number set so far plus one, i.e. one past the
 *       address_space page offset of the last cleaned pte relative to
 *       @bitmap_pgoff
 */
struct clean_walk {
	struct wp_walk base;
	pgoff_t bitmap_pgoff;
	unsigned long *bitmap;
	pgoff_t start;
	pgoff_t end;
};
| 72 | |
/* Map a pointer to the embedded struct wp_walk back to its struct clean_walk. */
#define to_clean_walk(_wp) container_of(_wp, struct clean_walk, base)
| 74 | |
| 75 | /** |
| 76 | * clean_record_pte - Clean a pte and record its address space offset in a |
| 77 | * bitmap |
| 78 | * @pte: Pointer to the pte |
Alex Shi | f5b7e73 | 2020-12-14 19:07:48 -0800 | [diff] [blame] | 79 | * @addr: The start of virtual address to be clean |
| 80 | * @end: The end of virtual address to be clean |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 81 | * @walk: pagetable walk callback argument |
| 82 | * |
| 83 | * The function cleans a pte and records the range in |
| 84 | * virtual address space of touched ptes for efficient TLB flushes. |
| 85 | * It also records dirty ptes in a bitmap representing page offsets |
| 86 | * in the address_space, as well as the first and last of the bits |
| 87 | * touched. |
| 88 | */ |
| 89 | static int clean_record_pte(pte_t *pte, unsigned long addr, |
| 90 | unsigned long end, struct mm_walk *walk) |
| 91 | { |
| 92 | struct wp_walk *wpwalk = walk->private; |
| 93 | struct clean_walk *cwalk = to_clean_walk(wpwalk); |
| 94 | pte_t ptent = *pte; |
| 95 | |
| 96 | if (pte_dirty(ptent)) { |
| 97 | pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) + |
| 98 | walk->vma->vm_pgoff - cwalk->bitmap_pgoff; |
| 99 | pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); |
| 100 | |
| 101 | ptent = pte_mkclean(old_pte); |
| 102 | ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent); |
| 103 | |
| 104 | wpwalk->total++; |
| 105 | wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr); |
| 106 | wpwalk->tlbflush_end = max(wpwalk->tlbflush_end, |
| 107 | addr + PAGE_SIZE); |
| 108 | |
| 109 | __set_bit(pgoff, cwalk->bitmap); |
| 110 | cwalk->start = min(cwalk->start, pgoff); |
| 111 | cwalk->end = max(cwalk->end, pgoff + 1); |
| 112 | } |
| 113 | |
| 114 | return 0; |
| 115 | } |
| 116 | |
Thomas Hellstrom | b2a403f | 2020-04-01 21:07:42 -0700 | [diff] [blame] | 117 | /* |
| 118 | * wp_clean_pmd_entry - The pagewalk pmd callback. |
| 119 | * |
| 120 | * Dirty-tracking should take place on the PTE level, so |
| 121 | * WARN() if encountering a dirty huge pmd. |
| 122 | * Furthermore, never split huge pmds, since that currently |
| 123 | * causes dirty info loss. The pagefault handler should do |
| 124 | * that if needed. |
| 125 | */ |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 126 | static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, |
| 127 | struct mm_walk *walk) |
| 128 | { |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 129 | pmd_t pmdval = pmd_read_atomic(pmd); |
| 130 | |
Thomas Hellstrom | b2a403f | 2020-04-01 21:07:42 -0700 | [diff] [blame] | 131 | if (!pmd_trans_unstable(&pmdval)) |
| 132 | return 0; |
| 133 | |
| 134 | if (pmd_none(pmdval)) { |
| 135 | walk->action = ACTION_AGAIN; |
| 136 | return 0; |
| 137 | } |
| 138 | |
| 139 | /* Huge pmd, present or migrated */ |
| 140 | walk->action = ACTION_CONTINUE; |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 141 | if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) |
| 142 | WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval)); |
| 143 | |
| 144 | return 0; |
| 145 | } |
| 146 | |
Thomas Hellstrom | b2a403f | 2020-04-01 21:07:42 -0700 | [diff] [blame] | 147 | /* |
| 148 | * wp_clean_pud_entry - The pagewalk pud callback. |
| 149 | * |
| 150 | * Dirty-tracking should take place on the PTE level, so |
| 151 | * WARN() if encountering a dirty huge puds. |
| 152 | * Furthermore, never split huge puds, since that currently |
| 153 | * causes dirty info loss. The pagefault handler should do |
| 154 | * that if needed. |
| 155 | */ |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 156 | static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, |
| 157 | struct mm_walk *walk) |
| 158 | { |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 159 | pud_t pudval = READ_ONCE(*pud); |
| 160 | |
Thomas Hellstrom | b2a403f | 2020-04-01 21:07:42 -0700 | [diff] [blame] | 161 | if (!pud_trans_unstable(&pudval)) |
| 162 | return 0; |
| 163 | |
| 164 | if (pud_none(pudval)) { |
| 165 | walk->action = ACTION_AGAIN; |
| 166 | return 0; |
| 167 | } |
| 168 | |
Zack Rusin | 94036f4 | 2021-04-16 15:46:18 -0700 | [diff] [blame] | 169 | #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD |
Thomas Hellstrom | b2a403f | 2020-04-01 21:07:42 -0700 | [diff] [blame] | 170 | /* Huge pud */ |
| 171 | walk->action = ACTION_CONTINUE; |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 172 | if (pud_trans_huge(pudval) || pud_devmap(pudval)) |
| 173 | WARN_ON(pud_write(pudval) || pud_dirty(pudval)); |
Zack Rusin | 94036f4 | 2021-04-16 15:46:18 -0700 | [diff] [blame] | 174 | #endif |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 175 | |
| 176 | return 0; |
| 177 | } |
| 178 | |
| 179 | /* |
| 180 | * wp_clean_pre_vma - The pagewalk pre_vma callback. |
| 181 | * |
| 182 | * The pre_vma callback performs the cache flush, stages the tlb flush |
| 183 | * and calls the necessary mmu notifiers. |
| 184 | */ |
| 185 | static int wp_clean_pre_vma(unsigned long start, unsigned long end, |
| 186 | struct mm_walk *walk) |
| 187 | { |
| 188 | struct wp_walk *wpwalk = walk->private; |
| 189 | |
| 190 | wpwalk->tlbflush_start = end; |
| 191 | wpwalk->tlbflush_end = start; |
| 192 | |
| 193 | mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0, |
| 194 | walk->vma, walk->mm, start, end); |
| 195 | mmu_notifier_invalidate_range_start(&wpwalk->range); |
| 196 | flush_cache_range(walk->vma, start, end); |
| 197 | |
| 198 | /* |
| 199 | * We're not using tlb_gather_mmu() since typically |
| 200 | * only a small subrange of PTEs are affected, whereas |
| 201 | * tlb_gather_mmu() records the full range. |
| 202 | */ |
| 203 | inc_tlb_flush_pending(walk->mm); |
| 204 | |
| 205 | return 0; |
| 206 | } |
| 207 | |
| 208 | /* |
| 209 | * wp_clean_post_vma - The pagewalk post_vma callback. |
| 210 | * |
| 211 | * The post_vma callback performs the tlb flush and calls necessary mmu |
| 212 | * notifiers. |
| 213 | */ |
| 214 | static void wp_clean_post_vma(struct mm_walk *walk) |
| 215 | { |
| 216 | struct wp_walk *wpwalk = walk->private; |
| 217 | |
| 218 | if (mm_tlb_flush_nested(walk->mm)) |
| 219 | flush_tlb_range(walk->vma, wpwalk->range.start, |
| 220 | wpwalk->range.end); |
| 221 | else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start) |
| 222 | flush_tlb_range(walk->vma, wpwalk->tlbflush_start, |
| 223 | wpwalk->tlbflush_end); |
| 224 | |
| 225 | mmu_notifier_invalidate_range_end(&wpwalk->range); |
| 226 | dec_tlb_flush_pending(walk->mm); |
| 227 | } |
| 228 | |
| 229 | /* |
| 230 | * wp_clean_test_walk - The pagewalk test_walk callback. |
| 231 | * |
| 232 | * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas. |
| 233 | */ |
| 234 | static int wp_clean_test_walk(unsigned long start, unsigned long end, |
| 235 | struct mm_walk *walk) |
| 236 | { |
| 237 | unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags); |
| 238 | |
| 239 | /* Skip non-applicable VMAs */ |
| 240 | if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) != |
| 241 | (VM_SHARED | VM_MAYWRITE)) |
| 242 | return 1; |
| 243 | |
| 244 | return 0; |
| 245 | } |
| 246 | |
| 247 | static const struct mm_walk_ops clean_walk_ops = { |
| 248 | .pte_entry = clean_record_pte, |
| 249 | .pmd_entry = wp_clean_pmd_entry, |
| 250 | .pud_entry = wp_clean_pud_entry, |
| 251 | .test_walk = wp_clean_test_walk, |
| 252 | .pre_vma = wp_clean_pre_vma, |
| 253 | .post_vma = wp_clean_post_vma |
| 254 | }; |
| 255 | |
| 256 | static const struct mm_walk_ops wp_walk_ops = { |
| 257 | .pte_entry = wp_pte, |
| 258 | .pmd_entry = wp_clean_pmd_entry, |
| 259 | .pud_entry = wp_clean_pud_entry, |
| 260 | .test_walk = wp_clean_test_walk, |
| 261 | .pre_vma = wp_clean_pre_vma, |
| 262 | .post_vma = wp_clean_post_vma |
| 263 | }; |
| 264 | |
| 265 | /** |
| 266 | * wp_shared_mapping_range - Write-protect all ptes in an address space range |
| 267 | * @mapping: The address_space we want to write protect |
| 268 | * @first_index: The first page offset in the range |
| 269 | * @nr: Number of incremental page offsets to cover |
| 270 | * |
| 271 | * Note: This function currently skips transhuge page-table entries, since |
| 272 | * it's intended for dirty-tracking on the PTE level. It will warn on |
| 273 | * encountering transhuge write-enabled entries, though, and can easily be |
| 274 | * extended to handle them as well. |
| 275 | * |
| 276 | * Return: The number of ptes actually write-protected. Note that |
| 277 | * already write-protected ptes are not counted. |
| 278 | */ |
| 279 | unsigned long wp_shared_mapping_range(struct address_space *mapping, |
| 280 | pgoff_t first_index, pgoff_t nr) |
| 281 | { |
| 282 | struct wp_walk wpwalk = { .total = 0 }; |
| 283 | |
| 284 | i_mmap_lock_read(mapping); |
| 285 | WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops, |
| 286 | &wpwalk)); |
| 287 | i_mmap_unlock_read(mapping); |
| 288 | |
| 289 | return wpwalk.total; |
| 290 | } |
| 291 | EXPORT_SYMBOL_GPL(wp_shared_mapping_range); |
| 292 | |
| 293 | /** |
| 294 | * clean_record_shared_mapping_range - Clean and record all ptes in an |
| 295 | * address space range |
| 296 | * @mapping: The address_space we want to clean |
| 297 | * @first_index: The first page offset in the range |
| 298 | * @nr: Number of incremental page offsets to cover |
| 299 | * @bitmap_pgoff: The page offset of the first bit in @bitmap |
| 300 | * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to |
| 301 | * cover the whole range @first_index..@first_index + @nr. |
| 302 | * @start: Pointer to number of the first set bit in @bitmap. |
| 303 | * is modified as new bits are set by the function. |
| 304 | * @end: Pointer to the number of the last set bit in @bitmap. |
| 305 | * none set. The value is modified as new bits are set by the function. |
| 306 | * |
| 307 | * Note: When this function returns there is no guarantee that a CPU has |
| 308 | * not already dirtied new ptes. However it will not clean any ptes not |
| 309 | * reported in the bitmap. The guarantees are as follows: |
| 310 | * a) All ptes dirty when the function starts executing will end up recorded |
| 311 | * in the bitmap. |
| 312 | * b) All ptes dirtied after that will either remain dirty, be recorded in the |
| 313 | * bitmap or both. |
| 314 | * |
| 315 | * If a caller needs to make sure all dirty ptes are picked up and none |
| 316 | * additional are added, it first needs to write-protect the address-space |
| 317 | * range and make sure new writers are blocked in page_mkwrite() or |
| 318 | * pfn_mkwrite(). And then after a TLB flush following the write-protection |
| 319 | * pick up all dirty bits. |
| 320 | * |
Mel Gorman | b417941 | 2021-06-30 18:53:29 -0700 | [diff] [blame] | 321 | * This function currently skips transhuge page-table entries, since |
Thomas Hellstrom | c5acad8 | 2019-03-19 13:12:30 +0100 | [diff] [blame] | 322 | * it's intended for dirty-tracking on the PTE level. It will warn on |
| 323 | * encountering transhuge dirty entries, though, and can easily be extended |
| 324 | * to handle them as well. |
| 325 | * |
| 326 | * Return: The number of dirty ptes actually cleaned. |
| 327 | */ |
| 328 | unsigned long clean_record_shared_mapping_range(struct address_space *mapping, |
| 329 | pgoff_t first_index, pgoff_t nr, |
| 330 | pgoff_t bitmap_pgoff, |
| 331 | unsigned long *bitmap, |
| 332 | pgoff_t *start, |
| 333 | pgoff_t *end) |
| 334 | { |
| 335 | bool none_set = (*start >= *end); |
| 336 | struct clean_walk cwalk = { |
| 337 | .base = { .total = 0 }, |
| 338 | .bitmap_pgoff = bitmap_pgoff, |
| 339 | .bitmap = bitmap, |
| 340 | .start = none_set ? nr : *start, |
| 341 | .end = none_set ? 0 : *end, |
| 342 | }; |
| 343 | |
| 344 | i_mmap_lock_read(mapping); |
| 345 | WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops, |
| 346 | &cwalk.base)); |
| 347 | i_mmap_unlock_read(mapping); |
| 348 | |
| 349 | *start = cwalk.start; |
| 350 | *end = cwalk.end; |
| 351 | |
| 352 | return cwalk.base.total; |
| 353 | } |
| 354 | EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range); |