// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

/**
 * struct wp_walk - Private struct for pagetable walk callbacks
 * @range: Range for mmu notifiers
 * @tlbflush_start: Address of first modified pte
 * @tlbflush_end: Address of last modified pte + 1
 * @total: Total number of modified ptes
 */
struct wp_walk {
	struct mmu_notifier_range range;
	unsigned long tlbflush_start;
	unsigned long tlbflush_end;
	unsigned long total;
};

/**
 * wp_pte - Write-protect a pte
 * @pte: Pointer to the pte
 * @addr: The virtual page address
 * @end: End of the virtual address range
 * @walk: pagetable walk callback argument
 *
 * The function write-protects a pte and records the range in
 * virtual address space of touched ptes for efficient range TLB flushes.
 */
static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
		  struct mm_walk *walk)
{
	struct wp_walk *wpwalk = walk->private;
	pte_t ptent = *pte;

	if (pte_write(ptent)) {
		pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);

		ptent = pte_wrprotect(old_pte);
		ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
		wpwalk->total++;
		wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
		wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
					   addr + PAGE_SIZE);
	}

	return 0;
}

/**
 * struct clean_walk - Private struct for the clean_record_pte function.
 * @base: struct wp_walk we derive from
 * @bitmap_pgoff: Address_space page offset of the first bit in @bitmap
 * @bitmap: Bitmap with one bit for each page offset in the address_space range
 * covered.
 * @start: Address_space page offset of first modified pte relative
 * to @bitmap_pgoff
 * @end: Address_space page offset of last modified pte + 1 relative
 * to @bitmap_pgoff
 */
struct clean_walk {
	struct wp_walk base;
	pgoff_t bitmap_pgoff;
	unsigned long *bitmap;
	pgoff_t start;
	pgoff_t end;
};

#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base)

/**
 * clean_record_pte - Clean a pte and record its address space offset in a
 * bitmap
 * @pte: Pointer to the pte
 * @addr: The virtual page address
 * @end: End of the virtual address range
 * @walk: pagetable walk callback argument
 *
 * The function cleans a pte and records the range in
 * virtual address space of touched ptes for efficient TLB flushes.
 * It also records dirty ptes in a bitmap representing page offsets
 * in the address_space, as well as the first and last of the bits
 * touched.
 */
static int clean_record_pte(pte_t *pte, unsigned long addr,
			    unsigned long end, struct mm_walk *walk)
{
	struct wp_walk *wpwalk = walk->private;
	struct clean_walk *cwalk = to_clean_walk(wpwalk);
	pte_t ptent = *pte;

	if (pte_dirty(ptent)) {
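		/*
		 * Translate the virtual address to an address_space page
		 * offset: the page index within the vma plus the vma's
		 * file offset, rebased onto the first bit of @bitmap.
		 */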
		pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
			walk->vma->vm_pgoff - cwalk->bitmap_pgoff;
		pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);

		ptent = pte_mkclean(old_pte);
		ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);

		wpwalk->total++;
		wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
		wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
					   addr + PAGE_SIZE);

		__set_bit(pgoff, cwalk->bitmap);
		cwalk->start = min(cwalk->start, pgoff);
		cwalk->end = max(cwalk->end, pgoff + 1);
	}

	return 0;
}

/*
 * wp_clean_pmd_entry - The pagewalk pmd callback.
 *
 * Dirty-tracking should take place on the PTE level, so
 * WARN() if encountering a dirty huge pmd.
 * Furthermore, never split huge pmds, since that currently
 * causes dirty info loss. The pagefault handler should do
 * that if needed.
 */
static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	pmd_t pmdval = pmd_read_atomic(pmd);

	if (!pmd_trans_unstable(&pmdval))
		return 0;

	if (pmd_none(pmdval)) {
		/* The pmd was zapped under us; have the walker retry it. */
		walk->action = ACTION_AGAIN;
		return 0;
	}

	/* Huge pmd, present or migrated; don't split it, just skip it. */
	walk->action = ACTION_CONTINUE;
	if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
		WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));

	return 0;
}

/*
 * wp_clean_pud_entry - The pagewalk pud callback.
 *
 * Dirty-tracking should take place on the PTE level, so
 * WARN() if encountering a dirty huge pud.
 * Furthermore, never split huge puds, since that currently
 * causes dirty info loss. The pagefault handler should do
 * that if needed.
 */
static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	pud_t pudval = READ_ONCE(*pud);

	if (!pud_trans_unstable(&pudval))
		return 0;

	if (pud_none(pudval)) {
		/* The pud was zapped under us; have the walker retry it. */
		walk->action = ACTION_AGAIN;
		return 0;
	}

	/* Huge pud; don't split it, just skip it. */
	walk->action = ACTION_CONTINUE;
	if (pud_trans_huge(pudval) || pud_devmap(pudval))
		WARN_ON(pud_write(pudval) || pud_dirty(pudval));

	return 0;
}

/*
 * wp_clean_pre_vma - The pagewalk pre_vma callback.
 *
 * The pre_vma callback performs the cache flush, stages the tlb flush
 * and calls the necessary mmu notifiers.
 */
static int wp_clean_pre_vma(unsigned long start, unsigned long end,
			    struct mm_walk *walk)
{
	struct wp_walk *wpwalk = walk->private;

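	/*
	 * Start with an inverted (empty) flush range; the pte callbacks
	 * then shrink-wrap it around the ptes actually modified using
	 * min() / max().
	 */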
	wpwalk->tlbflush_start = end;
	wpwalk->tlbflush_end = start;

	mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0,
				walk->vma, walk->mm, start, end);
	mmu_notifier_invalidate_range_start(&wpwalk->range);
	flush_cache_range(walk->vma, start, end);

	/*
	 * We're not using tlb_gather_mmu() since typically
	 * only a small subrange of PTEs are affected, whereas
	 * tlb_gather_mmu() records the full range.
	 */
	inc_tlb_flush_pending(walk->mm);

	return 0;
}

/*
 * wp_clean_post_vma - The pagewalk post_vma callback.
 *
 * The post_vma callback performs the tlb flush and calls necessary mmu
 * notifiers.
 */
static void wp_clean_post_vma(struct mm_walk *walk)
{
	struct wp_walk *wpwalk = walk->private;

	if (mm_tlb_flush_nested(walk->mm))
		flush_tlb_range(walk->vma, wpwalk->range.start,
				wpwalk->range.end);
	else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start)
		flush_tlb_range(walk->vma, wpwalk->tlbflush_start,
				wpwalk->tlbflush_end);

	mmu_notifier_invalidate_range_end(&wpwalk->range);
	dec_tlb_flush_pending(walk->mm);
}

/*
 * wp_clean_test_walk - The pagewalk test_walk callback.
 *
 * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas.
 */
static int wp_clean_test_walk(unsigned long start, unsigned long end,
			      struct mm_walk *walk)
{
	unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);

	/* Skip non-applicable VMAs */
	if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
	    (VM_SHARED | VM_MAYWRITE))
		return 1;

	return 0;
}

static const struct mm_walk_ops clean_walk_ops = {
	.pte_entry = clean_record_pte,
	.pmd_entry = wp_clean_pmd_entry,
	.pud_entry = wp_clean_pud_entry,
	.test_walk = wp_clean_test_walk,
	.pre_vma = wp_clean_pre_vma,
	.post_vma = wp_clean_post_vma
};

static const struct mm_walk_ops wp_walk_ops = {
	.pte_entry = wp_pte,
	.pmd_entry = wp_clean_pmd_entry,
	.pud_entry = wp_clean_pud_entry,
	.test_walk = wp_clean_test_walk,
	.pre_vma = wp_clean_pre_vma,
	.post_vma = wp_clean_post_vma
};

/**
 * wp_shared_mapping_range - Write-protect all ptes in an address space range
 * @mapping: The address_space we want to write protect
 * @first_index: The first page offset in the range
 * @nr: Number of incremental page offsets to cover
 *
 * Note: This function currently skips transhuge page-table entries, since
 * it's intended for dirty-tracking on the PTE level. It will warn on
 * encountering transhuge write-enabled entries, though, and can easily be
 * extended to handle them as well.
 *
 * Return: The number of ptes actually write-protected. Note that
 * already write-protected ptes are not counted.
 */
unsigned long wp_shared_mapping_range(struct address_space *mapping,
				      pgoff_t first_index, pgoff_t nr)
{
	struct wp_walk wpwalk = { .total = 0 };

	i_mmap_lock_read(mapping);
	WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops,
				  &wpwalk));
	i_mmap_unlock_read(mapping);

	return wpwalk.total;
}
EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
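
/*
 * Usage sketch (hypothetical, not part of the kernel API): a driver that
 * wants to start a new dirty-tracking cycle for pages
 * [first..first + npages) of a shared mapping could write-protect them
 * like this; "first" and "npages" are assumed to be known to the driver:
 *
 *	unsigned long wp;
 *
 *	wp = wp_shared_mapping_range(mapping, first, npages);
 *	// Writers now fault and must go through page_mkwrite() /
 *	// pfn_mkwrite() before the pages can be dirtied again.
 */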

/**
 * clean_record_shared_mapping_range - Clean and record all ptes in an
 * address space range
 * @mapping: The address_space we want to clean
 * @first_index: The first page offset in the range
 * @nr: Number of incremental page offsets to cover
 * @bitmap_pgoff: The page offset of the first bit in @bitmap
 * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
 * cover the whole range @first_index..@first_index + @nr.
 * @start: Pointer to the number of the first set bit in @bitmap, or to @nr
 * if none are set. The value is modified as new bits are set by the function.
 * @end: Pointer to the number of the last set bit in @bitmap + 1, or to zero
 * if none are set. The value is modified as new bits are set by the function.
 *
 * Note: When this function returns there is no guarantee that a CPU has
 * not already dirtied new ptes. However it will not clean any ptes not
 * reported in the bitmap. The guarantees are as follows:
 * a) All ptes dirty when the function starts executing will end up recorded
 * in the bitmap.
 * b) All ptes dirtied after that will either remain dirty, be recorded in the
 * bitmap or both.
 *
 * If a caller needs to make sure all dirty ptes are picked up and no
 * additional ones are added, it first needs to write-protect the
 * address-space range and make sure new writers are blocked in
 * page_mkwrite() or pfn_mkwrite(). Then, after a TLB flush following the
 * write-protection, pick up all dirty bits.
 *
 * Note: This function currently skips transhuge page-table entries, since
 * it's intended for dirty-tracking on the PTE level. It will warn on
 * encountering transhuge dirty entries, though, and can easily be extended
 * to handle them as well.
 *
 * Return: The number of dirty ptes actually cleaned.
 */
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
						pgoff_t first_index, pgoff_t nr,
						pgoff_t bitmap_pgoff,
						unsigned long *bitmap,
						pgoff_t *start,
						pgoff_t *end)
{
	bool none_set = (*start >= *end);
	struct clean_walk cwalk = {
		.base = { .total = 0 },
		.bitmap_pgoff = bitmap_pgoff,
		.bitmap = bitmap,
		.start = none_set ? nr : *start,
		.end = none_set ? 0 : *end,
	};

	i_mmap_lock_read(mapping);
	WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops,
				  &cwalk.base));
	i_mmap_unlock_read(mapping);

	*start = cwalk.start;
	*end = cwalk.end;

	return cwalk.base.total;
}
EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range);
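
/*
 * Usage sketch (hypothetical, not part of the kernel API): to take a
 * stable snapshot of the dirty state of pages [first..first + npages),
 * a caller can combine the two exported helpers as the kernel-doc above
 * describes. block_mkwrite() / unblock_mkwrite() stand in for a
 * driver-specific way of stalling new writers in page_mkwrite():
 *
 *	pgoff_t start = 0, end = 0;	// start >= end: no bits set yet
 *	unsigned long *bitmap = bitmap_zalloc(npages, GFP_KERNEL);
 *
 *	block_mkwrite();		// hypothetical driver-side gate
 *	wp_shared_mapping_range(mapping, first, npages);
 *	// The pagewalk's post_vma callback has already flushed the TLB.
 *	clean_record_shared_mapping_range(mapping, first, npages, first,
 *					  bitmap, &start, &end);
 *	unblock_mkwrite();
 *	// Bits [start..end) of bitmap now cover every page that was
 *	// dirty when wp_shared_mapping_range() was called.
 */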