Darrick J. Wong | a211432 | 2019-07-02 09:39:38 -0700 | [diff] [blame] | 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 2 | /* |
| 3 | * Copyright (C) 2019 Oracle. All Rights Reserved. |
| 4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> |
| 5 | */ |
| 6 | #include "xfs.h" |
| 7 | #include "xfs_fs.h" |
| 8 | #include "xfs_shared.h" |
| 9 | #include "xfs_format.h" |
| 10 | #include "xfs_log_format.h" |
| 11 | #include "xfs_trans_resv.h" |
| 12 | #include "xfs_mount.h" |
| 13 | #include "xfs_inode.h" |
| 14 | #include "xfs_btree.h" |
| 15 | #include "xfs_ialloc.h" |
| 16 | #include "xfs_ialloc_btree.h" |
| 17 | #include "xfs_iwalk.h" |
| 18 | #include "xfs_itable.h" |
| 19 | #include "xfs_error.h" |
| 20 | #include "xfs_trace.h" |
| 21 | #include "xfs_icache.h" |
| 22 | #include "xfs_health.h" |
| 23 | #include "xfs_trans.h" |
| 24 | |
| 25 | /* |
| 26 | * Walking Inodes in the Filesystem |
| 27 | * ================================ |
| 28 | * |
| 29 | * This iterator function walks a subset of filesystem inodes in increasing |
| 30 | * order from @startino until there are no more inodes. For each allocated |
| 31 | * inode it finds, it calls a walk function with the relevant inode number and |
| 32 | * a pointer to caller-provided data. The walk function can return the usual |
| 33 | * negative error code to stop the iteration; 0 to continue the iteration; or |
| 34 | * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the |
| 35 | * caller. |
| 36 | * |
| 37 | * Internally, we allow the walk function to do anything, which means that we |
| 38 | * cannot maintain the inobt cursor or our lock on the AGI buffer. We |
| 39 | * therefore cache the inobt records in kernel memory and only call the walk |
| 40 | * function when our memory buffer is full. @nr_recs is the number of records |
| 41 | * that we've cached, and @sz_recs is the size of our cache. |
| 42 | * |
| 43 | * It is the responsibility of the walk function to ensure it accesses |
| 44 | * allocated inodes, as the inobt records may be stale by the time they are |
| 45 | * acted upon. |
| 46 | */ |
| 47 | |
| 48 | struct xfs_iwalk_ag { |
| 49 | struct xfs_mount *mp; |
| 50 | struct xfs_trans *tp; |
| 51 | |
| 52 | /* Where do we start the traversal? */ |
| 53 | xfs_ino_t startino; |
| 54 | |
| 55 | /* Array of inobt records we cache. */ |
| 56 | struct xfs_inobt_rec_incore *recs; |
| 57 | |
| 58 | /* Number of entries allocated for the @recs array. */ |
| 59 | unsigned int sz_recs; |
| 60 | |
| 61 | /* Number of entries in the @recs array that are in use. */ |
| 62 | unsigned int nr_recs; |
| 63 | |
| 64 | /* Inode walk function and data pointer. */ |
| 65 | xfs_iwalk_fn iwalk_fn; |
| 66 | void *data; |
| 67 | }; |
| 68 | |
| 69 | /* Allocate memory for a walk. */ |
| 70 | STATIC int |
| 71 | xfs_iwalk_alloc( |
| 72 | struct xfs_iwalk_ag *iwag) |
| 73 | { |
| 74 | size_t size; |
| 75 | |
| 76 | ASSERT(iwag->recs == NULL); |
| 77 | iwag->nr_recs = 0; |
| 78 | |
| 79 | /* Allocate a prefetch buffer for inobt records. */ |
| 80 | size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); |
| 81 | iwag->recs = kmem_alloc(size, KM_MAYFAIL); |
| 82 | if (iwag->recs == NULL) |
| 83 | return -ENOMEM; |
| 84 | |
| 85 | return 0; |
| 86 | } |
| 87 | |
| 88 | /* Free memory we allocated for a walk. */ |
| 89 | STATIC void |
| 90 | xfs_iwalk_free( |
| 91 | struct xfs_iwalk_ag *iwag) |
| 92 | { |
| 93 | kmem_free(iwag->recs); |
| 94 | iwag->recs = NULL; |
| 95 | } |
| 96 | |
| 97 | /* For each inuse inode in each cached inobt record, call our function. */ |
| 98 | STATIC int |
| 99 | xfs_iwalk_ag_recs( |
| 100 | struct xfs_iwalk_ag *iwag) |
| 101 | { |
| 102 | struct xfs_mount *mp = iwag->mp; |
| 103 | struct xfs_trans *tp = iwag->tp; |
| 104 | xfs_ino_t ino; |
| 105 | unsigned int i, j; |
| 106 | xfs_agnumber_t agno; |
| 107 | int error; |
| 108 | |
| 109 | agno = XFS_INO_TO_AGNO(mp, iwag->startino); |
| 110 | for (i = 0; i < iwag->nr_recs; i++) { |
| 111 | struct xfs_inobt_rec_incore *irec = &iwag->recs[i]; |
| 112 | |
| 113 | trace_xfs_iwalk_ag_rec(mp, agno, irec); |
| 114 | |
| 115 | for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { |
| 116 | /* Skip if this inode is free */ |
| 117 | if (XFS_INOBT_MASK(j) & irec->ir_free) |
| 118 | continue; |
| 119 | |
| 120 | /* Otherwise call our function. */ |
| 121 | ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j); |
| 122 | error = iwag->iwalk_fn(mp, tp, ino, iwag->data); |
| 123 | if (error) |
| 124 | return error; |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | return 0; |
| 129 | } |
| 130 | |
| 131 | /* Delete cursor and let go of AGI. */ |
| 132 | static inline void |
| 133 | xfs_iwalk_del_inobt( |
| 134 | struct xfs_trans *tp, |
| 135 | struct xfs_btree_cur **curpp, |
| 136 | struct xfs_buf **agi_bpp, |
| 137 | int error) |
| 138 | { |
| 139 | if (*curpp) { |
| 140 | xfs_btree_del_cursor(*curpp, error); |
| 141 | *curpp = NULL; |
| 142 | } |
| 143 | if (*agi_bpp) { |
| 144 | xfs_trans_brelse(tp, *agi_bpp); |
| 145 | *agi_bpp = NULL; |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | /* |
| 150 | * Set ourselves up for walking inobt records starting from a given point in |
| 151 | * the filesystem. |
| 152 | * |
| 153 | * If caller passed in a nonzero start inode number, load the record from the |
| 154 | * inobt and make the record look like all the inodes before agino are free so |
| 155 | * that we skip them, and then move the cursor to the next inobt record. This |
| 156 | * is how we support starting an iwalk in the middle of an inode chunk. |
| 157 | * |
| 158 | * If the caller passed in a start number of zero, move the cursor to the first |
| 159 | * inobt record. |
| 160 | * |
| 161 | * The caller is responsible for cleaning up the cursor and buffer pointer |
| 162 | * regardless of the error status. |
| 163 | */ |
| 164 | STATIC int |
| 165 | xfs_iwalk_ag_start( |
| 166 | struct xfs_iwalk_ag *iwag, |
| 167 | xfs_agnumber_t agno, |
| 168 | xfs_agino_t agino, |
| 169 | struct xfs_btree_cur **curpp, |
| 170 | struct xfs_buf **agi_bpp, |
| 171 | int *has_more) |
| 172 | { |
| 173 | struct xfs_mount *mp = iwag->mp; |
| 174 | struct xfs_trans *tp = iwag->tp; |
| 175 | int icount; |
| 176 | int error; |
| 177 | |
| 178 | /* Set up a fresh cursor and empty the inobt cache. */ |
| 179 | iwag->nr_recs = 0; |
| 180 | error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); |
| 181 | if (error) |
| 182 | return error; |
| 183 | |
| 184 | /* Starting at the beginning of the AG? That's easy! */ |
| 185 | if (agino == 0) |
| 186 | return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more); |
| 187 | |
| 188 | /* |
| 189 | * Otherwise, we have to grab the inobt record where we left off, stuff |
| 190 | * the record into our cache, and then see if there are more records. |
| 191 | * We require a lookup cache of at least two elements so that we don't |
| 192 | * have to deal with tearing down the cursor to walk the records. |
| 193 | */ |
| 194 | error = xfs_bulkstat_grab_ichunk(*curpp, agino - 1, &icount, |
| 195 | &iwag->recs[iwag->nr_recs]); |
| 196 | if (error) |
| 197 | return error; |
| 198 | if (icount) |
| 199 | iwag->nr_recs++; |
| 200 | |
| 201 | /* |
| 202 | * The prefetch calculation is supposed to give us a large enough inobt |
| 203 | * record cache that grab_ichunk can stage a partial first record and |
| 204 | * the loop body can cache a record without having to check for cache |
| 205 | * space until after it reads an inobt record. |
| 206 | */ |
| 207 | ASSERT(iwag->nr_recs < iwag->sz_recs); |
| 208 | |
| 209 | return xfs_btree_increment(*curpp, 0, has_more); |
| 210 | } |
| 211 | |
| 212 | /* |
| 213 | * The inobt record cache is full, so preserve the inobt cursor state and |
| 214 | * run callbacks on the cached inobt records. When we're done, restore the |
| 215 | * cursor state to wherever the cursor would have been had the cache not been |
| 216 | * full (and therefore we could've just incremented the cursor) if *@has_more |
| 217 | * is true. On exit, *@has_more will indicate whether or not the caller should |
| 218 | * try for more inode records. |
| 219 | */ |
| 220 | STATIC int |
| 221 | xfs_iwalk_run_callbacks( |
| 222 | struct xfs_iwalk_ag *iwag, |
| 223 | xfs_agnumber_t agno, |
| 224 | struct xfs_btree_cur **curpp, |
| 225 | struct xfs_buf **agi_bpp, |
| 226 | int *has_more) |
| 227 | { |
| 228 | struct xfs_mount *mp = iwag->mp; |
| 229 | struct xfs_trans *tp = iwag->tp; |
| 230 | struct xfs_inobt_rec_incore *irec; |
| 231 | xfs_agino_t restart; |
| 232 | int error; |
| 233 | |
| 234 | ASSERT(iwag->nr_recs > 0); |
| 235 | |
| 236 | /* Delete cursor but remember the last record we cached... */ |
| 237 | xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); |
| 238 | irec = &iwag->recs[iwag->nr_recs - 1]; |
| 239 | restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; |
| 240 | |
| 241 | error = xfs_iwalk_ag_recs(iwag); |
| 242 | if (error) |
| 243 | return error; |
| 244 | |
| 245 | /* ...empty the cache... */ |
| 246 | iwag->nr_recs = 0; |
| 247 | |
| 248 | if (!has_more) |
| 249 | return 0; |
| 250 | |
| 251 | /* ...and recreate the cursor just past where we left off. */ |
| 252 | error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); |
| 253 | if (error) |
| 254 | return error; |
| 255 | |
| 256 | return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); |
| 257 | } |
| 258 | |
| 259 | /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ |
| 260 | STATIC int |
| 261 | xfs_iwalk_ag( |
| 262 | struct xfs_iwalk_ag *iwag) |
| 263 | { |
| 264 | struct xfs_mount *mp = iwag->mp; |
| 265 | struct xfs_trans *tp = iwag->tp; |
| 266 | struct xfs_buf *agi_bp = NULL; |
| 267 | struct xfs_btree_cur *cur = NULL; |
| 268 | xfs_agnumber_t agno; |
| 269 | xfs_agino_t agino; |
| 270 | int has_more; |
| 271 | int error = 0; |
| 272 | |
| 273 | /* Set up our cursor at the right place in the inode btree. */ |
| 274 | agno = XFS_INO_TO_AGNO(mp, iwag->startino); |
| 275 | agino = XFS_INO_TO_AGINO(mp, iwag->startino); |
| 276 | error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); |
| 277 | |
| 278 | while (!error && has_more) { |
| 279 | struct xfs_inobt_rec_incore *irec; |
| 280 | |
| 281 | cond_resched(); |
| 282 | |
| 283 | /* Fetch the inobt record. */ |
| 284 | irec = &iwag->recs[iwag->nr_recs]; |
| 285 | error = xfs_inobt_get_rec(cur, irec, &has_more); |
| 286 | if (error || !has_more) |
| 287 | break; |
| 288 | |
| 289 | /* No allocated inodes in this chunk; skip it. */ |
| 290 | if (irec->ir_freecount == irec->ir_count) { |
| 291 | error = xfs_btree_increment(cur, 0, &has_more); |
| 292 | if (error) |
| 293 | break; |
| 294 | continue; |
| 295 | } |
| 296 | |
| 297 | /* |
| 298 | * Start readahead for this inode chunk in anticipation of |
| 299 | * walking the inodes. |
| 300 | */ |
| 301 | xfs_bulkstat_ichunk_ra(mp, agno, irec); |
| 302 | |
| 303 | /* |
| 304 | * If there's space in the buffer for more records, increment |
| 305 | * the btree cursor and grab more. |
| 306 | */ |
| 307 | if (++iwag->nr_recs < iwag->sz_recs) { |
| 308 | error = xfs_btree_increment(cur, 0, &has_more); |
| 309 | if (error || !has_more) |
| 310 | break; |
| 311 | continue; |
| 312 | } |
| 313 | |
| 314 | /* |
| 315 | * Otherwise, we need to save cursor state and run the callback |
| 316 | * function on the cached records. The run_callbacks function |
| 317 | * is supposed to return a cursor pointing to the record where |
| 318 | * we would be if we had been able to increment like above. |
| 319 | */ |
| 320 | ASSERT(has_more); |
| 321 | error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, |
| 322 | &has_more); |
| 323 | } |
| 324 | |
| 325 | if (iwag->nr_recs == 0 || error) |
| 326 | goto out; |
| 327 | |
| 328 | /* Walk the unprocessed records in the cache. */ |
| 329 | error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more); |
| 330 | |
| 331 | out: |
| 332 | xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); |
| 333 | return error; |
| 334 | } |
| 335 | |
| 336 | /* |
| 337 | * Given the number of inodes to prefetch, set the number of inobt records that |
| 338 | * we cache in memory, which controls the number of inodes we try to read |
| 339 | * ahead. |
| 340 | */ |
| 341 | static inline unsigned int |
| 342 | xfs_iwalk_prefetch( |
| 343 | unsigned int inode_records) |
| 344 | { |
| 345 | return PAGE_SIZE * 4 / sizeof(struct xfs_inobt_rec_incore); |
| 346 | } |
| 347 | |
| 348 | /* |
| 349 | * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn |
| 350 | * will be called for each allocated inode, being passed the inode's number and |
| 351 | * @data. @max_prefetch controls how many inobt records' worth of inodes we |
| 352 | * try to readahead. |
| 353 | */ |
| 354 | int |
| 355 | xfs_iwalk( |
| 356 | struct xfs_mount *mp, |
| 357 | struct xfs_trans *tp, |
| 358 | xfs_ino_t startino, |
| 359 | xfs_iwalk_fn iwalk_fn, |
| 360 | unsigned int inode_records, |
| 361 | void *data) |
| 362 | { |
| 363 | struct xfs_iwalk_ag iwag = { |
| 364 | .mp = mp, |
| 365 | .tp = tp, |
| 366 | .iwalk_fn = iwalk_fn, |
| 367 | .data = data, |
| 368 | .startino = startino, |
| 369 | .sz_recs = xfs_iwalk_prefetch(inode_records), |
| 370 | }; |
| 371 | xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); |
| 372 | int error; |
| 373 | |
| 374 | ASSERT(agno < mp->m_sb.sb_agcount); |
| 375 | |
| 376 | error = xfs_iwalk_alloc(&iwag); |
| 377 | if (error) |
| 378 | return error; |
| 379 | |
| 380 | for (; agno < mp->m_sb.sb_agcount; agno++) { |
| 381 | error = xfs_iwalk_ag(&iwag); |
| 382 | if (error) |
| 383 | break; |
| 384 | iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); |
| 385 | } |
| 386 | |
| 387 | xfs_iwalk_free(&iwag); |
| 388 | return error; |
| 389 | } |