Vishal Verma | 9e0e252 | 2015-12-24 19:20:32 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Bad block management |
| 3 | * |
| 4 | * - Heavily based on MD badblocks code from Neil Brown |
| 5 | * |
| 6 | * Copyright (c) 2015, Intel Corporation. |
| 7 | * |
| 8 | * This program is free software; you can redistribute it and/or modify it |
| 9 | * under the terms and conditions of the GNU General Public License, |
| 10 | * version 2, as published by the Free Software Foundation. |
| 11 | * |
| 12 | * This program is distributed in the hope it will be useful, but WITHOUT |
| 13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| 15 | * more details. |
| 16 | */ |
| 17 | |
| 18 | #include <linux/badblocks.h> |
| 19 | #include <linux/seqlock.h> |
| 20 | #include <linux/kernel.h> |
| 21 | #include <linux/module.h> |
| 22 | #include <linux/stddef.h> |
| 23 | #include <linux/types.h> |
| 24 | #include <linux/slab.h> |
| 25 | |
| 26 | /** |
| 27 | * badblocks_check() - check a given range for bad sectors |
| 28 | * @bb: the badblocks structure that holds all badblock information |
| 29 | * @s: sector (start) at which to check for badblocks |
| 30 | * @sectors: number of sectors to check for badblocks |
| 31 | * @first_bad: pointer to store location of the first badblock |
| 32 | * @bad_sectors: pointer to store number of badblocks after @first_bad |
| 33 | * |
| 34 | * We can record which blocks on each device are 'bad' and so just |
| 35 | * fail those blocks, or that stripe, rather than the whole device. |
| 36 | * Entries in the bad-block table are 64bits wide. This comprises: |
| 37 | * Length of bad-range, in sectors: 0-511 for lengths 1-512 |
| 38 | * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) |
| 39 | * A 'shift' can be set so that larger blocks are tracked and |
| 40 | * consequently larger devices can be covered. |
| 41 | * 'Acknowledged' flag - 1 bit. - the most significant bit. |
| 42 | * |
| 43 | * Locking of the bad-block table uses a seqlock so badblocks_check |
| 44 | * might need to retry if it is very unlucky. |
| 45 | * We will sometimes want to check for bad blocks in a bi_end_io function, |
| 46 | * so we use the write_seqlock_irq variant. |
| 47 | * |
| 48 | * When looking for a bad block we specify a range and want to |
| 49 | * know if any block in the range is bad. So we binary-search |
| 50 | * to the last range that starts at-or-before the given endpoint, |
| 51 | * (or "before the sector after the target range") |
| 52 | * then see if it ends after the given start. |
| 53 | * |
| 54 | * Return: |
| 55 | * 0: there are no known bad blocks in the range |
| 56 | * 1: there are known bad block which are all acknowledged |
| 57 | * -1: there are bad blocks which have not yet been acknowledged in metadata. |
| 58 | * plus the start/length of the first bad section we overlap. |
| 59 | */ |
| 60 | int badblocks_check(struct badblocks *bb, sector_t s, int sectors, |
| 61 | sector_t *first_bad, int *bad_sectors) |
| 62 | { |
| 63 | int hi; |
| 64 | int lo; |
| 65 | u64 *p = bb->page; |
| 66 | int rv; |
| 67 | sector_t target = s + sectors; |
| 68 | unsigned seq; |
| 69 | |
| 70 | if (bb->shift > 0) { |
| 71 | /* round the start down, and the end up */ |
| 72 | s >>= bb->shift; |
| 73 | target += (1<<bb->shift) - 1; |
| 74 | target >>= bb->shift; |
| 75 | sectors = target - s; |
| 76 | } |
| 77 | /* 'target' is now the first block after the bad range */ |
| 78 | |
| 79 | retry: |
| 80 | seq = read_seqbegin(&bb->lock); |
| 81 | lo = 0; |
| 82 | rv = 0; |
| 83 | hi = bb->count; |
| 84 | |
| 85 | /* Binary search between lo and hi for 'target' |
| 86 | * i.e. for the last range that starts before 'target' |
| 87 | */ |
| 88 | /* INVARIANT: ranges before 'lo' and at-or-after 'hi' |
| 89 | * are known not to be the last range before target. |
| 90 | * VARIANT: hi-lo is the number of possible |
| 91 | * ranges, and decreases until it reaches 1 |
| 92 | */ |
| 93 | while (hi - lo > 1) { |
| 94 | int mid = (lo + hi) / 2; |
| 95 | sector_t a = BB_OFFSET(p[mid]); |
| 96 | |
| 97 | if (a < target) |
| 98 | /* This could still be the one, earlier ranges |
| 99 | * could not. |
| 100 | */ |
| 101 | lo = mid; |
| 102 | else |
| 103 | /* This and later ranges are definitely out. */ |
| 104 | hi = mid; |
| 105 | } |
| 106 | /* 'lo' might be the last that started before target, but 'hi' isn't */ |
| 107 | if (hi > lo) { |
| 108 | /* need to check all range that end after 's' to see if |
| 109 | * any are unacknowledged. |
| 110 | */ |
| 111 | while (lo >= 0 && |
| 112 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { |
| 113 | if (BB_OFFSET(p[lo]) < target) { |
| 114 | /* starts before the end, and finishes after |
| 115 | * the start, so they must overlap |
| 116 | */ |
| 117 | if (rv != -1 && BB_ACK(p[lo])) |
| 118 | rv = 1; |
| 119 | else |
| 120 | rv = -1; |
| 121 | *first_bad = BB_OFFSET(p[lo]); |
| 122 | *bad_sectors = BB_LEN(p[lo]); |
| 123 | } |
| 124 | lo--; |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | if (read_seqretry(&bb->lock, seq)) |
| 129 | goto retry; |
| 130 | |
| 131 | return rv; |
| 132 | } |
| 133 | EXPORT_SYMBOL_GPL(badblocks_check); |
| 134 | |
| 135 | /** |
| 136 | * badblocks_set() - Add a range of bad blocks to the table. |
| 137 | * @bb: the badblocks structure that holds all badblock information |
| 138 | * @s: first sector to mark as bad |
| 139 | * @sectors: number of sectors to mark as bad |
| 140 | * @acknowledged: weather to mark the bad sectors as acknowledged |
| 141 | * |
| 142 | * This might extend the table, or might contract it if two adjacent ranges |
| 143 | * can be merged. We binary-search to find the 'insertion' point, then |
| 144 | * decide how best to handle it. |
| 145 | * |
| 146 | * Return: |
| 147 | * 0: success |
| 148 | * 1: failed to set badblocks (out of space) |
| 149 | */ |
| 150 | int badblocks_set(struct badblocks *bb, sector_t s, int sectors, |
| 151 | int acknowledged) |
| 152 | { |
| 153 | u64 *p; |
| 154 | int lo, hi; |
| 155 | int rv = 0; |
| 156 | unsigned long flags; |
| 157 | |
| 158 | if (bb->shift < 0) |
| 159 | /* badblocks are disabled */ |
| 160 | return 0; |
| 161 | |
| 162 | if (bb->shift) { |
| 163 | /* round the start down, and the end up */ |
| 164 | sector_t next = s + sectors; |
| 165 | |
| 166 | s >>= bb->shift; |
| 167 | next += (1<<bb->shift) - 1; |
| 168 | next >>= bb->shift; |
| 169 | sectors = next - s; |
| 170 | } |
| 171 | |
| 172 | write_seqlock_irqsave(&bb->lock, flags); |
| 173 | |
| 174 | p = bb->page; |
| 175 | lo = 0; |
| 176 | hi = bb->count; |
| 177 | /* Find the last range that starts at-or-before 's' */ |
| 178 | while (hi - lo > 1) { |
| 179 | int mid = (lo + hi) / 2; |
| 180 | sector_t a = BB_OFFSET(p[mid]); |
| 181 | |
| 182 | if (a <= s) |
| 183 | lo = mid; |
| 184 | else |
| 185 | hi = mid; |
| 186 | } |
| 187 | if (hi > lo && BB_OFFSET(p[lo]) > s) |
| 188 | hi = lo; |
| 189 | |
| 190 | if (hi > lo) { |
| 191 | /* we found a range that might merge with the start |
| 192 | * of our new range |
| 193 | */ |
| 194 | sector_t a = BB_OFFSET(p[lo]); |
| 195 | sector_t e = a + BB_LEN(p[lo]); |
| 196 | int ack = BB_ACK(p[lo]); |
| 197 | |
| 198 | if (e >= s) { |
| 199 | /* Yes, we can merge with a previous range */ |
| 200 | if (s == a && s + sectors >= e) |
| 201 | /* new range covers old */ |
| 202 | ack = acknowledged; |
| 203 | else |
| 204 | ack = ack && acknowledged; |
| 205 | |
| 206 | if (e < s + sectors) |
| 207 | e = s + sectors; |
| 208 | if (e - a <= BB_MAX_LEN) { |
| 209 | p[lo] = BB_MAKE(a, e-a, ack); |
| 210 | s = e; |
| 211 | } else { |
| 212 | /* does not all fit in one range, |
| 213 | * make p[lo] maximal |
| 214 | */ |
| 215 | if (BB_LEN(p[lo]) != BB_MAX_LEN) |
| 216 | p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); |
| 217 | s = a + BB_MAX_LEN; |
| 218 | } |
| 219 | sectors = e - s; |
| 220 | } |
| 221 | } |
| 222 | if (sectors && hi < bb->count) { |
| 223 | /* 'hi' points to the first range that starts after 's'. |
| 224 | * Maybe we can merge with the start of that range |
| 225 | */ |
| 226 | sector_t a = BB_OFFSET(p[hi]); |
| 227 | sector_t e = a + BB_LEN(p[hi]); |
| 228 | int ack = BB_ACK(p[hi]); |
| 229 | |
| 230 | if (a <= s + sectors) { |
| 231 | /* merging is possible */ |
| 232 | if (e <= s + sectors) { |
| 233 | /* full overlap */ |
| 234 | e = s + sectors; |
| 235 | ack = acknowledged; |
| 236 | } else |
| 237 | ack = ack && acknowledged; |
| 238 | |
| 239 | a = s; |
| 240 | if (e - a <= BB_MAX_LEN) { |
| 241 | p[hi] = BB_MAKE(a, e-a, ack); |
| 242 | s = e; |
| 243 | } else { |
| 244 | p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); |
| 245 | s = a + BB_MAX_LEN; |
| 246 | } |
| 247 | sectors = e - s; |
| 248 | lo = hi; |
| 249 | hi++; |
| 250 | } |
| 251 | } |
| 252 | if (sectors == 0 && hi < bb->count) { |
| 253 | /* we might be able to combine lo and hi */ |
| 254 | /* Note: 's' is at the end of 'lo' */ |
| 255 | sector_t a = BB_OFFSET(p[hi]); |
| 256 | int lolen = BB_LEN(p[lo]); |
| 257 | int hilen = BB_LEN(p[hi]); |
| 258 | int newlen = lolen + hilen - (s - a); |
| 259 | |
| 260 | if (s >= a && newlen < BB_MAX_LEN) { |
| 261 | /* yes, we can combine them */ |
| 262 | int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); |
| 263 | |
| 264 | p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); |
| 265 | memmove(p + hi, p + hi + 1, |
| 266 | (bb->count - hi - 1) * 8); |
| 267 | bb->count--; |
| 268 | } |
| 269 | } |
| 270 | while (sectors) { |
| 271 | /* didn't merge (it all). |
| 272 | * Need to add a range just before 'hi' |
| 273 | */ |
| 274 | if (bb->count >= MAX_BADBLOCKS) { |
| 275 | /* No room for more */ |
| 276 | rv = 1; |
| 277 | break; |
| 278 | } else { |
| 279 | int this_sectors = sectors; |
| 280 | |
| 281 | memmove(p + hi + 1, p + hi, |
| 282 | (bb->count - hi) * 8); |
| 283 | bb->count++; |
| 284 | |
| 285 | if (this_sectors > BB_MAX_LEN) |
| 286 | this_sectors = BB_MAX_LEN; |
| 287 | p[hi] = BB_MAKE(s, this_sectors, acknowledged); |
| 288 | sectors -= this_sectors; |
| 289 | s += this_sectors; |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | bb->changed = 1; |
| 294 | if (!acknowledged) |
| 295 | bb->unacked_exist = 1; |
| 296 | write_sequnlock_irqrestore(&bb->lock, flags); |
| 297 | |
| 298 | return rv; |
| 299 | } |
| 300 | EXPORT_SYMBOL_GPL(badblocks_set); |
| 301 | |
| 302 | /** |
| 303 | * badblocks_clear() - Remove a range of bad blocks to the table. |
| 304 | * @bb: the badblocks structure that holds all badblock information |
| 305 | * @s: first sector to mark as bad |
| 306 | * @sectors: number of sectors to mark as bad |
| 307 | * |
| 308 | * This may involve extending the table if we spilt a region, |
| 309 | * but it must not fail. So if the table becomes full, we just |
| 310 | * drop the remove request. |
| 311 | * |
| 312 | * Return: |
| 313 | * 0: success |
| 314 | * 1: failed to clear badblocks |
| 315 | */ |
| 316 | int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) |
| 317 | { |
| 318 | u64 *p; |
| 319 | int lo, hi; |
| 320 | sector_t target = s + sectors; |
| 321 | int rv = 0; |
| 322 | |
| 323 | if (bb->shift > 0) { |
| 324 | /* When clearing we round the start up and the end down. |
| 325 | * This should not matter as the shift should align with |
| 326 | * the block size and no rounding should ever be needed. |
| 327 | * However it is better the think a block is bad when it |
| 328 | * isn't than to think a block is not bad when it is. |
| 329 | */ |
| 330 | s += (1<<bb->shift) - 1; |
| 331 | s >>= bb->shift; |
| 332 | target >>= bb->shift; |
| 333 | sectors = target - s; |
| 334 | } |
| 335 | |
| 336 | write_seqlock_irq(&bb->lock); |
| 337 | |
| 338 | p = bb->page; |
| 339 | lo = 0; |
| 340 | hi = bb->count; |
| 341 | /* Find the last range that starts before 'target' */ |
| 342 | while (hi - lo > 1) { |
| 343 | int mid = (lo + hi) / 2; |
| 344 | sector_t a = BB_OFFSET(p[mid]); |
| 345 | |
| 346 | if (a < target) |
| 347 | lo = mid; |
| 348 | else |
| 349 | hi = mid; |
| 350 | } |
| 351 | if (hi > lo) { |
| 352 | /* p[lo] is the last range that could overlap the |
| 353 | * current range. Earlier ranges could also overlap, |
| 354 | * but only this one can overlap the end of the range. |
| 355 | */ |
| 356 | if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { |
| 357 | /* Partial overlap, leave the tail of this range */ |
| 358 | int ack = BB_ACK(p[lo]); |
| 359 | sector_t a = BB_OFFSET(p[lo]); |
| 360 | sector_t end = a + BB_LEN(p[lo]); |
| 361 | |
| 362 | if (a < s) { |
| 363 | /* we need to split this range */ |
| 364 | if (bb->count >= MAX_BADBLOCKS) { |
| 365 | rv = -ENOSPC; |
| 366 | goto out; |
| 367 | } |
| 368 | memmove(p+lo+1, p+lo, (bb->count - lo) * 8); |
| 369 | bb->count++; |
| 370 | p[lo] = BB_MAKE(a, s-a, ack); |
| 371 | lo++; |
| 372 | } |
| 373 | p[lo] = BB_MAKE(target, end - target, ack); |
| 374 | /* there is no longer an overlap */ |
| 375 | hi = lo; |
| 376 | lo--; |
| 377 | } |
| 378 | while (lo >= 0 && |
| 379 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { |
| 380 | /* This range does overlap */ |
| 381 | if (BB_OFFSET(p[lo]) < s) { |
| 382 | /* Keep the early parts of this range. */ |
| 383 | int ack = BB_ACK(p[lo]); |
| 384 | sector_t start = BB_OFFSET(p[lo]); |
| 385 | |
| 386 | p[lo] = BB_MAKE(start, s - start, ack); |
| 387 | /* now low doesn't overlap, so.. */ |
| 388 | break; |
| 389 | } |
| 390 | lo--; |
| 391 | } |
| 392 | /* 'lo' is strictly before, 'hi' is strictly after, |
| 393 | * anything between needs to be discarded |
| 394 | */ |
| 395 | if (hi - lo > 1) { |
| 396 | memmove(p+lo+1, p+hi, (bb->count - hi) * 8); |
| 397 | bb->count -= (hi - lo - 1); |
| 398 | } |
| 399 | } |
| 400 | |
| 401 | bb->changed = 1; |
| 402 | out: |
| 403 | write_sequnlock_irq(&bb->lock); |
| 404 | return rv; |
| 405 | } |
| 406 | EXPORT_SYMBOL_GPL(badblocks_clear); |
| 407 | |
| 408 | /** |
| 409 | * ack_all_badblocks() - Acknowledge all bad blocks in a list. |
| 410 | * @bb: the badblocks structure that holds all badblock information |
| 411 | * |
| 412 | * This only succeeds if ->changed is clear. It is used by |
| 413 | * in-kernel metadata updates |
| 414 | */ |
| 415 | void ack_all_badblocks(struct badblocks *bb) |
| 416 | { |
| 417 | if (bb->page == NULL || bb->changed) |
| 418 | /* no point even trying */ |
| 419 | return; |
| 420 | write_seqlock_irq(&bb->lock); |
| 421 | |
| 422 | if (bb->changed == 0 && bb->unacked_exist) { |
| 423 | u64 *p = bb->page; |
| 424 | int i; |
| 425 | |
| 426 | for (i = 0; i < bb->count ; i++) { |
| 427 | if (!BB_ACK(p[i])) { |
| 428 | sector_t start = BB_OFFSET(p[i]); |
| 429 | int len = BB_LEN(p[i]); |
| 430 | |
| 431 | p[i] = BB_MAKE(start, len, 1); |
| 432 | } |
| 433 | } |
| 434 | bb->unacked_exist = 0; |
| 435 | } |
| 436 | write_sequnlock_irq(&bb->lock); |
| 437 | } |
| 438 | EXPORT_SYMBOL_GPL(ack_all_badblocks); |
| 439 | |
| 440 | /** |
| 441 | * badblocks_show() - sysfs access to bad-blocks list |
| 442 | * @bb: the badblocks structure that holds all badblock information |
| 443 | * @page: buffer received from sysfs |
| 444 | * @unack: weather to show unacknowledged badblocks |
| 445 | * |
| 446 | * Return: |
| 447 | * Length of returned data |
| 448 | */ |
| 449 | ssize_t badblocks_show(struct badblocks *bb, char *page, int unack) |
| 450 | { |
| 451 | size_t len; |
| 452 | int i; |
| 453 | u64 *p = bb->page; |
| 454 | unsigned seq; |
| 455 | |
| 456 | if (bb->shift < 0) |
| 457 | return 0; |
| 458 | |
| 459 | retry: |
| 460 | seq = read_seqbegin(&bb->lock); |
| 461 | |
| 462 | len = 0; |
| 463 | i = 0; |
| 464 | |
| 465 | while (len < PAGE_SIZE && i < bb->count) { |
| 466 | sector_t s = BB_OFFSET(p[i]); |
| 467 | unsigned int length = BB_LEN(p[i]); |
| 468 | int ack = BB_ACK(p[i]); |
| 469 | |
| 470 | i++; |
| 471 | |
| 472 | if (unack && ack) |
| 473 | continue; |
| 474 | |
| 475 | len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", |
| 476 | (unsigned long long)s << bb->shift, |
| 477 | length << bb->shift); |
| 478 | } |
| 479 | if (unack && len == 0) |
| 480 | bb->unacked_exist = 0; |
| 481 | |
| 482 | if (read_seqretry(&bb->lock, seq)) |
| 483 | goto retry; |
| 484 | |
| 485 | return len; |
| 486 | } |
| 487 | EXPORT_SYMBOL_GPL(badblocks_show); |
| 488 | |
| 489 | /** |
| 490 | * badblocks_store() - sysfs access to bad-blocks list |
| 491 | * @bb: the badblocks structure that holds all badblock information |
| 492 | * @page: buffer received from sysfs |
| 493 | * @len: length of data received from sysfs |
| 494 | * @unack: weather to show unacknowledged badblocks |
| 495 | * |
| 496 | * Return: |
| 497 | * Length of the buffer processed or -ve error. |
| 498 | */ |
| 499 | ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, |
| 500 | int unack) |
| 501 | { |
| 502 | unsigned long long sector; |
| 503 | int length; |
| 504 | char newline; |
| 505 | |
| 506 | switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { |
| 507 | case 3: |
| 508 | if (newline != '\n') |
| 509 | return -EINVAL; |
| 510 | case 2: |
| 511 | if (length <= 0) |
| 512 | return -EINVAL; |
| 513 | break; |
| 514 | default: |
| 515 | return -EINVAL; |
| 516 | } |
| 517 | |
| 518 | if (badblocks_set(bb, sector, length, !unack)) |
| 519 | return -ENOSPC; |
| 520 | else |
| 521 | return len; |
| 522 | } |
| 523 | EXPORT_SYMBOL_GPL(badblocks_store); |
| 524 | |
| 525 | /** |
| 526 | * badblocks_init() - initialize the badblocks structure |
| 527 | * @bb: the badblocks structure that holds all badblock information |
| 528 | * @enable: weather to enable badblocks accounting |
| 529 | * |
| 530 | * Return: |
| 531 | * 0: success |
| 532 | * -ve errno: on error |
| 533 | */ |
| 534 | int badblocks_init(struct badblocks *bb, int enable) |
| 535 | { |
| 536 | bb->count = 0; |
| 537 | if (enable) |
| 538 | bb->shift = 0; |
| 539 | else |
| 540 | bb->shift = -1; |
| 541 | bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
| 542 | if (bb->page == (u64 *)0) { |
| 543 | bb->shift = -1; |
| 544 | return -ENOMEM; |
| 545 | } |
| 546 | seqlock_init(&bb->lock); |
| 547 | |
| 548 | return 0; |
| 549 | } |
| 550 | EXPORT_SYMBOL_GPL(badblocks_init); |
| 551 | |
| 552 | /** |
Dan Williams | d3b407fb | 2016-01-06 12:19:22 -0800 | [diff] [blame^] | 553 | * badblocks_exit() - free the badblocks structure |
Vishal Verma | 9e0e252 | 2015-12-24 19:20:32 -0700 | [diff] [blame] | 554 | * @bb: the badblocks structure that holds all badblock information |
| 555 | */ |
Dan Williams | d3b407fb | 2016-01-06 12:19:22 -0800 | [diff] [blame^] | 556 | void badblocks_exit(struct badblocks *bb) |
Vishal Verma | 9e0e252 | 2015-12-24 19:20:32 -0700 | [diff] [blame] | 557 | { |
| 558 | kfree(bb->page); |
| 559 | bb->page = NULL; |
| 560 | } |
Dan Williams | d3b407fb | 2016-01-06 12:19:22 -0800 | [diff] [blame^] | 561 | EXPORT_SYMBOL_GPL(badblocks_exit); |