Btrfs: add device counters for detected IO and checksum errors

The goal is to detect when drives start to show an increased error rate, i.e. when they should be replaced soon. Therefore, statistics counters are added that count IO errors (read, write and flush). Additionally, software-detected errors such as checksum errors and corrupted blocks are counted.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>

commit: 442a4f6308e694e0fa6025708bd5e4e424bbf51c [log] [tgz]
author: Stefan Behrens <sbehrens@giantdisaster.de> Fri May 25 16:06:08 2012 +0200
committer: Josef Bacik <josef@redhat.com> Wed May 30 10:23:39 2012 -0400
tree: e782db1bcbec25283048d77871e0bed7ad04567c
parent: d07eb9117050c9ed3f78296ebcc06128b52693be [diff] [blame]
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 69a527c..b3692c1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c

@@ -1913,6 +1913,7 @@
 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
 		/* try to remap that extent elsewhere? */
 		bio_put(bio);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
 		return -EIO;
 	}
 
@@ -2327,10 +2328,23 @@
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
 			ret = tree->ops->readpage_end_io_hook(page, start, end,
 							      state, mirror);
-			if (ret)
+			if (ret) {
+				/* no IO indicated but software detected errors
+				 * in the block, either checksum errors or
+				 * issues with the contents */
+				struct btrfs_root *root =
+					BTRFS_I(page->mapping->host)->root;
+				struct btrfs_device *device;
+
 				uptodate = 0;
-			else
+				device = btrfs_find_device_for_logical(
+						root, start, mirror);
+				if (device)
+					btrfs_dev_stat_inc_and_print(device,
+						BTRFS_DEV_STAT_CORRUPTION_ERRS);
+			} else {
 				clean_io_failure(start, page);
+			}
 		}
 
 		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
commit	442a4f6308e694e0fa6025708bd5e4e424bbf51c	[log] [tgz]
author	Stefan Behrens <sbehrens@giantdisaster.de>	Fri May 25 16:06:08 2012 +0200
committer	Josef Bacik <josef@redhat.com>	Wed May 30 10:23:39 2012 -0400
tree	e782db1bcbec25283048d77871e0bed7ad04567c
parent	d07eb9117050c9ed3f78296ebcc06128b52693be [diff] [blame]