Btrfs: account for large extents with enospc
On our gluster boxes we stream large tar balls of backups onto our fses. With
160gb of ram this means we get really large contiguous ranges of dirty data, but
the way our ENOSPC stuff works is that as long as it's contiguous we only hold
metadata reservation for one extent. The problem is we limit our extents to
128mb, so we'll end up with at least 800 extents so our enospc accounting is
quite a bit lower than what we need. To keep track of this make sure we
increase outstanding_extents for every multiple of the max extent size so we can
be sure to have enough reserved metadata space. Thanks,
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b95792..8564d8c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1530,10 +1530,45 @@
static void btrfs_split_extent_hook(struct inode *inode,
struct extent_state *orig, u64 split)
{
+ u64 size;
+
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return;
+ size = orig->end - orig->start + 1;
+ if (size > BTRFS_MAX_EXTENT_SIZE) {
+ u64 num_extents;
+ u64 new_size;
+
+ /*
+ * We need the largest size of the remaining extent to see if we
+ * need to add a new outstanding extent. Think of the following
+ * case
+ *
+ * [MEAX_EXTENT_SIZEx2 - 4k][4k]
+ *
+ * The new_size would just be 4k and we'd think we had enough
+ * outstanding extents for this if we only took one side of the
+ * split, same goes for the other direction. We need to see if
+ * the larger size still is the same amount of extents as the
+ * original size, because if it is we need to add a new
+ * outstanding extent. But if we split up and the larger size
+ * is less than the original then we are good to go since we've
+ * already accounted for the extra extent in our original
+ * accounting.
+ */
+ new_size = orig->end - split + 1;
+ if ((split - orig->start) > new_size)
+ new_size = split - orig->start;
+
+ num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) < num_extents)
+ return;
+ }
+
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -1549,10 +1584,34 @@
struct extent_state *new,
struct extent_state *other)
{
+ u64 new_size, old_size;
+ u64 num_extents;
+
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
return;
+ old_size = other->end - other->start + 1;
+ new_size = old_size + (new->end - new->start + 1);
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return;
+ }
+
+ /*
+ * If we grew by another max_extent, just return, we want to keep that
+ * reserved amount.
+ */
+ num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) > num_extents)
+ return;
+
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents--;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -1648,6 +1707,8 @@
unsigned *bits)
{
u64 len = state->end + 1 - state->start;
+ u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
+ BTRFS_MAX_EXTENT_SIZE);
spin_lock(&BTRFS_I(inode)->lock);
if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
@@ -1667,7 +1728,7 @@
*bits &= ~EXTENT_FIRST_DELALLOC;
} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents--;
+ BTRFS_I(inode)->outstanding_extents -= num_extents;
spin_unlock(&BTRFS_I(inode)->lock);
}