diff mbox

btrfs: make the delalloc block rsv per inode

Message ID 1503004997-8396-1-git-send-email-jbacik@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Josef Bacik Aug. 17, 2017, 9:23 p.m. UTC
From: Josef Bacik <jbacik@fb.com>

The way we handle delalloc metadata reservations has gotten
progressively more complicated over the years.  There is so much cruft
and weirdness around keeping the reserved count and outstanding counters
consistent and handling the error cases that it's impossible to
understand.

Fix this by making the delalloc block rsv per-inode.  This way we can
calculate the actual size of the outstanding metadata reservations every
time we make a change, and then reserve the delta based on that amount.
This greatly simplifies the code everywhere, and makes the error
handling in btrfs_delalloc_reserve_metadata far less terrifying.

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/btrfs_inode.h       |  29 ++--
 fs/btrfs/ctree.h             |   5 +-
 fs/btrfs/delayed-inode.c     |  46 +-----
 fs/btrfs/disk-io.c           |  18 ++-
 fs/btrfs/extent-tree.c       | 330 +++++++++++++++----------------------------
 fs/btrfs/inode.c             |  18 ++-
 include/trace/events/btrfs.h |  22 ---
 7 files changed, 147 insertions(+), 321 deletions(-)

Comments

Nikolay Borisov Aug. 21, 2017, 12:48 p.m. UTC | #1
On 18.08.2017 00:23, josef@toxicpanda.com wrote:
> From: Josef Bacik <jbacik@fb.com>
> 
> The way we handle delalloc metadata reservations has gotten
> progressively more complicated over the years.  There is so much cruft
> and weirdness around keeping the reserved count and outstanding counters
> consistent and handling the error cases that it's impossible to
> understand.
> 
> Fix this by making the delalloc block rsv per-inode.  This way we can
> calculate the actual size of the outstanding metadata reservations every
> time we make a change, and then reserve the delta based on that amount.
> This greatly simplifies the code everywhere, and makes the error
> handling in btrfs_delalloc_reserve_metadata far less terrifying.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
>  fs/btrfs/btrfs_inode.h       |  29 ++--
>  fs/btrfs/ctree.h             |   5 +-
>  fs/btrfs/delayed-inode.c     |  46 +-----
>  fs/btrfs/disk-io.c           |  18 ++-
>  fs/btrfs/extent-tree.c       | 330 +++++++++++++++----------------------------
>  fs/btrfs/inode.c             |  18 ++-
>  include/trace/events/btrfs.h |  22 ---
>  7 files changed, 147 insertions(+), 321 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 2894ad6..71a4ded 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -36,14 +36,13 @@
>  #define BTRFS_INODE_ORPHAN_META_RESERVED	1
>  #define BTRFS_INODE_DUMMY			2
>  #define BTRFS_INODE_IN_DEFRAG			3
> -#define BTRFS_INODE_DELALLOC_META_RESERVED	4
> -#define BTRFS_INODE_HAS_ORPHAN_ITEM		5
> -#define BTRFS_INODE_HAS_ASYNC_EXTENT		6
> -#define BTRFS_INODE_NEEDS_FULL_SYNC		7
> -#define BTRFS_INODE_COPY_EVERYTHING		8
> -#define BTRFS_INODE_IN_DELALLOC_LIST		9
> -#define BTRFS_INODE_READDIO_NEED_LOCK		10
> -#define BTRFS_INODE_HAS_PROPS		        11
> +#define BTRFS_INODE_HAS_ORPHAN_ITEM		4
> +#define BTRFS_INODE_HAS_ASYNC_EXTENT		5
> +#define BTRFS_INODE_NEEDS_FULL_SYNC		6
> +#define BTRFS_INODE_COPY_EVERYTHING		7
> +#define BTRFS_INODE_IN_DELALLOC_LIST		8
> +#define BTRFS_INODE_READDIO_NEED_LOCK		9
> +#define BTRFS_INODE_HAS_PROPS		        10
>  
>  /* in memory btrfs inode */
>  struct btrfs_inode {
> @@ -176,7 +175,8 @@ struct btrfs_inode {
>  	 * of extent items we've reserved metadata for.
>  	 */
>  	unsigned outstanding_extents;
> -	unsigned reserved_extents;
> +
> +	struct btrfs_block_rsv block_rsv;
>  
>  	/*
>  	 * Cached values if inode properties
> @@ -278,17 +278,6 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
>  						  mod);
>  }
>  
> -static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode,
> -					      int mod)
> -{
> -	ASSERT(spin_is_locked(&inode->lock));
> -	inode->reserved_extents += mod;
> -	if (btrfs_is_free_space_inode(inode))
> -		return;
> -	trace_btrfs_inode_mod_reserved_extents(inode->root, btrfs_ino(inode),
> -					       mod);
> -}
> -
>  static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
>  {
>  	int ret = 0;
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index b43114d..a4ee115 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -763,8 +763,6 @@ struct btrfs_fs_info {
>  	 * delayed dir index item
>  	 */
>  	struct btrfs_block_rsv global_block_rsv;
> -	/* block reservation for delay allocation */
> -	struct btrfs_block_rsv delalloc_block_rsv;
>  	/* block reservation for metadata operations */
>  	struct btrfs_block_rsv trans_block_rsv;
>  	/* block reservation for chunk tree */
> @@ -2744,6 +2742,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
>  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
>  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
>  					      unsigned short type);
> +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
> +				   struct btrfs_block_rsv *rsv,
> +				   unsigned short type);
>  void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
>  			  struct btrfs_block_rsv *rsv);
>  void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
> diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
> index 19e4ad2..5d73f79 100644
> --- a/fs/btrfs/delayed-inode.c
> +++ b/fs/btrfs/delayed-inode.c
> @@ -581,7 +581,6 @@ static int btrfs_delayed_inode_reserve_metadata(
>  	struct btrfs_block_rsv *dst_rsv;
>  	u64 num_bytes;
>  	int ret;
> -	bool release = false;
>  
>  	src_rsv = trans->block_rsv;
>  	dst_rsv = &fs_info->delayed_block_rsv;
> @@ -589,36 +588,13 @@ static int btrfs_delayed_inode_reserve_metadata(
>  	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
>  
>  	/*
> -	 * If our block_rsv is the delalloc block reserve then check and see if
> -	 * we have our extra reservation for updating the inode.  If not fall
> -	 * through and try to reserve space quickly.
> -	 *
> -	 * We used to try and steal from the delalloc block rsv or the global
> -	 * reserve, but we'd steal a full reservation, which isn't kind.  We are
> -	 * here through delalloc which means we've likely just cowed down close
> -	 * to the leaf that contains the inode, so we would steal less just
> -	 * doing the fallback inode update, so if we do end up having to steal
> -	 * from the global block rsv we hopefully only steal one or two blocks
> -	 * worth which is less likely to hurt us.
> -	 */
> -	if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
> -		spin_lock(&inode->lock);
> -		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> -				       &inode->runtime_flags))
> -			release = true;
> -		else
> -			src_rsv = NULL;
> -		spin_unlock(&inode->lock);
> -	}
> -
> -	/*
>  	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
>  	 * which doesn't reserve space for speed.  This is a problem since we
>  	 * still need to reserve space for this update, so try to reserve the
>  	 * space.
>  	 *
>  	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
> -	 * we're accounted for.
> +	 * we always reserve enough to update the inode item.
>  	 */
>  	if (!src_rsv || (!trans->bytes_reserved &&
>  			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
> @@ -643,32 +619,12 @@ static int btrfs_delayed_inode_reserve_metadata(
>  	}
>  
>  	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
> -
> -	/*
> -	 * Migrate only takes a reservation, it doesn't touch the size of the
> -	 * block_rsv.  This is to simplify people who don't normally have things
> -	 * migrated from their block rsv.  If they go to release their
> -	 * reservation, that will decrease the size as well, so if migrate
> -	 * reduced size we'd end up with a negative size.  But for the
> -	 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
> -	 * but we could in fact do this reserve/migrate dance several times
> -	 * between the time we did the original reservation and we'd clean it
> -	 * up.  So to take care of this, release the space for the meta
> -	 * reservation here.  I think it may be time for a documentation page on
> -	 * how block rsvs. work.
> -	 */
>  	if (!ret) {
>  		trace_btrfs_space_reservation(fs_info, "delayed_inode",
>  					      btrfs_ino(inode), num_bytes, 1);
>  		node->bytes_reserved = num_bytes;
>  	}
>  
> -	if (release) {
> -		trace_btrfs_space_reservation(fs_info, "delalloc",
> -					      btrfs_ino(inode), num_bytes, 0);
> -		btrfs_block_rsv_release(fs_info, src_rsv, num_bytes);
> -	}
> -
>  	return ret;
>  }
>  
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 9283e28..5681f8f 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2607,14 +2607,6 @@ int open_ctree(struct super_block *sb,
>  		goto fail_delalloc_bytes;
>  	}
>  
> -	fs_info->btree_inode = new_inode(sb);
> -	if (!fs_info->btree_inode) {
> -		err = -ENOMEM;
> -		goto fail_bio_counter;
> -	}
> -
> -	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
> -
>  	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
>  	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
>  	INIT_LIST_HEAD(&fs_info->trans_list);
> @@ -2647,8 +2639,6 @@ int open_ctree(struct super_block *sb,
>  	btrfs_mapping_init(&fs_info->mapping_tree);
>  	btrfs_init_block_rsv(&fs_info->global_block_rsv,
>  			     BTRFS_BLOCK_RSV_GLOBAL);
> -	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
> -			     BTRFS_BLOCK_RSV_DELALLOC);
>  	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
>  	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
>  	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
> @@ -2679,6 +2669,14 @@ int open_ctree(struct super_block *sb,
>  
>  	INIT_LIST_HEAD(&fs_info->ordered_roots);
>  	spin_lock_init(&fs_info->ordered_root_lock);
> +
> +	fs_info->btree_inode = new_inode(sb);
> +	if (!fs_info->btree_inode) {
> +		err = -ENOMEM;
> +		goto fail_bio_counter;
> +	}
> +	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
> +
>  	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
>  					GFP_KERNEL);
>  	if (!fs_info->delayed_root) {

This hunk seems to be a leftover from some other work, so you can drop it
for v2. Of course the removal of the call to btrfs_init_block_rsv
should stay.

> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index a313487..8484e0f 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4761,7 +4761,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
>  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
>  			    u64 orig, bool wait_ordered)
>  {
> -	struct btrfs_block_rsv *block_rsv;
>  	struct btrfs_space_info *space_info;
>  	struct btrfs_trans_handle *trans;
>  	u64 delalloc_bytes;
> @@ -4777,8 +4776,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
>  	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
>  
>  	trans = (struct btrfs_trans_handle *)current->journal_info;
> -	block_rsv = &fs_info->delalloc_block_rsv;
> -	space_info = block_rsv->space_info;
> +	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
>  
>  	delalloc_bytes = percpu_counter_sum_positive(
>  						&fs_info->delalloc_bytes);
> @@ -5502,7 +5500,8 @@ static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
>  
>  static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
>  				    struct btrfs_block_rsv *block_rsv,
> -				    struct btrfs_block_rsv *dest, u64 num_bytes)
> +				    struct btrfs_block_rsv *dest, u64 num_bytes,
> +				    u64 *released_bytes)
>  {

Instead of adding yet another parameter, why not just return the value?
That way callers can handle/check it of their own accord. It seems this
is only ever used for debugging (tracepoint) purposes anyway.

>  	struct btrfs_space_info *space_info = block_rsv->space_info;
>  
> @@ -5520,6 +5519,8 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
>  	spin_unlock(&block_rsv->lock);
>  
>  	if (num_bytes > 0) {
> +		if (released_bytes)
> +			*released_bytes = num_bytes;
>  		if (dest) {
>  			spin_lock(&dest->lock);
>  			if (!dest->full) {
> @@ -5561,6 +5562,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
>  	rsv->type = type;
>  }
>  
> +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
> +				   struct btrfs_block_rsv *rsv,
> +				   unsigned short type)
> +{
> +	btrfs_init_block_rsv(rsv, type);
> +	rsv->space_info = __find_space_info(fs_info,
> +					    BTRFS_BLOCK_GROUP_METADATA);
> +}
> +
>  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
>  					      unsigned short type)
>  {
> @@ -5570,9 +5580,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
>  	if (!block_rsv)
>  		return NULL;
>  
> -	btrfs_init_block_rsv(block_rsv, type);
> -	block_rsv->space_info = __find_space_info(fs_info,
> -						  BTRFS_BLOCK_GROUP_METADATA);
> +	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
>  	return block_rsv;
>  }
>  
> @@ -5655,6 +5663,68 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
>  	return ret;
>  }
>  
> +/**
> + * btrfs_inode_rsv_refill - refill the inode block rsv.
> + * @inode - the inode we are refilling.
> + * @flush - the flushing restriction.
> + *
> + * Essentially the same as btrfs_block_rsv_refill, except it uses the
> + * block_rsv->size as the minimum size.  We'll either refill the missing amount
> + * or return if we already have enough space.  This will also handle the reserve
> + * tracepoint for the reserved amount.
> + */
> +int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
> +			   enum btrfs_reserve_flush_enum flush)
> +{
> +	struct btrfs_root *root = inode->root;
> +	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> +	u64 num_bytes = 0;
> +	int ret = -ENOSPC;
> +
> +	spin_lock(&block_rsv->lock);
> +	if (block_rsv->reserved < block_rsv->size)
> +		num_bytes = block_rsv->size - block_rsv->reserved;
> +	spin_unlock(&block_rsv->lock);
> +
> +	if (num_bytes == 0)
> +		return 0;
> +
> +	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
> +	if (!ret) {
> +		block_rsv_add_bytes(block_rsv, num_bytes, 0);
> +		trace_btrfs_space_reservation(root->fs_info, "delalloc",
> +					      btrfs_ino(inode), num_bytes, 1);
> +	}
> +	return ret;
> +}
> +
> +/**
> + * btrfs_inode_rsv_release - release any excessive reservation.
> + * @inode - the inode we need to release from.
> + *
> + * This is the same as btrfs_block_rsv_release, except that it handles the
> + * tracepoint for the reservation.
> + */
> +void btrfs_inode_rsv_release(struct btrfs_inode *inode)
> +{
> +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> +	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> +	u64 release_bytes = 0;
> +
> +	/*
> +	 * Since we statically set the block_rsv->size we just want to say we
> +	 * are releasing 0 bytes, and then we'll just get the reservation over
> +	 * the size free'd.
> +	 */
> +	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
> +				&release_bytes);
> +	if (release_bytes > 0)
> +		trace_btrfs_space_reservation(fs_info, "delalloc",
> +					      btrfs_ino(inode), release_bytes,
> +					      0);
> +}
> +
>  void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
>  			     struct btrfs_block_rsv *block_rsv,
>  			     u64 num_bytes)
> @@ -5664,7 +5734,8 @@ void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
>  	if (global_rsv == block_rsv ||
>  	    block_rsv->space_info != global_rsv->space_info)
>  		global_rsv = NULL;
> -	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
> +	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes,
> +				NULL);
>  }
>  
>  static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
> @@ -5726,7 +5797,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
>  
>  	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
>  	fs_info->global_block_rsv.space_info = space_info;
> -	fs_info->delalloc_block_rsv.space_info = space_info;
>  	fs_info->trans_block_rsv.space_info = space_info;
>  	fs_info->empty_block_rsv.space_info = space_info;
>  	fs_info->delayed_block_rsv.space_info = space_info;
> @@ -5745,9 +5815,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
>  static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
>  {
>  	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
> -				(u64)-1);
> -	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
> -	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
> +				(u64)-1, NULL);
>  	WARN_ON(fs_info->trans_block_rsv.size > 0);
>  	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
>  	WARN_ON(fs_info->chunk_block_rsv.size > 0);
> @@ -5786,7 +5854,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
>  	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
>  
>  	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
> -				trans->chunk_bytes_reserved);
> +				trans->chunk_bytes_reserved, NULL);
>  	trans->chunk_bytes_reserved = 0;
>  }
>  
> @@ -5886,95 +5954,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
>  	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
>  }
>  
> -/**
> - * drop_over_reserved_extents - drop our extra extent reservations
> - * @inode: the inode we're dropping the extent for
> - *
> - * We reserve extents we may use, but they may have been merged with other
> - * extents and we may not need the extra reservation.
> - *
> - * We also call this when we've completed io to an extent or had an error and
> - * cleared the outstanding extent, in either case we no longer need our
> - * reservation and can drop the excess.
> - */
> -static unsigned drop_over_reserved_extents(struct btrfs_inode *inode)
> -{
> -	unsigned num_extents = 0;
> -
> -	if (inode->reserved_extents > inode->outstanding_extents) {
> -		num_extents = inode->reserved_extents -
> -			inode->outstanding_extents;
> -		btrfs_mod_reserved_extents(inode, -num_extents);
> -	}
> -
> -	if (inode->outstanding_extents == 0 &&
> -	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> -			       &inode->runtime_flags))
> -		num_extents++;
> -	return num_extents;
> -}
> -
> -/**
> - * calc_csum_metadata_size - return the amount of metadata space that must be
> - *	reserved/freed for the given bytes.
> - * @inode: the inode we're manipulating
> - * @num_bytes: the number of bytes in question
> - * @reserve: 1 if we are reserving space, 0 if we are freeing space
> - *
> - * This adjusts the number of csum_bytes in the inode and then returns the
> - * correct amount of metadata that must either be reserved or freed.  We
> - * calculate how many checksums we can fit into one leaf and then divide the
> - * number of bytes that will need to be checksumed by this value to figure out
> - * how many checksums will be required.  If we are adding bytes then the number
> - * may go up and we will return the number of additional bytes that must be
> - * reserved.  If it is going down we will return the number of bytes that must
> - * be freed.
> - *
> - * This must be called with BTRFS_I(inode)->lock held.
> - */
> -static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
> -				   int reserve)
> +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
> +						 struct btrfs_inode *inode)
>  {
> -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> -	u64 old_csums, num_csums;
> +	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> +	u64 reserve_size = 0;
> +	u64 csum_leaves;
> +	unsigned outstanding_extents;
>  
> -	if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
> -		return 0;
> +	ASSERT(spin_is_locked(&inode->lock));

How about using lockdep_is_held so that this is caught even if btrfs is
compiled without assertions enabled?

> +	outstanding_extents = inode->outstanding_extents;
> +	if (outstanding_extents)
> +		reserve_size = btrfs_calc_trans_metadata_size(fs_info,
> +						outstanding_extents + 1);
> +	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
> +						 inode->csum_bytes);
> +	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
> +						       csum_leaves);
>  
> -	old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
> -	if (reserve)
> -		inode->csum_bytes += num_bytes;
> -	else
> -		inode->csum_bytes -= num_bytes;
> -	num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
> -
> -	/* No change, no need to reserve more */
> -	if (old_csums == num_csums)
> -		return 0;
> -
> -	if (reserve)
> -		return btrfs_calc_trans_metadata_size(fs_info,
> -						      num_csums - old_csums);
> -
> -	return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
> +	spin_lock(&block_rsv->lock);
> +	block_rsv->size = reserve_size;
> +	spin_unlock(&block_rsv->lock);
>  }
>  
>  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
>  	struct btrfs_root *root = inode->root;
> -	struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
> -	u64 to_reserve = 0;
> -	u64 csum_bytes;
> -	unsigned nr_extents, reserve_extents;
> +	unsigned nr_extents;
>  	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
>  	int ret = 0;
>  	bool delalloc_lock = true;
> -	u64 to_free = 0;
> -	unsigned dropped;
> -	bool release_extra = false;
> -	bool underflow = false;
> -	bool did_retry = false;
>  
>  	WARN_ON(btrfs_is_free_space_inode(inode));
>  
> @@ -6001,33 +6011,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>  		mutex_lock(&inode->delalloc_mutex);
>  
>  	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
> -retry:
> +
> +	/* Add our new extents and calculate the new rsv size. */
>  	spin_lock(&inode->lock);
> -	reserve_extents = nr_extents = count_max_extents(num_bytes);
> +	nr_extents = count_max_extents(num_bytes);
>  	btrfs_mod_outstanding_extents(inode, nr_extents);
> -
> -	/*
> -	 * Because we add an outstanding extent for ordered before we clear
> -	 * delalloc we will double count our outstanding extents slightly.  This
> -	 * could mean that we transiently over-reserve, which could result in an
> -	 * early ENOSPC if our timing is unlucky.  Keep track of the case that
> -	 * we had a reservation underflow so we can retry if we fail.
> -	 *
> -	 * Keep in mind we can legitimately have more outstanding extents than
> -	 * reserved because of fragmentation, so only allow a retry once.
> -	 */
> -	if (inode->outstanding_extents >
> -	    inode->reserved_extents + nr_extents) {
> -		reserve_extents = inode->outstanding_extents -
> -			inode->reserved_extents;
> -		underflow = true;
> -	}
> -
> -	/* We always want to reserve a slot for updating the inode. */
> -	to_reserve = btrfs_calc_trans_metadata_size(fs_info,
> -						    reserve_extents + 1);
> -	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
> -	csum_bytes = inode->csum_bytes;
> +	inode->csum_bytes += num_bytes;
> +	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
>  	spin_unlock(&inode->lock);
>  
>  	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
> @@ -6037,100 +6027,26 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>  			goto out_fail;
>  	}
>  
> -	ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
> +	ret = btrfs_inode_rsv_refill(inode, flush);
>  	if (unlikely(ret)) {
>  		btrfs_qgroup_free_meta(root,
>  				       nr_extents * fs_info->nodesize);
>  		goto out_fail;
>  	}
>  
> -	spin_lock(&inode->lock);
> -	if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> -			     &inode->runtime_flags)) {
> -		to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
> -		release_extra = true;
> -	}
> -	btrfs_mod_reserved_extents(inode, reserve_extents);
> -	spin_unlock(&inode->lock);
> -
>  	if (delalloc_lock)
>  		mutex_unlock(&inode->delalloc_mutex);
> -
> -	if (to_reserve)
> -		trace_btrfs_space_reservation(fs_info, "delalloc",
> -					      btrfs_ino(inode), to_reserve, 1);
> -	if (release_extra)
> -		btrfs_block_rsv_release(fs_info, block_rsv,
> -				btrfs_calc_trans_metadata_size(fs_info, 1));
>  	return 0;
>  
>  out_fail:
>  	spin_lock(&inode->lock);
>  	nr_extents = count_max_extents(num_bytes);
>  	btrfs_mod_outstanding_extents(inode, -nr_extents);
> -
> -	dropped = drop_over_reserved_extents(inode);
> -	/*
> -	 * If the inodes csum_bytes is the same as the original
> -	 * csum_bytes then we know we haven't raced with any free()ers
> -	 * so we can just reduce our inodes csum bytes and carry on.
> -	 */
> -	if (inode->csum_bytes == csum_bytes) {
> -		calc_csum_metadata_size(inode, num_bytes, 0);
> -	} else {
> -		u64 orig_csum_bytes = inode->csum_bytes;
> -		u64 bytes;
> -
> -		/*
> -		 * This is tricky, but first we need to figure out how much we
> -		 * freed from any free-ers that occurred during this
> -		 * reservation, so we reset ->csum_bytes to the csum_bytes
> -		 * before we dropped our lock, and then call the free for the
> -		 * number of bytes that were freed while we were trying our
> -		 * reservation.
> -		 */
> -		bytes = csum_bytes - inode->csum_bytes;
> -		inode->csum_bytes = csum_bytes;
> -		to_free = calc_csum_metadata_size(inode, bytes, 0);
> -
> -
> -		/*
> -		 * Now we need to see how much we would have freed had we not
> -		 * been making this reservation and our ->csum_bytes were not
> -		 * artificially inflated.
> -		 */
> -		inode->csum_bytes = csum_bytes - num_bytes;
> -		bytes = csum_bytes - orig_csum_bytes;
> -		bytes = calc_csum_metadata_size(inode, bytes, 0);
> -
> -		/*
> -		 * Now reset ->csum_bytes to what it should be.  If bytes is
> -		 * more than to_free then we would have freed more space had we
> -		 * not had an artificially high ->csum_bytes, so we need to free
> -		 * the remainder.  If bytes is the same or less then we don't
> -		 * need to do anything, the other free-ers did the correct
> -		 * thing.
> -		 */
> -		inode->csum_bytes = orig_csum_bytes - num_bytes;
> -		if (bytes > to_free)
> -			to_free = bytes - to_free;
> -		else
> -			to_free = 0;
> -	}
> +	inode->csum_bytes -= num_bytes;
> +	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
>  	spin_unlock(&inode->lock);
> -	if (dropped)
> -		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
>  
> -	if (to_free) {
> -		btrfs_block_rsv_release(fs_info, block_rsv, to_free);
> -		trace_btrfs_space_reservation(fs_info, "delalloc",
> -					      btrfs_ino(inode), to_free, 0);
> -	}
> -	if (underflow && !did_retry) {
> -		did_retry = true;
> -		underflow = false;
> -		goto retry;
> -	}
> +	btrfs_inode_rsv_release(inode);
>  	if (delalloc_lock)
>  		mutex_unlock(&inode->delalloc_mutex);
>  	return ret;
> @@ -6148,25 +6064,17 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
>  void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> -	u64 to_free = 0;
> -	unsigned dropped;
>  
>  	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
>  	spin_lock(&inode->lock);
> -	dropped = drop_over_reserved_extents(inode);
> -	if (num_bytes)
> -		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
> +	inode->csum_bytes -= num_bytes;
> +	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
>  	spin_unlock(&inode->lock);
> -	if (dropped > 0)
> -		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
>  
>  	if (btrfs_is_testing(fs_info))
>  		return;
>  
> -	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
> -				      to_free, 0);
> -
> -	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
> +	btrfs_inode_rsv_release(inode);
>  }
>  
>  /**
> @@ -6184,25 +6092,17 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
>  	unsigned num_extents;
> -	u64 to_free;
> -	unsigned dropped;
>  
>  	spin_lock(&inode->lock);
>  	num_extents = count_max_extents(num_bytes);
>  	btrfs_mod_outstanding_extents(inode, -num_extents);
> -	dropped = drop_over_reserved_extents(inode);
> +	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
>  	spin_unlock(&inode->lock);
>  
> -	if (!dropped)
> -		return;
> -
>  	if (btrfs_is_testing(fs_info))
>  		return;
>  
> -	to_free = btrfs_calc_trans_metadata_size(fs_info, dropped);
> -	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
> -				      to_free, 0);
> -	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
> +	btrfs_inode_rsv_release(inode);
>  }
>  
>  /**
> @@ -8427,7 +8327,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
>  			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
>  {
>  	block_rsv_add_bytes(block_rsv, blocksize, 0);
> -	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
> +	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
>  }
>  
>  /*
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 912417b..b0d1e94 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -42,6 +42,7 @@
>  #include <linux/blkdev.h>
>  #include <linux/posix_acl_xattr.h>
>  #include <linux/uio.h>
> +#include <linux/magic.h>
>  #include "ctree.h"
>  #include "disk-io.h"
>  #include "transaction.h"
> @@ -303,7 +304,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
>  		btrfs_free_path(path);
>  		return PTR_ERR(trans);
>  	}
> -	trans->block_rsv = &fs_info->delalloc_block_rsv;
> +	trans->block_rsv = &BTRFS_I(inode)->block_rsv;
>  
>  	if (compressed_size && compressed_pages)
>  		extent_item_size = btrfs_file_extent_calc_inline_size(
> @@ -2949,7 +2950,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
>  			trans = NULL;
>  			goto out;
>  		}
> -		trans->block_rsv = &fs_info->delalloc_block_rsv;
> +		trans->block_rsv = &BTRFS_I(inode)->block_rsv;
>  		ret = btrfs_update_inode_fallback(trans, root, inode);
>  		if (ret) /* -ENOMEM or corruption */
>  			btrfs_abort_transaction(trans, ret);
> @@ -2985,7 +2986,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
>  		goto out;
>  	}
>  
> -	trans->block_rsv = &fs_info->delalloc_block_rsv;
> +	trans->block_rsv = &BTRFS_I(inode)->block_rsv;
>  
>  	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
>  		compress_type = ordered_extent->compress_type;
> @@ -8834,7 +8835,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>  	if (iov_iter_rw(iter) == WRITE) {
>  		up_read(&BTRFS_I(inode)->dio_sem);
>  		current->journal_info = NULL;
> -		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
>  		if (ret < 0 && ret != -EIOCBQUEUED) {
>  			if (dio_data.reserve)
>  				btrfs_delalloc_release_space(inode, data_reserved,
> @@ -8855,6 +8855,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
>  		} else if (ret >= 0 && (size_t)ret < count)
>  			btrfs_delalloc_release_space(inode, data_reserved,
>  					offset, count - (size_t)ret);
> +		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
>  	}
>  out:
>  	if (wakeup)
> @@ -9419,6 +9420,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
>  
>  struct inode *btrfs_alloc_inode(struct super_block *sb)
>  {
> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>  	struct btrfs_inode *ei;
>  	struct inode *inode;
>  
> @@ -9445,8 +9447,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
>  
>  	spin_lock_init(&ei->lock);
>  	ei->outstanding_extents = 0;
> -	ei->reserved_extents = 0;
> -
> +	if (sb->s_magic != BTRFS_TEST_MAGIC)
> +		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
> +					      BTRFS_BLOCK_RSV_DELALLOC);

Why the dependence on BTRFS_TEST_MAGIC here?

>  	ei->runtime_flags = 0;
>  	ei->prop_compress = BTRFS_COMPRESS_NONE;
>  	ei->defrag_compress = BTRFS_COMPRESS_NONE;
> @@ -9496,8 +9499,9 @@ void btrfs_destroy_inode(struct inode *inode)
>  
>  	WARN_ON(!hlist_empty(&inode->i_dentry));
>  	WARN_ON(inode->i_data.nrpages);
> +	WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
> +	WARN_ON(BTRFS_I(inode)->block_rsv.size);
>  	WARN_ON(BTRFS_I(inode)->outstanding_extents);
> -	WARN_ON(BTRFS_I(inode)->reserved_extents);
>  	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
>  	WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
>  	WARN_ON(BTRFS_I(inode)->csum_bytes);
> diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
> index 5e48b98..a9e63fb 100644
> --- a/include/trace/events/btrfs.h
> +++ b/include/trace/events/btrfs.h
> @@ -1713,28 +1713,6 @@ TRACE_EVENT(btrfs_inode_mod_outstanding_extents,
>  			(unsigned long long)__entry->ino, __entry->mod)
>  );
>  
> -TRACE_EVENT(btrfs_inode_mod_reserved_extents,
> -	TP_PROTO(struct btrfs_root *root, u64 ino, int mod),
> -
> -	TP_ARGS(root, ino, mod),
> -
> -	TP_STRUCT__entry_btrfs(
> -		__field(	u64, root_objectid	)
> -		__field(	u64, ino		)
> -		__field(	int, mod		)
> -	),
> -
> -	TP_fast_assign_btrfs(root->fs_info,
> -		__entry->root_objectid	= root->objectid;
> -		__entry->ino		= ino;
> -		__entry->mod		= mod;
> -	),
> -
> -	TP_printk_btrfs("root = %llu(%s), ino = %llu, mod = %d",
> -			show_root_type(__entry->root_objectid),
> -			(unsigned long long)__entry->ino, __entry->mod)
> -);
> -
>  #endif /* _TRACE_BTRFS_H */
>  
>  /* This part must be outside protection */
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik Aug. 21, 2017, 1:31 p.m. UTC | #2
On Mon, Aug 21, 2017 at 03:48:51PM +0300, Nikolay Borisov wrote:
> 
> 
> On 18.08.2017 00:23, josef@toxicpanda.com wrote:
> > From: Josef Bacik <jbacik@fb.com>
> > 
> > The way we handle delalloc metadata reservations has gotten
> > progressively more complicated over the years.  There is so much cruft
> > and weirdness around keeping the reserved count and outstanding counters
> > consistent and handling the error cases that it's impossible to
> > understand.
> > 
> > Fix this by making the delalloc block rsv per-inode.  This way we can
> > calculate the actual size of the outstanding metadata reservations every
> > time we make a change, and then reserve the delta based on that amount.
> > This greatly simplifies the code everywhere, and makes the error
> > handling in btrfs_delalloc_reserve_metadata far less terrifying.
> > 
> > Signed-off-by: Josef Bacik <jbacik@fb.com>
> > ---
> >  fs/btrfs/btrfs_inode.h       |  29 ++--
> >  fs/btrfs/ctree.h             |   5 +-
> >  fs/btrfs/delayed-inode.c     |  46 +-----
> >  fs/btrfs/disk-io.c           |  18 ++-
> >  fs/btrfs/extent-tree.c       | 330 +++++++++++++++----------------------------
> >  fs/btrfs/inode.c             |  18 ++-
> >  include/trace/events/btrfs.h |  22 ---
> >  7 files changed, 147 insertions(+), 321 deletions(-)
> > 
> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> > index 2894ad6..71a4ded 100644
> > --- a/fs/btrfs/btrfs_inode.h
> > +++ b/fs/btrfs/btrfs_inode.h
> > @@ -36,14 +36,13 @@
> >  #define BTRFS_INODE_ORPHAN_META_RESERVED	1
> >  #define BTRFS_INODE_DUMMY			2
> >  #define BTRFS_INODE_IN_DEFRAG			3
> > -#define BTRFS_INODE_DELALLOC_META_RESERVED	4
> > -#define BTRFS_INODE_HAS_ORPHAN_ITEM		5
> > -#define BTRFS_INODE_HAS_ASYNC_EXTENT		6
> > -#define BTRFS_INODE_NEEDS_FULL_SYNC		7
> > -#define BTRFS_INODE_COPY_EVERYTHING		8
> > -#define BTRFS_INODE_IN_DELALLOC_LIST		9
> > -#define BTRFS_INODE_READDIO_NEED_LOCK		10
> > -#define BTRFS_INODE_HAS_PROPS		        11
> > +#define BTRFS_INODE_HAS_ORPHAN_ITEM		4
> > +#define BTRFS_INODE_HAS_ASYNC_EXTENT		5
> > +#define BTRFS_INODE_NEEDS_FULL_SYNC		6
> > +#define BTRFS_INODE_COPY_EVERYTHING		7
> > +#define BTRFS_INODE_IN_DELALLOC_LIST		8
> > +#define BTRFS_INODE_READDIO_NEED_LOCK		9
> > +#define BTRFS_INODE_HAS_PROPS		        10
> >  
> >  /* in memory btrfs inode */
> >  struct btrfs_inode {
> > @@ -176,7 +175,8 @@ struct btrfs_inode {
> >  	 * of extent items we've reserved metadata for.
> >  	 */
> >  	unsigned outstanding_extents;
> > -	unsigned reserved_extents;
> > +
> > +	struct btrfs_block_rsv block_rsv;
> >  
> >  	/*
> >  	 * Cached values if inode properties
> > @@ -278,17 +278,6 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
> >  						  mod);
> >  }
> >  
> > -static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode,
> > -					      int mod)
> > -{
> > -	ASSERT(spin_is_locked(&inode->lock));
> > -	inode->reserved_extents += mod;
> > -	if (btrfs_is_free_space_inode(inode))
> > -		return;
> > -	trace_btrfs_inode_mod_reserved_extents(inode->root, btrfs_ino(inode),
> > -					       mod);
> > -}
> > -
> >  static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
> >  {
> >  	int ret = 0;
> > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > index b43114d..a4ee115 100644
> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -763,8 +763,6 @@ struct btrfs_fs_info {
> >  	 * delayed dir index item
> >  	 */
> >  	struct btrfs_block_rsv global_block_rsv;
> > -	/* block reservation for delay allocation */
> > -	struct btrfs_block_rsv delalloc_block_rsv;
> >  	/* block reservation for metadata operations */
> >  	struct btrfs_block_rsv trans_block_rsv;
> >  	/* block reservation for chunk tree */
> > @@ -2744,6 +2742,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
> >  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
> >  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
> >  					      unsigned short type);
> > +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
> > +				   struct btrfs_block_rsv *rsv,
> > +				   unsigned short type);
> >  void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
> >  			  struct btrfs_block_rsv *rsv);
> >  void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
> > diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
> > index 19e4ad2..5d73f79 100644
> > --- a/fs/btrfs/delayed-inode.c
> > +++ b/fs/btrfs/delayed-inode.c
> > @@ -581,7 +581,6 @@ static int btrfs_delayed_inode_reserve_metadata(
> >  	struct btrfs_block_rsv *dst_rsv;
> >  	u64 num_bytes;
> >  	int ret;
> > -	bool release = false;
> >  
> >  	src_rsv = trans->block_rsv;
> >  	dst_rsv = &fs_info->delayed_block_rsv;
> > @@ -589,36 +588,13 @@ static int btrfs_delayed_inode_reserve_metadata(
> >  	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
> >  
> >  	/*
> > -	 * If our block_rsv is the delalloc block reserve then check and see if
> > -	 * we have our extra reservation for updating the inode.  If not fall
> > -	 * through and try to reserve space quickly.
> > -	 *
> > -	 * We used to try and steal from the delalloc block rsv or the global
> > -	 * reserve, but we'd steal a full reservation, which isn't kind.  We are
> > -	 * here through delalloc which means we've likely just cowed down close
> > -	 * to the leaf that contains the inode, so we would steal less just
> > -	 * doing the fallback inode update, so if we do end up having to steal
> > -	 * from the global block rsv we hopefully only steal one or two blocks
> > -	 * worth which is less likely to hurt us.
> > -	 */
> > -	if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
> > -		spin_lock(&inode->lock);
> > -		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> > -				       &inode->runtime_flags))
> > -			release = true;
> > -		else
> > -			src_rsv = NULL;
> > -		spin_unlock(&inode->lock);
> > -	}
> > -
> > -	/*
> >  	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
> >  	 * which doesn't reserve space for speed.  This is a problem since we
> >  	 * still need to reserve space for this update, so try to reserve the
> >  	 * space.
> >  	 *
> >  	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
> > -	 * we're accounted for.
> > +	 * we always reserve enough to update the inode item.
> >  	 */
> >  	if (!src_rsv || (!trans->bytes_reserved &&
> >  			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
> > @@ -643,32 +619,12 @@ static int btrfs_delayed_inode_reserve_metadata(
> >  	}
> >  
> >  	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
> > -
> > -	/*
> > -	 * Migrate only takes a reservation, it doesn't touch the size of the
> > -	 * block_rsv.  This is to simplify people who don't normally have things
> > -	 * migrated from their block rsv.  If they go to release their
> > -	 * reservation, that will decrease the size as well, so if migrate
> > -	 * reduced size we'd end up with a negative size.  But for the
> > -	 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
> > -	 * but we could in fact do this reserve/migrate dance several times
> > -	 * between the time we did the original reservation and we'd clean it
> > -	 * up.  So to take care of this, release the space for the meta
> > -	 * reservation here.  I think it may be time for a documentation page on
> > -	 * how block rsvs. work.
> > -	 */
> >  	if (!ret) {
> >  		trace_btrfs_space_reservation(fs_info, "delayed_inode",
> >  					      btrfs_ino(inode), num_bytes, 1);
> >  		node->bytes_reserved = num_bytes;
> >  	}
> >  
> > -	if (release) {
> > -		trace_btrfs_space_reservation(fs_info, "delalloc",
> > -					      btrfs_ino(inode), num_bytes, 0);
> > -		btrfs_block_rsv_release(fs_info, src_rsv, num_bytes);
> > -	}
> > -
> >  	return ret;
> >  }
> >  
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 9283e28..5681f8f 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -2607,14 +2607,6 @@ int open_ctree(struct super_block *sb,
> >  		goto fail_delalloc_bytes;
> >  	}
> >  
> > -	fs_info->btree_inode = new_inode(sb);
> > -	if (!fs_info->btree_inode) {
> > -		err = -ENOMEM;
> > -		goto fail_bio_counter;
> > -	}
> > -
> > -	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
> > -
> >  	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
> >  	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
> >  	INIT_LIST_HEAD(&fs_info->trans_list);
> > @@ -2647,8 +2639,6 @@ int open_ctree(struct super_block *sb,
> >  	btrfs_mapping_init(&fs_info->mapping_tree);
> >  	btrfs_init_block_rsv(&fs_info->global_block_rsv,
> >  			     BTRFS_BLOCK_RSV_GLOBAL);
> > -	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
> > -			     BTRFS_BLOCK_RSV_DELALLOC);
> >  	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
> >  	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
> >  	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
> > @@ -2679,6 +2669,14 @@ int open_ctree(struct super_block *sb,
> >  
> >  	INIT_LIST_HEAD(&fs_info->ordered_roots);
> >  	spin_lock_init(&fs_info->ordered_root_lock);
> > +
> > +	fs_info->btree_inode = new_inode(sb);
> > +	if (!fs_info->btree_inode) {
> > +		err = -ENOMEM;
> > +		goto fail_bio_counter;
> > +	}
> > +	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
> > +
> >  	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
> >  					GFP_KERNEL);
> >  	if (!fs_info->delayed_root) {
> 
> This hunk seems to be a leftover from some other work, so you can drop it
> for v2. Of course the removal of the call to btrfs_init_block_rsv
> should stay.
> 

No, this is actually needed.  We do a __find_space_info from
btrfs_init_metadata_block_rsv() for the btree_inode, and if the fs_info isn't
properly zeroed out it pukes on whatever fs_info->space_info is pointing at.

> > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> > index a313487..8484e0f 100644
> > --- a/fs/btrfs/extent-tree.c
> > +++ b/fs/btrfs/extent-tree.c
> > @@ -4761,7 +4761,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
> >  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
> >  			    u64 orig, bool wait_ordered)
> >  {
> > -	struct btrfs_block_rsv *block_rsv;
> >  	struct btrfs_space_info *space_info;
> >  	struct btrfs_trans_handle *trans;
> >  	u64 delalloc_bytes;
> > @@ -4777,8 +4776,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
> >  	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
> >  
> >  	trans = (struct btrfs_trans_handle *)current->journal_info;
> > -	block_rsv = &fs_info->delalloc_block_rsv;
> > -	space_info = block_rsv->space_info;
> > +	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
> >  
> >  	delalloc_bytes = percpu_counter_sum_positive(
> >  						&fs_info->delalloc_bytes);
> > @@ -5502,7 +5500,8 @@ static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
> >  
> >  static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
> >  				    struct btrfs_block_rsv *block_rsv,
> > -				    struct btrfs_block_rsv *dest, u64 num_bytes)
> > +				    struct btrfs_block_rsv *dest, u64 num_bytes,
> > +				    u64 *released_bytes)
> >  {
> 
> Instead of adding yet another parameter, why not just return the value?
> That way callers can handle/check it of their own accord. It seems this
> is only ever used for debugging (tracepoint) purposes anyway
> 

Yeah that sounds reasonable, I'll do that.

> >  	struct btrfs_space_info *space_info = block_rsv->space_info;
> >  
> > @@ -5520,6 +5519,8 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
> >  	spin_unlock(&block_rsv->lock);
> >  
> >  	if (num_bytes > 0) {
> > +		if (released_bytes)
> > +			*released_bytes = num_bytes;
> >  		if (dest) {
> >  			spin_lock(&dest->lock);
> >  			if (!dest->full) {
> > @@ -5561,6 +5562,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
> >  	rsv->type = type;
> >  }
> >  
> > +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
> > +				   struct btrfs_block_rsv *rsv,
> > +				   unsigned short type)
> > +{
> > +	btrfs_init_block_rsv(rsv, type);
> > +	rsv->space_info = __find_space_info(fs_info,
> > +					    BTRFS_BLOCK_GROUP_METADATA);
> > +}
> > +
> >  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
> >  					      unsigned short type)
> >  {
> > @@ -5570,9 +5580,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
> >  	if (!block_rsv)
> >  		return NULL;
> >  
> > -	btrfs_init_block_rsv(block_rsv, type);
> > -	block_rsv->space_info = __find_space_info(fs_info,
> > -						  BTRFS_BLOCK_GROUP_METADATA);
> > +	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
> >  	return block_rsv;
> >  }
> >  
> > @@ -5655,6 +5663,68 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
> >  	return ret;
> >  }
> >  
> > +/**
> > + * btrfs_inode_rsv_refill - refill the inode block rsv.
> > + * @inode - the inode we are refilling.
> > + * @flush - the flusing restriction.
> > + *
> > + * Essentially the same as btrfs_block_rsv_refill, except it uses the
> > + * block_rsv->size as the minimum size.  We'll either refill the missing amount
> > + * or return if we already have enough space.  This will also handle the resreve
> > + * tracepoint for the reserved amount.
> > + */
> > +int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
> > +			   enum btrfs_reserve_flush_enum flush)
> > +{
> > +	struct btrfs_root *root = inode->root;
> > +	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> > +	u64 num_bytes = 0;
> > +	int ret = -ENOSPC;
> > +
> > +	spin_lock(&block_rsv->lock);
> > +	if (block_rsv->reserved < block_rsv->size)
> > +		num_bytes = block_rsv->size - block_rsv->reserved;
> > +	spin_unlock(&block_rsv->lock);
> > +
> > +	if (num_bytes == 0)
> > +		return 0;
> > +
> > +	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
> > +	if (!ret) {
> > +		block_rsv_add_bytes(block_rsv, num_bytes, 0);
> > +		trace_btrfs_space_reservation(root->fs_info, "delalloc",
> > +					      btrfs_ino(inode), num_bytes, 1);
> > +	}
> > +	return ret;
> > +}
> > +
> > +/**
> > + * btrfs_inode_rsv_release - release any excessive reservation.
> > + * @inode - the inode we need to release from.
> > + *
> > + * This is the same as btrfs_block_rsv_release, except that it handles the
> > + * tracepoint for the reservation.
> > + */
> > +void btrfs_inode_rsv_release(struct btrfs_inode *inode)
> > +{
> > +	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> > +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> > +	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> > +	u64 release_bytes = 0;
> > +
> > +	/*
> > +	 * Since we statically set the block_rsv->size we just want to say we
> > +	 * are releasing 0 bytes, and then we'll just get the reservation over
> > +	 * the size free'd.
> > +	 */
> > +	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
> > +				&release_bytes);
> > +	if (release_bytes > 0)
> > +		trace_btrfs_space_reservation(fs_info, "delalloc",
> > +					      btrfs_ino(inode), release_bytes,
> > +					      0);
> > +}
> > +
> >  void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
> >  			     struct btrfs_block_rsv *block_rsv,
> >  			     u64 num_bytes)
> > @@ -5664,7 +5734,8 @@ void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
> >  	if (global_rsv == block_rsv ||
> >  	    block_rsv->space_info != global_rsv->space_info)
> >  		global_rsv = NULL;
> > -	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
> > +	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes,
> > +				NULL);
> >  }
> >  
> >  static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
> > @@ -5726,7 +5797,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
> >  
> >  	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
> >  	fs_info->global_block_rsv.space_info = space_info;
> > -	fs_info->delalloc_block_rsv.space_info = space_info;
> >  	fs_info->trans_block_rsv.space_info = space_info;
> >  	fs_info->empty_block_rsv.space_info = space_info;
> >  	fs_info->delayed_block_rsv.space_info = space_info;
> > @@ -5745,9 +5815,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
> >  static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
> >  {
> >  	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
> > -				(u64)-1);
> > -	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
> > -	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
> > +				(u64)-1, NULL);
> >  	WARN_ON(fs_info->trans_block_rsv.size > 0);
> >  	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
> >  	WARN_ON(fs_info->chunk_block_rsv.size > 0);
> > @@ -5786,7 +5854,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
> >  	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
> >  
> >  	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
> > -				trans->chunk_bytes_reserved);
> > +				trans->chunk_bytes_reserved, NULL);
> >  	trans->chunk_bytes_reserved = 0;
> >  }
> >  
> > @@ -5886,95 +5954,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
> >  	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
> >  }
> >  
> > -/**
> > - * drop_over_reserved_extents - drop our extra extent reservations
> > - * @inode: the inode we're dropping the extent for
> > - *
> > - * We reserve extents we may use, but they may have been merged with other
> > - * extents and we may not need the extra reservation.
> > - *
> > - * We also call this when we've completed io to an extent or had an error and
> > - * cleared the outstanding extent, in either case we no longer need our
> > - * reservation and can drop the excess.
> > - */
> > -static unsigned drop_over_reserved_extents(struct btrfs_inode *inode)
> > -{
> > -	unsigned num_extents = 0;
> > -
> > -	if (inode->reserved_extents > inode->outstanding_extents) {
> > -		num_extents = inode->reserved_extents -
> > -			inode->outstanding_extents;
> > -		btrfs_mod_reserved_extents(inode, -num_extents);
> > -	}
> > -
> > -	if (inode->outstanding_extents == 0 &&
> > -	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> > -			       &inode->runtime_flags))
> > -		num_extents++;
> > -	return num_extents;
> > -}
> > -
> > -/**
> > - * calc_csum_metadata_size - return the amount of metadata space that must be
> > - *	reserved/freed for the given bytes.
> > - * @inode: the inode we're manipulating
> > - * @num_bytes: the number of bytes in question
> > - * @reserve: 1 if we are reserving space, 0 if we are freeing space
> > - *
> > - * This adjusts the number of csum_bytes in the inode and then returns the
> > - * correct amount of metadata that must either be reserved or freed.  We
> > - * calculate how many checksums we can fit into one leaf and then divide the
> > - * number of bytes that will need to be checksumed by this value to figure out
> > - * how many checksums will be required.  If we are adding bytes then the number
> > - * may go up and we will return the number of additional bytes that must be
> > - * reserved.  If it is going down we will return the number of bytes that must
> > - * be freed.
> > - *
> > - * This must be called with BTRFS_I(inode)->lock held.
> > - */
> > -static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
> > -				   int reserve)
> > +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
> > +						 struct btrfs_inode *inode)
> >  {
> > -	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> > -	u64 old_csums, num_csums;
> > +	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> > +	u64 reserve_size = 0;
> > +	u64 csum_leaves;
> > +	unsigned outstanding_extents;
> >  
> > -	if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
> > -		return 0;
> > +	ASSERT(spin_is_locked(&inode->lock));
> 
> How about using lockdep_is_held so that this is caught even if btrfs is
> compiled without assertions enabled ?
> 

Yup can do.

> > +	outstanding_extents = inode->outstanding_extents;
> > +	if (outstanding_extents)
> > +		reserve_size = btrfs_calc_trans_metadata_size(fs_info,
> > +						outstanding_extents + 1);
> > +	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
> > +						 inode->csum_bytes);
> > +	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
> > +						       csum_leaves);
> >  
> > -	old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
> > -	if (reserve)
> > -		inode->csum_bytes += num_bytes;
> > -	else
> > -		inode->csum_bytes -= num_bytes;
> > -	num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
> > -
> > -	/* No change, no need to reserve more */
> > -	if (old_csums == num_csums)
> > -		return 0;
> > -
> > -	if (reserve)
> > -		return btrfs_calc_trans_metadata_size(fs_info,
> > -						      num_csums - old_csums);
> > -
> > -	return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
> > +	spin_lock(&block_rsv->lock);
> > +	block_rsv->size = reserve_size;
> > +	spin_unlock(&block_rsv->lock);
> >  }
> >  
> >  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
> >  {
> >  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> >  	struct btrfs_root *root = inode->root;
> > -	struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
> > -	u64 to_reserve = 0;
> > -	u64 csum_bytes;
> > -	unsigned nr_extents, reserve_extents;
> > +	unsigned nr_extents;
> >  	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
> >  	int ret = 0;
> >  	bool delalloc_lock = true;
> > -	u64 to_free = 0;
> > -	unsigned dropped;
> > -	bool release_extra = false;
> > -	bool underflow = false;
> > -	bool did_retry = false;
> >  
> >  	WARN_ON(btrfs_is_free_space_inode(inode));
> >  
> > @@ -6001,33 +6011,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
> >  		mutex_lock(&inode->delalloc_mutex);
> >  
> >  	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
> > -retry:
> > +
> > +	/* Add our new extents and calculate the new rsv size. */
> >  	spin_lock(&inode->lock);
> > -	reserve_extents = nr_extents = count_max_extents(num_bytes);
> > +	nr_extents = count_max_extents(num_bytes);
> >  	btrfs_mod_outstanding_extents(inode, nr_extents);
> > -
> > -	/*
> > -	 * Because we add an outstanding extent for ordered before we clear
> > -	 * delalloc we will double count our outstanding extents slightly.  This
> > -	 * could mean that we transiently over-reserve, which could result in an
> > -	 * early ENOSPC if our timing is unlucky.  Keep track of the case that
> > -	 * we had a reservation underflow so we can retry if we fail.
> > -	 *
> > -	 * Keep in mind we can legitimately have more outstanding extents than
> > -	 * reserved because of fragmentation, so only allow a retry once.
> > -	 */
> > -	if (inode->outstanding_extents >
> > -	    inode->reserved_extents + nr_extents) {
> > -		reserve_extents = inode->outstanding_extents -
> > -			inode->reserved_extents;
> > -		underflow = true;
> > -	}
> > -
> > -	/* We always want to reserve a slot for updating the inode. */
> > -	to_reserve = btrfs_calc_trans_metadata_size(fs_info,
> > -						    reserve_extents + 1);
> > -	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
> > -	csum_bytes = inode->csum_bytes;
> > +	inode->csum_bytes += num_bytes;
> > +	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
> >  	spin_unlock(&inode->lock);
> >  
> >  	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
> > @@ -6037,100 +6027,26 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
> >  			goto out_fail;
> >  	}
> >  
> > -	ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
> > +	ret = btrfs_inode_rsv_refill(inode, flush);
> >  	if (unlikely(ret)) {
> >  		btrfs_qgroup_free_meta(root,
> >  				       nr_extents * fs_info->nodesize);
> >  		goto out_fail;
> >  	}
> >  
> > -	spin_lock(&inode->lock);
> > -	if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> > -			     &inode->runtime_flags)) {
> > -		to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
> > -		release_extra = true;
> > -	}
> > -	btrfs_mod_reserved_extents(inode, reserve_extents);
> > -	spin_unlock(&inode->lock);
> > -
> >  	if (delalloc_lock)
> >  		mutex_unlock(&inode->delalloc_mutex);
> > -
> > -	if (to_reserve)
> > -		trace_btrfs_space_reservation(fs_info, "delalloc",
> > -					      btrfs_ino(inode), to_reserve, 1);
> > -	if (release_extra)
> > -		btrfs_block_rsv_release(fs_info, block_rsv,
> > -				btrfs_calc_trans_metadata_size(fs_info, 1));
> >  	return 0;
> >  
> >  out_fail:
> >  	spin_lock(&inode->lock);
> >  	nr_extents = count_max_extents(num_bytes);
> >  	btrfs_mod_outstanding_extents(inode, -nr_extents);
> > -
> > -	dropped = drop_over_reserved_extents(inode);
> > -	/*
> > -	 * If the inodes csum_bytes is the same as the original
> > -	 * csum_bytes then we know we haven't raced with any free()ers
> > -	 * so we can just reduce our inodes csum bytes and carry on.
> > -	 */
> > -	if (inode->csum_bytes == csum_bytes) {
> > -		calc_csum_metadata_size(inode, num_bytes, 0);
> > -	} else {
> > -		u64 orig_csum_bytes = inode->csum_bytes;
> > -		u64 bytes;
> > -
> > -		/*
> > -		 * This is tricky, but first we need to figure out how much we
> > -		 * freed from any free-ers that occurred during this
> > -		 * reservation, so we reset ->csum_bytes to the csum_bytes
> > -		 * before we dropped our lock, and then call the free for the
> > -		 * number of bytes that were freed while we were trying our
> > -		 * reservation.
> > -		 */
> > -		bytes = csum_bytes - inode->csum_bytes;
> > -		inode->csum_bytes = csum_bytes;
> > -		to_free = calc_csum_metadata_size(inode, bytes, 0);
> > -
> > -
> > -		/*
> > -		 * Now we need to see how much we would have freed had we not
> > -		 * been making this reservation and our ->csum_bytes were not
> > -		 * artificially inflated.
> > -		 */
> > -		inode->csum_bytes = csum_bytes - num_bytes;
> > -		bytes = csum_bytes - orig_csum_bytes;
> > -		bytes = calc_csum_metadata_size(inode, bytes, 0);
> > -
> > -		/*
> > -		 * Now reset ->csum_bytes to what it should be.  If bytes is
> > -		 * more than to_free then we would have freed more space had we
> > -		 * not had an artificially high ->csum_bytes, so we need to free
> > -		 * the remainder.  If bytes is the same or less then we don't
> > -		 * need to do anything, the other free-ers did the correct
> > -		 * thing.
> > -		 */
> > -		inode->csum_bytes = orig_csum_bytes - num_bytes;
> > -		if (bytes > to_free)
> > -			to_free = bytes - to_free;
> > -		else
> > -			to_free = 0;
> > -	}
> > +	inode->csum_bytes -= num_bytes;
> > +	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
> >  	spin_unlock(&inode->lock);
> > -	if (dropped)
> > -		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
> >  
> > -	if (to_free) {
> > -		btrfs_block_rsv_release(fs_info, block_rsv, to_free);
> > -		trace_btrfs_space_reservation(fs_info, "delalloc",
> > -					      btrfs_ino(inode), to_free, 0);
> > -	}
> > -	if (underflow && !did_retry) {
> > -		did_retry = true;
> > -		underflow = false;
> > -		goto retry;
> > -	}
> > +	btrfs_inode_rsv_release(inode);
> >  	if (delalloc_lock)
> >  		mutex_unlock(&inode->delalloc_mutex);
> >  	return ret;
> > @@ -6148,25 +6064,17 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
> >  void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
> >  {
> >  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> > -	u64 to_free = 0;
> > -	unsigned dropped;
> >  
> >  	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
> >  	spin_lock(&inode->lock);
> > -	dropped = drop_over_reserved_extents(inode);
> > -	if (num_bytes)
> > -		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
> > +	inode->csum_bytes -= num_bytes;
> > +	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
> >  	spin_unlock(&inode->lock);
> > -	if (dropped > 0)
> > -		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
> >  
> >  	if (btrfs_is_testing(fs_info))
> >  		return;
> >  
> > -	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
> > -				      to_free, 0);
> > -
> > -	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
> > +	btrfs_inode_rsv_release(inode);
> >  }
> >  
> >  /**
> > @@ -6184,25 +6092,17 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
> >  {
> >  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> >  	unsigned num_extents;
> > -	u64 to_free;
> > -	unsigned dropped;
> >  
> >  	spin_lock(&inode->lock);
> >  	num_extents = count_max_extents(num_bytes);
> >  	btrfs_mod_outstanding_extents(inode, -num_extents);
> > -	dropped = drop_over_reserved_extents(inode);
> > +	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
> >  	spin_unlock(&inode->lock);
> >  
> > -	if (!dropped)
> > -		return;
> > -
> >  	if (btrfs_is_testing(fs_info))
> >  		return;
> >  
> > -	to_free = btrfs_calc_trans_metadata_size(fs_info, dropped);
> > -	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
> > -				      to_free, 0);
> > -	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
> > +	btrfs_inode_rsv_release(inode);
> >  }
> >  
> >  /**
> > @@ -8427,7 +8327,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
> >  			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
> >  {
> >  	block_rsv_add_bytes(block_rsv, blocksize, 0);
> > -	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
> > +	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
> >  }
> >  
> >  /*
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 912417b..b0d1e94 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -42,6 +42,7 @@
> >  #include <linux/blkdev.h>
> >  #include <linux/posix_acl_xattr.h>
> >  #include <linux/uio.h>
> > +#include <linux/magic.h>
> >  #include "ctree.h"
> >  #include "disk-io.h"
> >  #include "transaction.h"
> > @@ -303,7 +304,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
> >  		btrfs_free_path(path);
> >  		return PTR_ERR(trans);
> >  	}
> > -	trans->block_rsv = &fs_info->delalloc_block_rsv;
> > +	trans->block_rsv = &BTRFS_I(inode)->block_rsv;
> >  
> >  	if (compressed_size && compressed_pages)
> >  		extent_item_size = btrfs_file_extent_calc_inline_size(
> > @@ -2949,7 +2950,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
> >  			trans = NULL;
> >  			goto out;
> >  		}
> > -		trans->block_rsv = &fs_info->delalloc_block_rsv;
> > +		trans->block_rsv = &BTRFS_I(inode)->block_rsv;
> >  		ret = btrfs_update_inode_fallback(trans, root, inode);
> >  		if (ret) /* -ENOMEM or corruption */
> >  			btrfs_abort_transaction(trans, ret);
> > @@ -2985,7 +2986,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
> >  		goto out;
> >  	}
> >  
> > -	trans->block_rsv = &fs_info->delalloc_block_rsv;
> > +	trans->block_rsv = &BTRFS_I(inode)->block_rsv;
> >  
> >  	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
> >  		compress_type = ordered_extent->compress_type;
> > @@ -8834,7 +8835,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> >  	if (iov_iter_rw(iter) == WRITE) {
> >  		up_read(&BTRFS_I(inode)->dio_sem);
> >  		current->journal_info = NULL;
> > -		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
> >  		if (ret < 0 && ret != -EIOCBQUEUED) {
> >  			if (dio_data.reserve)
> >  				btrfs_delalloc_release_space(inode, data_reserved,
> > @@ -8855,6 +8855,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> >  		} else if (ret >= 0 && (size_t)ret < count)
> >  			btrfs_delalloc_release_space(inode, data_reserved,
> >  					offset, count - (size_t)ret);
> > +		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
> >  	}
> >  out:
> >  	if (wakeup)
> > @@ -9419,6 +9420,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
> >  
> >  struct inode *btrfs_alloc_inode(struct super_block *sb)
> >  {
> > +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> >  	struct btrfs_inode *ei;
> >  	struct inode *inode;
> >  
> > @@ -9445,8 +9447,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
> >  
> >  	spin_lock_init(&ei->lock);
> >  	ei->outstanding_extents = 0;
> > -	ei->reserved_extents = 0;
> > -
> > +	if (sb->s_magic != BTRFS_TEST_MAGIC)
> > +		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
> > +					      BTRFS_BLOCK_RSV_DELALLOC);
> 
> Why the dependence on BTRFS_TEST_MAGIC here?
> 

Because we allocate test inodes without a fs_info sometimes, this was the most
expedient way to deal with that.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2894ad6..71a4ded 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,14 +36,13 @@ 
 #define BTRFS_INODE_ORPHAN_META_RESERVED	1
 #define BTRFS_INODE_DUMMY			2
 #define BTRFS_INODE_IN_DEFRAG			3
-#define BTRFS_INODE_DELALLOC_META_RESERVED	4
-#define BTRFS_INODE_HAS_ORPHAN_ITEM		5
-#define BTRFS_INODE_HAS_ASYNC_EXTENT		6
-#define BTRFS_INODE_NEEDS_FULL_SYNC		7
-#define BTRFS_INODE_COPY_EVERYTHING		8
-#define BTRFS_INODE_IN_DELALLOC_LIST		9
-#define BTRFS_INODE_READDIO_NEED_LOCK		10
-#define BTRFS_INODE_HAS_PROPS		        11
+#define BTRFS_INODE_HAS_ORPHAN_ITEM		4
+#define BTRFS_INODE_HAS_ASYNC_EXTENT		5
+#define BTRFS_INODE_NEEDS_FULL_SYNC		6
+#define BTRFS_INODE_COPY_EVERYTHING		7
+#define BTRFS_INODE_IN_DELALLOC_LIST		8
+#define BTRFS_INODE_READDIO_NEED_LOCK		9
+#define BTRFS_INODE_HAS_PROPS		        10
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -176,7 +175,8 @@  struct btrfs_inode {
 	 * of extent items we've reserved metadata for.
 	 */
 	unsigned outstanding_extents;
-	unsigned reserved_extents;
+
+	struct btrfs_block_rsv block_rsv;
 
 	/*
 	 * Cached values if inode properties
@@ -278,17 +278,6 @@  static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
 						  mod);
 }
 
-static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode,
-					      int mod)
-{
-	ASSERT(spin_is_locked(&inode->lock));
-	inode->reserved_extents += mod;
-	if (btrfs_is_free_space_inode(inode))
-		return;
-	trace_btrfs_inode_mod_reserved_extents(inode->root, btrfs_ino(inode),
-					       mod);
-}
-
 static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
 {
 	int ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b43114d..a4ee115 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -763,8 +763,6 @@  struct btrfs_fs_info {
 	 * delayed dir index item
 	 */
 	struct btrfs_block_rsv global_block_rsv;
-	/* block reservation for delay allocation */
-	struct btrfs_block_rsv delalloc_block_rsv;
 	/* block reservation for metadata operations */
 	struct btrfs_block_rsv trans_block_rsv;
 	/* block reservation for chunk tree */
@@ -2744,6 +2742,9 @@  int btrfs_delalloc_reserve_space(struct inode *inode,
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
 					      unsigned short type);
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_rsv *rsv,
+				   unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 			  struct btrfs_block_rsv *rsv);
 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 19e4ad2..5d73f79 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -581,7 +581,6 @@  static int btrfs_delayed_inode_reserve_metadata(
 	struct btrfs_block_rsv *dst_rsv;
 	u64 num_bytes;
 	int ret;
-	bool release = false;
 
 	src_rsv = trans->block_rsv;
 	dst_rsv = &fs_info->delayed_block_rsv;
@@ -589,36 +588,13 @@  static int btrfs_delayed_inode_reserve_metadata(
 	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 
 	/*
-	 * If our block_rsv is the delalloc block reserve then check and see if
-	 * we have our extra reservation for updating the inode.  If not fall
-	 * through and try to reserve space quickly.
-	 *
-	 * We used to try and steal from the delalloc block rsv or the global
-	 * reserve, but we'd steal a full reservation, which isn't kind.  We are
-	 * here through delalloc which means we've likely just cowed down close
-	 * to the leaf that contains the inode, so we would steal less just
-	 * doing the fallback inode update, so if we do end up having to steal
-	 * from the global block rsv we hopefully only steal one or two blocks
-	 * worth which is less likely to hurt us.
-	 */
-	if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
-		spin_lock(&inode->lock);
-		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
-				       &inode->runtime_flags))
-			release = true;
-		else
-			src_rsv = NULL;
-		spin_unlock(&inode->lock);
-	}
-
-	/*
 	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
 	 * which doesn't reserve space for speed.  This is a problem since we
 	 * still need to reserve space for this update, so try to reserve the
 	 * space.
 	 *
 	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
-	 * we're accounted for.
+	 * we always reserve enough to update the inode item.
 	 */
 	if (!src_rsv || (!trans->bytes_reserved &&
 			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
@@ -643,32 +619,12 @@  static int btrfs_delayed_inode_reserve_metadata(
 	}
 
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-
-	/*
-	 * Migrate only takes a reservation, it doesn't touch the size of the
-	 * block_rsv.  This is to simplify people who don't normally have things
-	 * migrated from their block rsv.  If they go to release their
-	 * reservation, that will decrease the size as well, so if migrate
-	 * reduced size we'd end up with a negative size.  But for the
-	 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
-	 * but we could in fact do this reserve/migrate dance several times
-	 * between the time we did the original reservation and we'd clean it
-	 * up.  So to take care of this, release the space for the meta
-	 * reservation here.  I think it may be time for a documentation page on
-	 * how block rsvs. work.
-	 */
 	if (!ret) {
 		trace_btrfs_space_reservation(fs_info, "delayed_inode",
 					      btrfs_ino(inode), num_bytes, 1);
 		node->bytes_reserved = num_bytes;
 	}
 
-	if (release) {
-		trace_btrfs_space_reservation(fs_info, "delalloc",
-					      btrfs_ino(inode), num_bytes, 0);
-		btrfs_block_rsv_release(fs_info, src_rsv, num_bytes);
-	}
-
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9283e28..5681f8f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2607,14 +2607,6 @@  int open_ctree(struct super_block *sb,
 		goto fail_delalloc_bytes;
 	}
 
-	fs_info->btree_inode = new_inode(sb);
-	if (!fs_info->btree_inode) {
-		err = -ENOMEM;
-		goto fail_bio_counter;
-	}
-
-	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
 	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
 	INIT_LIST_HEAD(&fs_info->trans_list);
@@ -2647,8 +2639,6 @@  int open_ctree(struct super_block *sb,
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv,
 			     BTRFS_BLOCK_RSV_GLOBAL);
-	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
-			     BTRFS_BLOCK_RSV_DELALLOC);
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
@@ -2679,6 +2669,14 @@  int open_ctree(struct super_block *sb,
 
 	INIT_LIST_HEAD(&fs_info->ordered_roots);
 	spin_lock_init(&fs_info->ordered_root_lock);
+
+	fs_info->btree_inode = new_inode(sb);
+	if (!fs_info->btree_inode) {
+		err = -ENOMEM;
+		goto fail_bio_counter;
+	}
+	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+
 	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
 					GFP_KERNEL);
 	if (!fs_info->delayed_root) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a313487..8484e0f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4761,7 +4761,6 @@  static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
 static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 			    u64 orig, bool wait_ordered)
 {
-	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_space_info *space_info;
 	struct btrfs_trans_handle *trans;
 	u64 delalloc_bytes;
@@ -4777,8 +4776,7 @@  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
-	block_rsv = &fs_info->delalloc_block_rsv;
-	space_info = block_rsv->space_info;
+	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 
 	delalloc_bytes = percpu_counter_sum_positive(
 						&fs_info->delalloc_bytes);
@@ -5502,7 +5500,8 @@  static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
 
 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 				    struct btrfs_block_rsv *block_rsv,
-				    struct btrfs_block_rsv *dest, u64 num_bytes)
+				    struct btrfs_block_rsv *dest, u64 num_bytes,
+				    u64 *released_bytes)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
 
@@ -5520,6 +5519,8 @@  static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 	spin_unlock(&block_rsv->lock);
 
 	if (num_bytes > 0) {
+		if (released_bytes)
+			*released_bytes = num_bytes;
 		if (dest) {
 			spin_lock(&dest->lock);
 			if (!dest->full) {
@@ -5561,6 +5562,15 @@  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 	rsv->type = type;
 }
 
+void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_rsv *rsv,
+				   unsigned short type)
+{
+	btrfs_init_block_rsv(rsv, type);
+	rsv->space_info = __find_space_info(fs_info,
+					    BTRFS_BLOCK_GROUP_METADATA);
+}
+
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
 					      unsigned short type)
 {
@@ -5570,9 +5580,7 @@  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
 	if (!block_rsv)
 		return NULL;
 
-	btrfs_init_block_rsv(block_rsv, type);
-	block_rsv->space_info = __find_space_info(fs_info,
-						  BTRFS_BLOCK_GROUP_METADATA);
+	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
 	return block_rsv;
 }
 
@@ -5655,6 +5663,68 @@  int btrfs_block_rsv_refill(struct btrfs_root *root,
 	return ret;
 }
 
+/**
+ * btrfs_inode_rsv_refill - refill the inode block rsv.
+ * @inode - the inode we are refilling.
+ * @flush - the flushing restriction.
+ *
+ * Essentially the same as btrfs_block_rsv_refill, except it uses the
+ * block_rsv->size as the minimum size.  We'll either refill the missing amount
+ * or return if we already have enough space.  This will also handle the reserve
+ * tracepoint for the reserved amount.
+ */
+int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
+			   enum btrfs_reserve_flush_enum flush)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	spin_lock(&block_rsv->lock);
+	if (block_rsv->reserved < block_rsv->size)
+		num_bytes = block_rsv->size - block_rsv->reserved;
+	spin_unlock(&block_rsv->lock);
+
+	if (num_bytes == 0)
+		return 0;
+
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 0);
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
+					      btrfs_ino(inode), num_bytes, 1);
+	}
+	return ret;
+}
+
+/**
+ * btrfs_inode_rsv_release - release any excessive reservation.
+ * @inode - the inode we need to release from.
+ *
+ * This is the same as btrfs_block_rsv_release, except that it handles the
+ * tracepoint for the reservation.
+ */
+void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+	u64 release_bytes = 0;
+
+	/*
+	 * Since we statically set the block_rsv->size we just want to say we
+	 * are releasing 0 bytes, and then we'll just get the reservation over
+	 * the size free'd.
+	 */
+	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
+				&release_bytes);
+	if (release_bytes > 0)
+		trace_btrfs_space_reservation(fs_info, "delalloc",
+					      btrfs_ino(inode), release_bytes,
+					      0);
+}
+
 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes)
@@ -5664,7 +5734,8 @@  void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 	if (global_rsv == block_rsv ||
 	    block_rsv->space_info != global_rsv->space_info)
 		global_rsv = NULL;
-	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
+	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes,
+				NULL);
 }
 
 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5726,7 +5797,6 @@  static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 	fs_info->global_block_rsv.space_info = space_info;
-	fs_info->delalloc_block_rsv.space_info = space_info;
 	fs_info->trans_block_rsv.space_info = space_info;
 	fs_info->empty_block_rsv.space_info = space_info;
 	fs_info->delayed_block_rsv.space_info = space_info;
@@ -5745,9 +5815,7 @@  static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
 	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
-				(u64)-1);
-	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
-	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
+				(u64)-1, NULL);
 	WARN_ON(fs_info->trans_block_rsv.size > 0);
 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5786,7 +5854,7 @@  void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
 
 	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
-				trans->chunk_bytes_reserved);
+				trans->chunk_bytes_reserved, NULL);
 	trans->chunk_bytes_reserved = 0;
 }
 
@@ -5886,95 +5954,37 @@  void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
 	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
 }
 
-/**
- * drop_over_reserved_extents - drop our extra extent reservations
- * @inode: the inode we're dropping the extent for
- *
- * We reserve extents we may use, but they may have been merged with other
- * extents and we may not need the extra reservation.
- *
- * We also call this when we've completed io to an extent or had an error and
- * cleared the outstanding extent, in either case we no longer need our
- * reservation and can drop the excess.
- */
-static unsigned drop_over_reserved_extents(struct btrfs_inode *inode)
-{
-	unsigned num_extents = 0;
-
-	if (inode->reserved_extents > inode->outstanding_extents) {
-		num_extents = inode->reserved_extents -
-			inode->outstanding_extents;
-		btrfs_mod_reserved_extents(inode, -num_extents);
-	}
-
-	if (inode->outstanding_extents == 0 &&
-	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
-			       &inode->runtime_flags))
-		num_extents++;
-	return num_extents;
-}
-
-/**
- * calc_csum_metadata_size - return the amount of metadata space that must be
- *	reserved/freed for the given bytes.
- * @inode: the inode we're manipulating
- * @num_bytes: the number of bytes in question
- * @reserve: 1 if we are reserving space, 0 if we are freeing space
- *
- * This adjusts the number of csum_bytes in the inode and then returns the
- * correct amount of metadata that must either be reserved or freed.  We
- * calculate how many checksums we can fit into one leaf and then divide the
- * number of bytes that will need to be checksumed by this value to figure out
- * how many checksums will be required.  If we are adding bytes then the number
- * may go up and we will return the number of additional bytes that must be
- * reserved.  If it is going down we will return the number of bytes that must
- * be freed.
- *
- * This must be called with BTRFS_I(inode)->lock held.
- */
-static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
-				   int reserve)
+static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
+						 struct btrfs_inode *inode)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-	u64 old_csums, num_csums;
+	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+	u64 reserve_size = 0;
+	u64 csum_leaves;
+	unsigned outstanding_extents;
 
-	if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
-		return 0;
+	ASSERT(spin_is_locked(&inode->lock));
+	outstanding_extents = inode->outstanding_extents;
+	if (outstanding_extents)
+		reserve_size = btrfs_calc_trans_metadata_size(fs_info,
+						outstanding_extents + 1);
+	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
+						 inode->csum_bytes);
+	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
+						       csum_leaves);
 
-	old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
-	if (reserve)
-		inode->csum_bytes += num_bytes;
-	else
-		inode->csum_bytes -= num_bytes;
-	num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
-
-	/* No change, no need to reserve more */
-	if (old_csums == num_csums)
-		return 0;
-
-	if (reserve)
-		return btrfs_calc_trans_metadata_size(fs_info,
-						      num_csums - old_csums);
-
-	return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
+	spin_lock(&block_rsv->lock);
+	block_rsv->size = reserve_size;
+	spin_unlock(&block_rsv->lock);
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 	struct btrfs_root *root = inode->root;
-	struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
-	u64 to_reserve = 0;
-	u64 csum_bytes;
-	unsigned nr_extents, reserve_extents;
+	unsigned nr_extents;
 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
 	int ret = 0;
 	bool delalloc_lock = true;
-	u64 to_free = 0;
-	unsigned dropped;
-	bool release_extra = false;
-	bool underflow = false;
-	bool did_retry = false;
 
 	WARN_ON(btrfs_is_free_space_inode(inode));
 
@@ -6001,33 +6011,13 @@  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 		mutex_lock(&inode->delalloc_mutex);
 
 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
-retry:
+
+	/* Add our new extents and calculate the new rsv size. */
 	spin_lock(&inode->lock);
-	reserve_extents = nr_extents = count_max_extents(num_bytes);
+	nr_extents = count_max_extents(num_bytes);
 	btrfs_mod_outstanding_extents(inode, nr_extents);
-
-	/*
-	 * Because we add an outstanding extent for ordered before we clear
-	 * delalloc we will double count our outstanding extents slightly.  This
-	 * could mean that we transiently over-reserve, which could result in an
-	 * early ENOSPC if our timing is unlucky.  Keep track of the case that
-	 * we had a reservation underflow so we can retry if we fail.
-	 *
-	 * Keep in mind we can legitimately have more outstanding extents than
-	 * reserved because of fragmentation, so only allow a retry once.
-	 */
-	if (inode->outstanding_extents >
-	    inode->reserved_extents + nr_extents) {
-		reserve_extents = inode->outstanding_extents -
-			inode->reserved_extents;
-		underflow = true;
-	}
-
-	/* We always want to reserve a slot for updating the inode. */
-	to_reserve = btrfs_calc_trans_metadata_size(fs_info,
-						    reserve_extents + 1);
-	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
-	csum_bytes = inode->csum_bytes;
+	inode->csum_bytes += num_bytes;
+	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 	spin_unlock(&inode->lock);
 
 	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
@@ -6037,100 +6027,26 @@  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 			goto out_fail;
 	}
 
-	ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
+	ret = btrfs_inode_rsv_refill(inode, flush);
 	if (unlikely(ret)) {
 		btrfs_qgroup_free_meta(root,
 				       nr_extents * fs_info->nodesize);
 		goto out_fail;
 	}
 
-	spin_lock(&inode->lock);
-	if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
-			     &inode->runtime_flags)) {
-		to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
-		release_extra = true;
-	}
-	btrfs_mod_reserved_extents(inode, reserve_extents);
-	spin_unlock(&inode->lock);
-
 	if (delalloc_lock)
 		mutex_unlock(&inode->delalloc_mutex);
-
-	if (to_reserve)
-		trace_btrfs_space_reservation(fs_info, "delalloc",
-					      btrfs_ino(inode), to_reserve, 1);
-	if (release_extra)
-		btrfs_block_rsv_release(fs_info, block_rsv,
-				btrfs_calc_trans_metadata_size(fs_info, 1));
 	return 0;
 
 out_fail:
 	spin_lock(&inode->lock);
 	nr_extents = count_max_extents(num_bytes);
 	btrfs_mod_outstanding_extents(inode, -nr_extents);
-
-	dropped = drop_over_reserved_extents(inode);
-	/*
-	 * If the inodes csum_bytes is the same as the original
-	 * csum_bytes then we know we haven't raced with any free()ers
-	 * so we can just reduce our inodes csum bytes and carry on.
-	 */
-	if (inode->csum_bytes == csum_bytes) {
-		calc_csum_metadata_size(inode, num_bytes, 0);
-	} else {
-		u64 orig_csum_bytes = inode->csum_bytes;
-		u64 bytes;
-
-		/*
-		 * This is tricky, but first we need to figure out how much we
-		 * freed from any free-ers that occurred during this
-		 * reservation, so we reset ->csum_bytes to the csum_bytes
-		 * before we dropped our lock, and then call the free for the
-		 * number of bytes that were freed while we were trying our
-		 * reservation.
-		 */
-		bytes = csum_bytes - inode->csum_bytes;
-		inode->csum_bytes = csum_bytes;
-		to_free = calc_csum_metadata_size(inode, bytes, 0);
-
-
-		/*
-		 * Now we need to see how much we would have freed had we not
-		 * been making this reservation and our ->csum_bytes were not
-		 * artificially inflated.
-		 */
-		inode->csum_bytes = csum_bytes - num_bytes;
-		bytes = csum_bytes - orig_csum_bytes;
-		bytes = calc_csum_metadata_size(inode, bytes, 0);
-
-		/*
-		 * Now reset ->csum_bytes to what it should be.  If bytes is
-		 * more than to_free then we would have freed more space had we
-		 * not had an artificially high ->csum_bytes, so we need to free
-		 * the remainder.  If bytes is the same or less then we don't
-		 * need to do anything, the other free-ers did the correct
-		 * thing.
-		 */
-		inode->csum_bytes = orig_csum_bytes - num_bytes;
-		if (bytes > to_free)
-			to_free = bytes - to_free;
-		else
-			to_free = 0;
-	}
+	inode->csum_bytes -= num_bytes;
+	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 	spin_unlock(&inode->lock);
-	if (dropped)
-		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
 
-	if (to_free) {
-		btrfs_block_rsv_release(fs_info, block_rsv, to_free);
-		trace_btrfs_space_reservation(fs_info, "delalloc",
-					      btrfs_ino(inode), to_free, 0);
-	}
-	if (underflow && !did_retry) {
-		did_retry = true;
-		underflow = false;
-		goto retry;
-	}
+	btrfs_inode_rsv_release(inode);
 	if (delalloc_lock)
 		mutex_unlock(&inode->delalloc_mutex);
 	return ret;
@@ -6148,25 +6064,17 @@  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-	u64 to_free = 0;
-	unsigned dropped;
 
 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 	spin_lock(&inode->lock);
-	dropped = drop_over_reserved_extents(inode);
-	if (num_bytes)
-		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+	inode->csum_bytes -= num_bytes;
+	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 	spin_unlock(&inode->lock);
-	if (dropped > 0)
-		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
 
 	if (btrfs_is_testing(fs_info))
 		return;
 
-	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
-				      to_free, 0);
-
-	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
+	btrfs_inode_rsv_release(inode);
 }
 
 /**
@@ -6184,25 +6092,17 @@  void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 	unsigned num_extents;
-	u64 to_free;
-	unsigned dropped;
 
 	spin_lock(&inode->lock);
 	num_extents = count_max_extents(num_bytes);
 	btrfs_mod_outstanding_extents(inode, -num_extents);
-	dropped = drop_over_reserved_extents(inode);
+	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 	spin_unlock(&inode->lock);
 
-	if (!dropped)
-		return;
-
 	if (btrfs_is_testing(fs_info))
 		return;
 
-	to_free = btrfs_calc_trans_metadata_size(fs_info, dropped);
-	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
-				      to_free, 0);
-	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
+	btrfs_inode_rsv_release(inode);
 }
 
 /**
@@ -8427,7 +8327,7 @@  static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
 			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
 	block_rsv_add_bytes(block_rsv, blocksize, 0);
-	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
+	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
 }
 
 /*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 912417b..b0d1e94 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@ 
 #include <linux/blkdev.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/uio.h>
+#include <linux/magic.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -303,7 +304,7 @@  static noinline int cow_file_range_inline(struct btrfs_root *root,
 		btrfs_free_path(path);
 		return PTR_ERR(trans);
 	}
-	trans->block_rsv = &fs_info->delalloc_block_rsv;
+	trans->block_rsv = &BTRFS_I(inode)->block_rsv;
 
 	if (compressed_size && compressed_pages)
 		extent_item_size = btrfs_file_extent_calc_inline_size(
@@ -2949,7 +2950,7 @@  static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 			trans = NULL;
 			goto out;
 		}
-		trans->block_rsv = &fs_info->delalloc_block_rsv;
+		trans->block_rsv = &BTRFS_I(inode)->block_rsv;
 		ret = btrfs_update_inode_fallback(trans, root, inode);
 		if (ret) /* -ENOMEM or corruption */
 			btrfs_abort_transaction(trans, ret);
@@ -2985,7 +2986,7 @@  static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 		goto out;
 	}
 
-	trans->block_rsv = &fs_info->delalloc_block_rsv;
+	trans->block_rsv = &BTRFS_I(inode)->block_rsv;
 
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 		compress_type = ordered_extent->compress_type;
@@ -8834,7 +8835,6 @@  static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (iov_iter_rw(iter) == WRITE) {
 		up_read(&BTRFS_I(inode)->dio_sem);
 		current->journal_info = NULL;
-		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
 		if (ret < 0 && ret != -EIOCBQUEUED) {
 			if (dio_data.reserve)
 				btrfs_delalloc_release_space(inode, data_reserved,
@@ -8855,6 +8855,7 @@  static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		} else if (ret >= 0 && (size_t)ret < count)
 			btrfs_delalloc_release_space(inode, data_reserved,
 					offset, count - (size_t)ret);
+		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
 	}
 out:
 	if (wakeup)
@@ -9419,6 +9420,7 @@  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_inode *ei;
 	struct inode *inode;
 
@@ -9445,8 +9447,9 @@  struct inode *btrfs_alloc_inode(struct super_block *sb)
 
 	spin_lock_init(&ei->lock);
 	ei->outstanding_extents = 0;
-	ei->reserved_extents = 0;
-
+	if (sb->s_magic != BTRFS_TEST_MAGIC)
+		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
+					      BTRFS_BLOCK_RSV_DELALLOC);
 	ei->runtime_flags = 0;
 	ei->prop_compress = BTRFS_COMPRESS_NONE;
 	ei->defrag_compress = BTRFS_COMPRESS_NONE;
@@ -9496,8 +9499,9 @@  void btrfs_destroy_inode(struct inode *inode)
 
 	WARN_ON(!hlist_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
+	WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
+	WARN_ON(BTRFS_I(inode)->block_rsv.size);
 	WARN_ON(BTRFS_I(inode)->outstanding_extents);
-	WARN_ON(BTRFS_I(inode)->reserved_extents);
 	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
 	WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
 	WARN_ON(BTRFS_I(inode)->csum_bytes);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 5e48b98..a9e63fb 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1713,28 +1713,6 @@  TRACE_EVENT(btrfs_inode_mod_outstanding_extents,
 			(unsigned long long)__entry->ino, __entry->mod)
 );
 
-TRACE_EVENT(btrfs_inode_mod_reserved_extents,
-	TP_PROTO(struct btrfs_root *root, u64 ino, int mod),
-
-	TP_ARGS(root, ino, mod),
-
-	TP_STRUCT__entry_btrfs(
-		__field(	u64, root_objectid	)
-		__field(	u64, ino		)
-		__field(	int, mod		)
-	),
-
-	TP_fast_assign_btrfs(root->fs_info,
-		__entry->root_objectid	= root->objectid;
-		__entry->ino		= ino;
-		__entry->mod		= mod;
-	),
-
-	TP_printk_btrfs("root = %llu(%s), ino = %llu, mod = %d",
-			show_root_type(__entry->root_objectid),
-			(unsigned long long)__entry->ino, __entry->mod)
-);
-
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */