| Message ID | 1458926760-17563-8-git-send-email-jbacik@fb.com (mailing list archive) |
| --- | --- |
| State | Superseded |
On Fri, Mar 25, 2016 at 01:25:53PM -0400, Josef Bacik wrote: > Our enospc flushing sucks. It is born from a time where we were early > enospc'ing constantly because multiple threads would race in for the same > reservation and randomly starve other ones out. So I came up with this solution > to block any other reservations from happening while one guy tried to flush > stuff to satisfy his reservation. This gives us pretty good correctness, but > completely crap latency. > > The solution I've come up with is ticketed reservations. Basically we try to > make our reservation, and if we can't we put a ticket on a list in order and > kick off an async flusher thread. This async flusher thread does the same old > flushing we always did, just asynchronously. As space is freed and added back > to the space_info it checks and sees if we have any tickets that need > satisfying, and adds space to the tickets and wakes up anything we've satisfied. > > Once the flusher thread stops making progress it wakes up all the current > tickets and tells them to take a hike. > > There is a priority list for things that can't flush, since the async flusher > could do anything we need to avoid deadlocks. These guys get priority for > having their reservation made, and will still do manual flushing themselves in > case the async flusher isn't running. > > This patch gives us significantly better latencies. Thanks, > > Signed-off-by: Josef Bacik <jbacik@fb.com> > --- > fs/btrfs/ctree.h | 2 + > fs/btrfs/extent-tree.c | 524 +++++++++++++++++++++++++++++++++++-------------- > 2 files changed, 375 insertions(+), 151 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index b675066..7437c8a 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -1229,6 +1229,8 @@ struct btrfs_space_info { > struct list_head list; > /* Protected by the spinlock 'lock'. 
*/ > struct list_head ro_bgs; > + struct list_head priority_tickets; > + struct list_head tickets; > > struct rw_semaphore groups_sem; > /* for block groups in our same type */ > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index 0db4319..1673365 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, > u64 num_bytes); > int btrfs_pin_extent(struct btrfs_root *root, > u64 bytenr, u64 num_bytes, int reserved); > +static int __reserve_metadata_bytes(struct btrfs_root *root, > + struct btrfs_space_info *space_info, > + u64 orig_bytes, > + enum btrfs_reserve_flush_enum flush); > +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, > + struct btrfs_space_info *space_info, > + u64 num_bytes); > +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, > + struct btrfs_space_info *space_info, > + u64 num_bytes); > > static noinline int > block_group_cache_done(struct btrfs_block_group_cache *cache) > @@ -3867,6 +3877,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, > found->bytes_readonly += bytes_readonly; > if (total_bytes > 0) > found->full = 0; > + space_info_add_new_bytes(info, found, total_bytes - > + bytes_used - bytes_readonly); > spin_unlock(&found->lock); > *space_info = found; > return 0; > @@ -3901,6 +3913,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, > found->flush = 0; > init_waitqueue_head(&found->wait); > INIT_LIST_HEAD(&found->ro_bgs); > + INIT_LIST_HEAD(&found->tickets); > + INIT_LIST_HEAD(&found->priority_tickets); > > ret = kobject_init_and_add(&found->kobj, &space_info_ktype, > info->space_info_kobj, "%s", > @@ -4514,12 +4528,19 @@ static int can_overcommit(struct btrfs_root *root, > struct btrfs_space_info *space_info, u64 bytes, > enum btrfs_reserve_flush_enum flush) > { > - struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; > - u64 profile = btrfs_get_alloc_profile(root, 0); > + struct btrfs_block_rsv *global_rsv; > + u64 profile; > u64 space_size; > u64 avail; > u64 used; > > + /* Don't overcommit when in mixed mode. 
*/ > + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) > + return 0; > + > + BUG_ON(root->fs_info == NULL); > + global_rsv = &root->fs_info->global_block_rsv; > + profile = btrfs_get_alloc_profile(root, 0); > used = space_info->bytes_used + space_info->bytes_reserved + > space_info->bytes_pinned + space_info->bytes_readonly; > > @@ -4669,6 +4690,11 @@ skip_async: > spin_unlock(&space_info->lock); > break; > } > + if (list_empty(&space_info->tickets) && > + list_empty(&space_info->priority_tickets)) { > + spin_unlock(&space_info->lock); > + break; > + } > spin_unlock(&space_info->lock); > > loops++; > @@ -4745,6 +4771,13 @@ enum flush_state { > COMMIT_TRANS = 6, > }; > > +struct reserve_ticket { > + u64 bytes; > + int error; > + struct list_head list; > + wait_queue_head_t wait; > +}; > + > static int flush_space(struct btrfs_root *root, > struct btrfs_space_info *space_info, u64 num_bytes, > u64 orig_bytes, int state) > @@ -4802,17 +4835,22 @@ static inline u64 > btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, > struct btrfs_space_info *space_info) > { > + struct reserve_ticket *ticket; > u64 used; > u64 expected; > - u64 to_reclaim; > + u64 to_reclaim = 0; > > to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); > - spin_lock(&space_info->lock); > if (can_overcommit(root, space_info, to_reclaim, > - BTRFS_RESERVE_FLUSH_ALL)) { > - to_reclaim = 0; > - goto out; > - } > + BTRFS_RESERVE_FLUSH_ALL)) > + return 0; > + > + list_for_each_entry(ticket, &space_info->tickets, list) > + to_reclaim += ticket->bytes; > + list_for_each_entry(ticket, &space_info->priority_tickets, list) > + to_reclaim += ticket->bytes; > + if (to_reclaim) > + return to_reclaim; > > used = space_info->bytes_used + space_info->bytes_reserved + > space_info->bytes_pinned + space_info->bytes_readonly + > @@ -4828,9 +4866,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, > to_reclaim = 0; > to_reclaim = min(to_reclaim, space_info->bytes_may_use + > space_info->bytes_reserved); > -out: > - spin_unlock(&space_info->lock); > - > return to_reclaim; > } > > @@ -4847,69 +4882,169 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, > !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); > } > > -static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, > - struct btrfs_fs_info *fs_info, > - int flush_state) > +static void wake_all_tickets(struct list_head *head) > { > - u64 used; > + struct reserve_ticket *ticket; > > - spin_lock(&space_info->lock); > - /* > - * We run out of space and have not got any free space via flush_space, > - * so don't bother doing async reclaim. > - */ > - if (flush_state > COMMIT_TRANS && space_info->full) { > - spin_unlock(&space_info->lock); > - return 0; > + while (!list_empty(head)) { > + ticket = list_first_entry(head, struct reserve_ticket, list); > + list_del_init(&ticket->list); > + ticket->error = -ENOSPC; > + wake_up(&ticket->wait); > } > - > - used = space_info->bytes_used + space_info->bytes_reserved + > - space_info->bytes_pinned + space_info->bytes_readonly + > - space_info->bytes_may_use; > - if (need_do_async_reclaim(space_info, fs_info, used)) { > - spin_unlock(&space_info->lock); > - return 1; > - } > - spin_unlock(&space_info->lock); > - > - return 0; > } > > +/* > + * This is for normal flushers, we can wait all goddamned day if we want to. We > + * will loop and continuously try to flush as long as we are making progress. > + * We count progress as clearing off tickets each time we have to loop. 
> + */ > static void btrfs_async_reclaim_metadata_space(struct work_struct *work) > { > + struct reserve_ticket *last_ticket = NULL; > struct btrfs_fs_info *fs_info; > struct btrfs_space_info *space_info; > u64 to_reclaim; > int flush_state; > + int commit_cycles = 0; > > fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); > space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); > > + spin_lock(&space_info->lock); > to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, > space_info); > - if (!to_reclaim) > + if (!to_reclaim) { > + space_info->flush = 0; > + spin_unlock(&space_info->lock); > return; > + } > + last_ticket = list_first_entry(&space_info->tickets, > + struct reserve_ticket, list); > + spin_unlock(&space_info->lock); > > flush_state = FLUSH_DELAYED_ITEMS_NR; > do { > + struct reserve_ticket *ticket; > + int ret; > + > + ret = flush_space(fs_info->fs_root, space_info, to_reclaim, > + to_reclaim, flush_state); > + spin_lock(&space_info->lock); > + if (list_empty(&space_info->tickets)) { > + space_info->flush = 0; > + spin_unlock(&space_info->lock); > + return; > + } > + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, > + space_info); > + ticket = list_first_entry(&space_info->tickets, > + struct reserve_ticket, list); > + if (last_ticket == ticket) { > + flush_state++; > + } else { > + last_ticket = ticket; > + flush_state = FLUSH_DELAYED_ITEMS_NR; > + if (commit_cycles) > + commit_cycles--; > + } > + > + if (flush_state > COMMIT_TRANS) { > + commit_cycles++; > + if (commit_cycles > 2) { > + wake_all_tickets(&space_info->tickets); > + space_info->flush = 0; > + } else { > + flush_state = FLUSH_DELAYED_ITEMS_NR; > + } > + } > + spin_unlock(&space_info->lock); > + } while (flush_state <= COMMIT_TRANS); > +} > + > +void btrfs_init_async_reclaim_work(struct work_struct *work) > +{ > + INIT_WORK(work, btrfs_async_reclaim_metadata_space); > +} > + > +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, > + struct btrfs_space_info *space_info, > + struct reserve_ticket *ticket) > +{ > + u64 to_reclaim; > + int flush_state = FLUSH_DELAYED_ITEMS_NR; > + > + spin_lock(&space_info->lock); > + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, > + space_info); > + if (!to_reclaim) { > + spin_unlock(&space_info->lock); > + return; > + } > + spin_unlock(&space_info->lock); > + > + do { > flush_space(fs_info->fs_root, space_info, to_reclaim, > to_reclaim, flush_state); > flush_state++; > - if (!btrfs_need_do_async_reclaim(space_info, fs_info, > - flush_state)) > + spin_lock(&space_info->lock); > + if (ticket->bytes == 0) { > + spin_unlock(&space_info->lock); > return; > + } > + spin_unlock(&space_info->lock); > + > + /* > + * Priority flushers can't wait on delalloc without > + * deadlocking. 
> + */ > + if (flush_state == FLUSH_DELALLOC || > + flush_state == FLUSH_DELALLOC_WAIT) > + flush_state = ALLOC_CHUNK; > } while (flush_state < COMMIT_TRANS); > } > > -void btrfs_init_async_reclaim_work(struct work_struct *work) > +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, > + struct btrfs_space_info *space_info, > + struct reserve_ticket *ticket, u64 orig_bytes) > + > { > - INIT_WORK(work, btrfs_async_reclaim_metadata_space); > + DEFINE_WAIT(wait); > + int ret = 0; > + > + spin_lock(&space_info->lock); > + while (ticket->bytes > 0 && ticket->error == 0) { > + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); > + if (ret) { > + ret = -EINTR; > + break; > + } > + spin_unlock(&space_info->lock); > + > + schedule(); > + > + finish_wait(&ticket->wait, &wait); > + spin_lock(&space_info->lock); > + } > + if (!ret) > + ret = ticket->error; > + if (!list_empty(&ticket->list)) > + list_del_init(&ticket->list); > + if (ticket->bytes && ticket->bytes < orig_bytes) { > + u64 num_bytes = orig_bytes - ticket->bytes; > + space_info->bytes_may_use -= num_bytes; > + trace_btrfs_space_reservation(fs_info, "space_info", > + space_info->flags, num_bytes, 0); > + } > + spin_unlock(&space_info->lock); > + > + return ret; > } > > /** > * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space > * @root - the root we're allocating for > - * @block_rsv - the block_rsv we're allocating for > + * @space_info - the space info we want to allocate from > * @orig_bytes - the number of bytes we want > * @flush - whether or not we can flush to make our reservation > * > @@ -4920,81 +5055,34 @@ void btrfs_init_async_reclaim_work(struct work_struct *work) > * regain reservations will be made and this will fail if there is not enough > * space already. > */ > -static int reserve_metadata_bytes(struct btrfs_root *root, > - struct btrfs_block_rsv *block_rsv, > - u64 orig_bytes, > - enum btrfs_reserve_flush_enum flush) > +static int __reserve_metadata_bytes(struct btrfs_root *root, > + struct btrfs_space_info *space_info, > + u64 orig_bytes, > + enum btrfs_reserve_flush_enum flush) > { > - struct btrfs_space_info *space_info = block_rsv->space_info; > + struct reserve_ticket ticket; > u64 used; > - u64 num_bytes = orig_bytes; > - int flush_state = FLUSH_DELAYED_ITEMS_NR; > int ret = 0; > - bool flushing = false; > > -again: > - ret = 0; > + ASSERT(orig_bytes); > spin_lock(&space_info->lock); > - /* > - * We only want to wait if somebody other than us is flushing and we > - * are actually allowed to flush all things. > - */ > - while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && > - space_info->flush) { > - spin_unlock(&space_info->lock); > - /* > - * If we have a trans handle we can't wait because the flusher > - * may have to commit the transaction, which would mean we would > - * deadlock since we are waiting for the flusher to finish, but > - * hold the current transaction open. > - */ > - if (current->journal_info) > - return -EAGAIN; > - ret = wait_event_killable(space_info->wait, !space_info->flush); > - /* Must have been killed, return */ > - if (ret) > - return -EINTR; > - > - spin_lock(&space_info->lock); > - } > - > ret = -ENOSPC; > used = space_info->bytes_used + space_info->bytes_reserved + > space_info->bytes_pinned + space_info->bytes_readonly + > space_info->bytes_may_use; > > /* > - * The idea here is that we've not already over-reserved the block group > - * then we can go ahead and save our reservation first and then start > - * flushing if we need to. 
Otherwise if we've already overcommitted > - * lets start flushing stuff first and then come back and try to make > - * our reservation. > + * If we have enough space then hooray, make our reservation and carry > + * on. If not see if we can overcommit, and if we can, hooray carry on. > + * If not things get more complicated. > */ > - if (used <= space_info->total_bytes) { > - if (used + orig_bytes <= space_info->total_bytes) { > - space_info->bytes_may_use += orig_bytes; > - trace_btrfs_space_reservation(root->fs_info, > - "space_info", space_info->flags, orig_bytes, 1); > - ret = 0; > - } else { > - /* > - * Ok set num_bytes to orig_bytes since we aren't > - * overocmmitted, this way we only try and reclaim what > - * we need. > - */ > - num_bytes = orig_bytes; > - } > - } else { > - /* > - * Ok we're over committed, set num_bytes to the overcommitted > - * amount plus the amount of bytes that we need for this > - * reservation. > - */ > - num_bytes = used - space_info->total_bytes + > - (orig_bytes * 2); > - } > - > - if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { > + if (used + orig_bytes <= space_info->total_bytes) { > + space_info->bytes_may_use += orig_bytes; > + trace_btrfs_space_reservation(root->fs_info, "space_info", > + space_info->flags, orig_bytes, > + 1); > + ret = 0; > + } else if (can_overcommit(root, space_info, orig_bytes, flush)) { > space_info->bytes_may_use += orig_bytes; > trace_btrfs_space_reservation(root->fs_info, "space_info", > space_info->flags, orig_bytes, > @@ -5003,16 +5091,27 @@ again: > } > > /* > - * Couldn't make our reservation, save our place so while we're trying > - * to reclaim space we can actually use it instead of somebody else > - * stealing it from us. > + * If we couldn't make a reservation then setup our reservation ticket > + * and kick the async worker if it's not already running. > * > - * We make the other tasks wait for the flush only when we can flush > - * all things. > + * If we are a priority flusher then we just need to add our ticket to > + * the list and we will do our own flushing further down. > */ > if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { > - flushing = true; > - space_info->flush = 1; > + ticket.bytes = orig_bytes; > + ticket.error = 0; > + init_waitqueue_head(&ticket.wait); > + if (flush == BTRFS_RESERVE_FLUSH_ALL) { > + list_add_tail(&ticket.list, &space_info->tickets); > + if (!space_info->flush) { > + space_info->flush = 1; > + queue_work(system_unbound_wq, > + &root->fs_info->async_reclaim_work); > + } > + } else { > + list_add_tail(&ticket.list, > + &space_info->priority_tickets); > + } > } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { > used += orig_bytes; > /* > @@ -5027,33 +5126,56 @@ again: > &root->fs_info->async_reclaim_work); > } > spin_unlock(&space_info->lock); > - > if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) > - goto out; > + return ret; > > - ret = flush_space(root, space_info, num_bytes, orig_bytes, > - flush_state); > - flush_state++; > + if (flush == BTRFS_RESERVE_FLUSH_ALL) > + return wait_reserve_ticket(root->fs_info, space_info, &ticket, > + orig_bytes); > > - /* > - * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock > - * would happen. So skip delalloc flush. 
> - */ > - if (flush == BTRFS_RESERVE_FLUSH_LIMIT && > - (flush_state == FLUSH_DELALLOC || > - flush_state == FLUSH_DELALLOC_WAIT)) > - flush_state = ALLOC_CHUNK; > + ret = 0; > + priority_reclaim_metadata_space(root->fs_info, space_info, &ticket); > + spin_lock(&space_info->lock); > + if (ticket.bytes) { > + if (ticket.bytes < orig_bytes) { > + u64 num_bytes = orig_bytes - ticket.bytes; > + space_info->bytes_may_use -= num_bytes; > + trace_btrfs_space_reservation(root->fs_info, > + "space_info", space_info->flags, > + num_bytes, 0); > > - if (!ret) > - goto again; > - else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && > - flush_state < COMMIT_TRANS) > - goto again; > - else if (flush == BTRFS_RESERVE_FLUSH_ALL && > - flush_state <= COMMIT_TRANS) > - goto again; > + } > + list_del_init(&ticket.list); > + ret = -ENOSPC; > + } > + spin_unlock(&space_info->lock); > + ASSERT(list_empty(&ticket.list)); > + return ret; > +} > > -out: > +/** > + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space > + * @root - the root we're allocating for > + * @block_rsv - the block_rsv we're allocating for > + * @orig_bytes - the number of bytes we want > + * @flush - whether or not we can flush to make our reservation > + * > + * This will reserve orgi_bytes number of bytes from the space info associated > + * with the block_rsv. If there is not enough space it will make an attempt to > + * flush out space to make room. It will do this by flushing delalloc if > + * possible or committing the transaction. If flush is 0 then no attempts to > + * regain reservations will be made and this will fail if there is not enough > + * space already. > + */ > +static int reserve_metadata_bytes(struct btrfs_root *root, > + struct btrfs_block_rsv *block_rsv, > + u64 orig_bytes, > + enum btrfs_reserve_flush_enum flush) > +{ > + int ret; > + > + ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, > + flush); > if (ret == -ENOSPC && > unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { > struct btrfs_block_rsv *global_rsv = > @@ -5066,13 +5188,8 @@ out: > if (ret == -ENOSPC) > trace_btrfs_space_reservation(root->fs_info, > "space_info:enospc", > - space_info->flags, orig_bytes, 1); > - if (flushing) { > - spin_lock(&space_info->lock); > - space_info->flush = 0; > - wake_up_all(&space_info->wait); > - spin_unlock(&space_info->lock); > - } > + block_rsv->space_info->flags, > + orig_bytes, 1); > return ret; > } > > @@ -5148,6 +5265,103 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, > return 0; > } > > +/* > + * This is for space we already have accounted in space_info->bytes_may_use, so > + * basically when we're returning space from block_rsv's. > + */ > +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, > + struct btrfs_space_info *space_info, > + u64 num_bytes) > +{ > + struct reserve_ticket *ticket; > + struct list_head *head; > + u64 used; > + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; > + bool check_overcommit = false; > + > + spin_lock(&space_info->lock); > + head = &space_info->priority_tickets; > + > + /* > + * First we want to see if we're over our limit, because if we are then > + * we need to make sure we are still ok overcommitting before we satisfy > + * another reservation. 
> + */ > + used = space_info->bytes_used + space_info->bytes_reserved + > + space_info->bytes_pinned + space_info->bytes_readonly; > + if (used - num_bytes >= space_info->total_bytes) > + check_overcommit = true; 'used' without bytes_may_use should be less than ->total_bytes, you wanna check if (used + num_bytes >= space_info->total_bytes) ? Others are sane to me. Reviewed-by: Liu Bo <bo.li.liu@oracle.com> Thanks, -liubo > +again: > + while (!list_empty(head) && num_bytes) { > + ticket = list_first_entry(head, struct reserve_ticket, > + list); > + if (check_overcommit && > + !can_overcommit(fs_info->extent_root, space_info, > + ticket->bytes, flush)) > + break; > + if (num_bytes >= ticket->bytes) { > + list_del_init(&ticket->list); > + num_bytes -= ticket->bytes; > + ticket->bytes = 0; > + wake_up(&ticket->wait); > + } else { > + ticket->bytes -= num_bytes; > + num_bytes = 0; > + } > + } > + > + if (num_bytes && head == &space_info->priority_tickets) { > + head = &space_info->tickets; > + flush = BTRFS_RESERVE_FLUSH_ALL; > + goto again; > + } > + space_info->bytes_may_use -= num_bytes; > + trace_btrfs_space_reservation(fs_info, "space_info", > + space_info->flags, num_bytes, 0); > + spin_unlock(&space_info->lock); > +} > + > +/* > + * This is for newly allocated space that isn't accounted in > + * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent > + * we use this helper. > + */ > +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, > + struct btrfs_space_info *space_info, > + u64 num_bytes) > +{ > + struct reserve_ticket *ticket; > + struct list_head *head = &space_info->priority_tickets; > + > +again: > + while (!list_empty(head) && num_bytes) { > + ticket = list_first_entry(head, struct reserve_ticket, > + list); > + if (num_bytes >= ticket->bytes) { > + trace_btrfs_space_reservation(fs_info, "space_info", > + space_info->flags, > + ticket->bytes, 1); > + list_del_init(&ticket->list); > + num_bytes -= ticket->bytes; > + space_info->bytes_may_use += ticket->bytes; > + ticket->bytes = 0; > + wake_up(&ticket->wait); > + } else { > + trace_btrfs_space_reservation(fs_info, "space_info", > + space_info->flags, > + num_bytes, 1); > + space_info->bytes_may_use += num_bytes; > + ticket->bytes -= num_bytes; > + num_bytes = 0; > + } > + } > + > + if (num_bytes && head == &space_info->priority_tickets) { > + head = &space_info->tickets; > + goto again; > + } > +} > + > static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, > struct btrfs_block_rsv *block_rsv, > struct btrfs_block_rsv *dest, u64 num_bytes) > @@ -5182,13 +5396,9 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, > } > spin_unlock(&dest->lock); > } > - if (num_bytes) { > - spin_lock(&space_info->lock); > - space_info->bytes_may_use -= num_bytes; > - trace_btrfs_space_reservation(fs_info, "space_info", > - space_info->flags, num_bytes, 0); > - spin_unlock(&space_info->lock); > - } > + if (num_bytes) > + space_info_add_old_bytes(fs_info, space_info, > + num_bytes); > } > } > > @@ -6346,17 +6556,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, > readonly = true; > } > spin_unlock(&cache->lock); > - if (!readonly && global_rsv->space_info == space_info) { > + if (!readonly && return_free_space && > + global_rsv->space_info == space_info) { > + u64 to_add = len; > + WARN_ON(!return_free_space); > spin_lock(&global_rsv->lock); > if (!global_rsv->full) { > - len = min(len, global_rsv->size - > - global_rsv->reserved); > - 
global_rsv->reserved += len; > - space_info->bytes_may_use += len; > + to_add = min(len, global_rsv->size - > + global_rsv->reserved); > + global_rsv->reserved += to_add; > + space_info->bytes_may_use += to_add; > if (global_rsv->reserved >= global_rsv->size) > global_rsv->full = 1; > + trace_btrfs_space_reservation(fs_info, > + "space_info", > + space_info->flags, > + to_add, 1); > + len -= to_add; > } > spin_unlock(&global_rsv->lock); > + /* Add to any tickets we may have */ > + if (len) > + space_info_add_new_bytes(fs_info, space_info, > + len); > } > spin_unlock(&space_info->lock); > } > -- > 2.5.0
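To make the review question above about the check in space_info_add_old_bytes() concrete, here is a tiny stand-alone illustration with made-up numbers (nothing below comes from the patch itself): it just prints what the check as posted and the check suggested in the review evaluate to when `used` excludes bytes_may_use.

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Toy numbers only, chosen for illustration. */
	uint64_t total_bytes = 100;
	uint64_t used = 60;	/* bytes_used + bytes_reserved + bytes_pinned +
				 * bytes_readonly, i.e. without bytes_may_use */
	uint64_t num_bytes = 50;	/* space being returned to the space_info */

	/* Check as written in the patch: */
	printf("used - num_bytes >= total_bytes -> %d\n",
	       used - num_bytes >= total_bytes);	/* prints 0 */
	/* Check as suggested in the review: */
	printf("used + num_bytes >= total_bytes -> %d\n",
	       used + num_bytes >= total_bytes);	/* prints 1 */
	return 0;
}
```

With these values only the second form would set check_overcommit and make the code consult can_overcommit() before handing the returned space to waiting tickets.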
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b675066..7437c8a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1229,6 +1229,8 @@ struct btrfs_space_info { struct list_head list; /* Protected by the spinlock 'lock'. */ struct list_head ro_bgs; + struct list_head priority_tickets; + struct list_head tickets; struct rw_semaphore groups_sem; /* for block groups in our same type */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0db4319..1673365 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int reserved); +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -3867,6 +3877,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly += bytes_readonly; if (total_bytes > 0) found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); spin_unlock(&found->lock); *space_info = found; return 0; @@ -3901,6 +3913,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->flush = 0; init_waitqueue_head(&found->wait); INIT_LIST_HEAD(&found->ro_bgs); + INIT_LIST_HEAD(&found->tickets); + INIT_LIST_HEAD(&found->priority_tickets); ret = kobject_init_and_add(&found->kobj, &space_info_ktype, info->space_info_kobj, "%s", @@ -4514,12 +4528,19 @@ static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - u64 profile = btrfs_get_alloc_profile(root, 0); + struct btrfs_block_rsv *global_rsv; + u64 profile; u64 space_size; u64 avail; u64 used; + /* Don't overcommit when in mixed mode. 
*/ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + BUG_ON(root->fs_info == NULL); + global_rsv = &root->fs_info->global_block_rsv; + profile = btrfs_get_alloc_profile(root, 0); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; @@ -4669,6 +4690,11 @@ skip_async: spin_unlock(&space_info->lock); break; } + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } spin_unlock(&space_info->lock); loops++; @@ -4745,6 +4771,13 @@ enum flush_state { COMMIT_TRANS = 6, }; +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; +}; + static int flush_space(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 num_bytes, u64 orig_bytes, int state) @@ -4802,17 +4835,22 @@ static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, struct btrfs_space_info *space_info) { + struct reserve_ticket *ticket; u64 used; u64 expected; - u64 to_reclaim; + u64 to_reclaim = 0; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) { - to_reclaim = 0; - goto out; - } + BTRFS_RESERVE_FLUSH_ALL)) + return 0; + + list_for_each_entry(ticket, &space_info->tickets, list) + to_reclaim += ticket->bytes; + list_for_each_entry(ticket, &space_info->priority_tickets, list) + to_reclaim += ticket->bytes; + if (to_reclaim) + return to_reclaim; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + @@ -4828,9 +4866,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, to_reclaim = 0; to_reclaim = min(to_reclaim, space_info->bytes_may_use + space_info->bytes_reserved); -out: - spin_unlock(&space_info->lock); - return to_reclaim; } @@ -4847,69 +4882,169 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } -static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info, - int flush_state) +static void wake_all_tickets(struct list_head *head) { - u64 used; + struct reserve_ticket *ticket; - spin_lock(&space_info->lock); - /* - * We run out of space and have not got any free space via flush_space, - * so don't bother doing async reclaim. - */ - if (flush_state > COMMIT_TRANS && space_info->full) { - spin_unlock(&space_info->lock); - return 0; + while (!list_empty(head)) { + ticket = list_first_entry(head, struct reserve_ticket, list); + list_del_init(&ticket->list); + ticket->error = -ENOSPC; + wake_up(&ticket->wait); } - - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - if (need_do_async_reclaim(space_info, fs_info, used)) { - spin_unlock(&space_info->lock); - return 1; - } - spin_unlock(&space_info->lock); - - return 0; } +/* + * This is for normal flushers, we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. 
+ */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { + struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; + int commit_cycles = 0; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, space_info); - if (!to_reclaim) + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); return; + } + last_ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; do { + struct reserve_ticket *ticket; + int ret; + + ret = flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + if (last_ticket == ticket) { + flush_state++; + } else { + last_ticket = ticket; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + wake_all_tickets(&space_info->tickets); + space_info->flush = 0; + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + u64 to_reclaim; + int flush_state = FLUSH_DELAYED_ITEMS_NR; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + + do { flush_space(fs_info->fs_root, space_info, to_reclaim, to_reclaim, flush_state); flush_state++; - if (!btrfs_need_do_async_reclaim(space_info, fs_info, - flush_state)) + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); return; + } + spin_unlock(&space_info->lock); + + /* + * Priority flushers can't wait on delalloc without + * deadlocking. 
+ */ + if (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT) + flush_state = ALLOC_CHUNK; } while (flush_state < COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, u64 orig_bytes) + { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + DEFINE_WAIT(wait); + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + ret = -EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + if (!ret) + ret = ticket->error; + if (!list_empty(&ticket->list)) + list_del_init(&ticket->list); + if (ticket->bytes && ticket->bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket->bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + } + spin_unlock(&space_info->lock); + + return ret; } /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for + * @space_info - the space info we want to allocate from * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * @@ -4920,81 +5055,34 @@ void btrfs_init_async_reclaim_work(struct work_struct *work) * regain reservations will be made and this will fail if there is not enough * space already. */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *space_info = block_rsv->space_info; + struct reserve_ticket ticket; u64 used; - u64 num_bytes = orig_bytes; - int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; - bool flushing = false; -again: - ret = 0; + ASSERT(orig_bytes); spin_lock(&space_info->lock); - /* - * We only want to wait if somebody other than us is flushing and we - * are actually allowed to flush all things. - */ - while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && - space_info->flush) { - spin_unlock(&space_info->lock); - /* - * If we have a trans handle we can't wait because the flusher - * may have to commit the transaction, which would mean we would - * deadlock since we are waiting for the flusher to finish, but - * hold the current transaction open. - */ - if (current->journal_info) - return -EAGAIN; - ret = wait_event_killable(space_info->wait, !space_info->flush); - /* Must have been killed, return */ - if (ret) - return -EINTR; - - spin_lock(&space_info->lock); - } - ret = -ENOSPC; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; /* - * The idea here is that we've not already over-reserved the block group - * then we can go ahead and save our reservation first and then start - * flushing if we need to. Otherwise if we've already overcommitted - * lets start flushing stuff first and then come back and try to make - * our reservation. + * If we have enough space then hooray, make our reservation and carry + * on. 
If not see if we can overcommit, and if we can, hooray carry on. + * If not things get more complicated. */ - if (used <= space_info->total_bytes) { - if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } else { - /* - * Ok set num_bytes to orig_bytes since we aren't - * overocmmitted, this way we only try and reclaim what - * we need. - */ - num_bytes = orig_bytes; - } - } else { - /* - * Ok we're over committed, set num_bytes to the overcommitted - * amount plus the amount of bytes that we need for this - * reservation. - */ - num_bytes = used - space_info->total_bytes + - (orig_bytes * 2); - } - - if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; + } else if (can_overcommit(root, space_info, orig_bytes, flush)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", space_info->flags, orig_bytes, @@ -5003,16 +5091,27 @@ again: } /* - * Couldn't make our reservation, save our place so while we're trying - * to reclaim space we can actually use it instead of somebody else - * stealing it from us. + * If we couldn't make a reservation then setup our reservation ticket + * and kick the async worker if it's not already running. * - * We make the other tasks wait for the flush only when we can flush - * all things. + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - flushing = true; - space_info->flush = 1; + ticket.bytes = orig_bytes; + ticket.error = 0; + init_waitqueue_head(&ticket.wait); + if (flush == BTRFS_RESERVE_FLUSH_ALL) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + space_info->flush = 1; + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -5027,33 +5126,56 @@ again: &root->fs_info->async_reclaim_work); } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) - goto out; + return ret; - ret = flush_space(root, space_info, num_bytes, orig_bytes, - flush_state); - flush_state++; + if (flush == BTRFS_RESERVE_FLUSH_ALL) + return wait_reserve_ticket(root->fs_info, space_info, &ticket, + orig_bytes); - /* - * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock - * would happen. So skip delalloc flush. 
- */ - if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - (flush_state == FLUSH_DELALLOC || - flush_state == FLUSH_DELALLOC_WAIT)) - flush_state = ALLOC_CHUNK; + ret = 0; + priority_reclaim_metadata_space(root->fs_info, space_info, &ticket); + spin_lock(&space_info->lock); + if (ticket.bytes) { + if (ticket.bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket.bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", space_info->flags, + num_bytes, 0); - if (!ret) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - flush_state < COMMIT_TRANS) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_ALL && - flush_state <= COMMIT_TRANS) - goto again; + } + list_del_init(&ticket.list); + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + ASSERT(list_empty(&ticket.list)); + return ret; +} -out: +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, + flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { struct btrfs_block_rsv *global_rsv = @@ -5066,13 +5188,8 @@ out: if (ret == -ENOSPC) trace_btrfs_space_reservation(root->fs_info, "space_info:enospc", - space_info->flags, orig_bytes, 1); - if (flushing) { - spin_lock(&space_info->lock); - space_info->flush = 0; - wake_up_all(&space_info->wait); - spin_unlock(&space_info->lock); - } + block_rsv->space_info->flags, + orig_bytes, 1); return ret; } @@ -5148,6 +5265,103 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. + */ +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head; + u64 used; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + bool check_overcommit = false; + + spin_lock(&space_info->lock); + head = &space_info->priority_tickets; + + /* + * First we want to see if we're over our limit, because if we are then + * we need to make sure we are still ok overcommitting before we satisfy + * another reservation. 
+ */ + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly; + if (used - num_bytes >= space_info->total_bytes) + check_overcommit = true; +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + if (check_overcommit && + !can_overcommit(fs_info->extent_root, space_info, + ticket->bytes, flush)) + break; + if (num_bytes >= ticket->bytes) { + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + ticket->bytes = 0; + wake_up(&ticket->wait); + } else { + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); +} + +/* + * This is for newly allocated space that isn't accounted in + * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent + * we use this helper. + */ +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head = &space_info->priority_tickets; + +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + if (num_bytes >= ticket->bytes) { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + ticket->bytes, 1); + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + space_info->bytes_may_use += ticket->bytes; + ticket->bytes = 0; + wake_up(&ticket->wait); + } else { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + num_bytes, 1); + space_info->bytes_may_use += num_bytes; + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + goto again; + } +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -5182,13 +5396,9 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } spin_unlock(&dest->lock); } - if (num_bytes) { - spin_lock(&space_info->lock); - space_info->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); - } + if (num_bytes) + space_info_add_old_bytes(fs_info, space_info, + num_bytes); } } @@ -6346,17 +6556,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, readonly = true; } spin_unlock(&cache->lock); - if (!readonly && global_rsv->space_info == space_info) { + if (!readonly && return_free_space && + global_rsv->space_info == space_info) { + u64 to_add = len; + WARN_ON(!return_free_space); spin_lock(&global_rsv->lock); if (!global_rsv->full) { - len = min(len, global_rsv->size - - global_rsv->reserved); - global_rsv->reserved += len; - space_info->bytes_may_use += len; + to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; + space_info->bytes_may_use += to_add; if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; + trace_btrfs_space_reservation(fs_info, + "space_info", + space_info->flags, + to_add, 1); + len -= to_add; } spin_unlock(&global_rsv->lock); + /* Add to any tickets we may have 
*/ + if (len) + space_info_add_new_bytes(fs_info, space_info, + len); } spin_unlock(&space_info->lock); }
Our enospc flushing sucks. It is born from a time when we were early enospc'ing constantly because multiple threads would race in for the same reservation and randomly starve other ones out. So I came up with this solution to block any other reservations from happening while one guy tried to flush stuff to satisfy his reservation. This gives us pretty good correctness, but completely crap latency.

The solution I've come up with is ticketed reservations. Basically we try to make our reservation, and if we can't we put a ticket on a list in order and kick off an async flusher thread. This async flusher thread does the same old flushing we always did, just asynchronously. As space is freed and added back to the space_info it checks and sees if we have any tickets that need satisfying, and adds space to the tickets and wakes up anything we've satisfied.

Once the flusher thread stops making progress it wakes up all the current tickets and tells them to take a hike.

There is a priority list for things that can't flush; since the async flusher could do anything, we need to avoid deadlocks. These guys get priority for having their reservation made, and will still do manual flushing themselves in case the async flusher isn't running.

This patch gives us significantly better latencies. Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 fs/btrfs/ctree.h       |   2 +
 fs/btrfs/extent-tree.c | 524 +++++++++++++++++++++++++++++++++++--------------
 2 files changed, 375 insertions(+), 151 deletions(-)
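For readers skimming the cover text, the following is a minimal user-space sketch of the ticketing idea only, not the btrfs code: space_info, ticket, reserve_bytes() and add_free_bytes() are invented names, a pthread mutex and condition variable stand in for the space_info spinlock and per-ticket wait queue, and the async flusher, the priority ticket list and the give-up path that fails tickets with -ENOSPC are only mentioned in comments.

```c
/*
 * Toy model of ticketed reservations, in user space.  A reservation that
 * cannot be satisfied immediately queues a ticket; space that is freed later
 * is handed to the queued tickets in FIFO order and the waiters are woken.
 */
#include <pthread.h>
#include <stdint.h>

struct ticket {
	uint64_t bytes;			/* bytes this waiter is still missing */
	int error;			/* a flusher that gives up would set -ENOSPC here */
	pthread_cond_t wait;
	struct ticket *next;
};

struct space_info {
	pthread_mutex_t lock;
	uint64_t total;			/* total space in this space_info */
	uint64_t used;			/* space currently reserved */
	struct ticket *head, *tail;	/* FIFO of waiting tickets */
};

/* Try to reserve @bytes; if that fails, queue a ticket and sleep on it. */
int reserve_bytes(struct space_info *si, uint64_t bytes)
{
	struct ticket t = { .bytes = bytes, .error = 0, .next = NULL };
	int ret;

	pthread_mutex_lock(&si->lock);
	if (si->used + bytes <= si->total) {
		/* Fast path: enough space, take it and return. */
		si->used += bytes;
		pthread_mutex_unlock(&si->lock);
		return 0;
	}
	/* Slow path: append a ticket.  The real code would also kick the
	 * async flusher here, or flush directly for priority tickets. */
	pthread_cond_init(&t.wait, NULL);
	if (si->tail)
		si->tail->next = &t;
	else
		si->head = &t;
	si->tail = &t;
	while (t.bytes > 0 && t.error == 0)
		pthread_cond_wait(&t.wait, &si->lock);
	ret = t.error;
	pthread_mutex_unlock(&si->lock);
	pthread_cond_destroy(&t.wait);
	return ret;
}

/* Called when @bytes of reserved space are returned: fill tickets in order. */
void add_free_bytes(struct space_info *si, uint64_t bytes)
{
	pthread_mutex_lock(&si->lock);
	while (si->head && bytes) {
		struct ticket *t = si->head;
		uint64_t grant = t->bytes < bytes ? t->bytes : bytes;

		/* Granted space stays accounted as used, now on behalf of
		 * the waiting reservation instead of the caller. */
		t->bytes -= grant;
		bytes -= grant;
		if (t->bytes == 0) {
			/* Ticket fully satisfied: dequeue it and wake the waiter. */
			si->head = t->next;
			if (!si->head)
				si->tail = NULL;
			pthread_cond_signal(&t->wait);
		}
	}
	si->used -= bytes;	/* whatever no ticket wanted really becomes free */
	pthread_mutex_unlock(&si->lock);
}
```

The property mirrored from the patch is that freed space is handed to queued tickets in order and remains accounted as reserved on their behalf, so later arrivals cannot jump the queue the way racing reservations could with the old flush-and-retry loop.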