diff mbox

[v3,06/10] writeback: introduce super_operations->write_metadata

Message ID 1513029335-5112-7-git-send-email-josef@toxicpanda.com (mailing list archive)
State New, archived
Headers show

Commit Message

Josef Bacik Dec. 11, 2017, 9:55 p.m. UTC
From: Josef Bacik <jbacik@fb.com>

Now that we have metadata counters in the VM, we need to provide a way to kick
writeback on dirty metadata.  Introduce super_operations->write_metadata.  This
allows file systems to deal with writing back any dirty metadata we need based
on the writeback needs of the system.  Since there is no inode to key off of we
need a list in the bdi for dirty super blocks to be added.  From there we can
find any dirty sb's on the bdi we are currently doing writeback on and call into
their ->write_metadata callback.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
 fs/fs-writeback.c                | 72 ++++++++++++++++++++++++++++++++++++----
 fs/super.c                       |  6 ++++
 include/linux/backing-dev-defs.h |  2 ++
 include/linux/fs.h               |  4 +++
 mm/backing-dev.c                 |  2 ++
 5 files changed, 80 insertions(+), 6 deletions(-)

Comments

Dave Chinner Dec. 11, 2017, 11:36 p.m. UTC | #1
On Mon, Dec 11, 2017 at 04:55:31PM -0500, Josef Bacik wrote:
> From: Josef Bacik <jbacik@fb.com>
> 
> Now that we have metadata counters in the VM, we need to provide a way to kick
> writeback on dirty metadata.  Introduce super_operations->write_metadata.  This
> allows file systems to deal with writing back any dirty metadata we need based
> on the writeback needs of the system.  Since there is no inode to key off of we
> need a list in the bdi for dirty super blocks to be added.  From there we can
> find any dirty sb's on the bdi we are currently doing writeback on and call into
> their ->write_metadata callback.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
> Reviewed-by: Tejun Heo <tj@kernel.org>
> ---
>  fs/fs-writeback.c                | 72 ++++++++++++++++++++++++++++++++++++----
>  fs/super.c                       |  6 ++++
>  include/linux/backing-dev-defs.h |  2 ++
>  include/linux/fs.h               |  4 +++
>  mm/backing-dev.c                 |  2 ++
>  5 files changed, 80 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 987448ed7698..fba703dff678 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -1479,6 +1479,31 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
>  	return pages;
>  }
>  
> +static long writeback_sb_metadata(struct super_block *sb,
> +				  struct bdi_writeback *wb,
> +				  struct wb_writeback_work *work)
> +{
> +	struct writeback_control wbc = {
> +		.sync_mode		= work->sync_mode,
> +		.tagged_writepages	= work->tagged_writepages,
> +		.for_kupdate		= work->for_kupdate,
> +		.for_background		= work->for_background,
> +		.for_sync		= work->for_sync,
> +		.range_cyclic		= work->range_cyclic,
> +		.range_start		= 0,
> +		.range_end		= LLONG_MAX,
> +	};
> +	long write_chunk;
> +
> +	write_chunk = writeback_chunk_size(wb, work);
> +	wbc.nr_to_write = write_chunk;
> +	sb->s_op->write_metadata(sb, &wbc);
> +	work->nr_pages -= write_chunk - wbc.nr_to_write;
> +
> +	return write_chunk - wbc.nr_to_write;

Ok, writeback_chunk_size() returns a page count. We've already gone
through the "metadata is not page sized" dance on the dirty
accounting side, so how are we supposed to use pages to account for
metadata writeback?

And, from what I can tell, if work->sync_mode = WB_SYNC_ALL or
work->tagged_writepages is set, this will basically tell us to flush
the entire dirty metadata cache because write_chunk will get set to
LONG_MAX.

IOWs, this would appear to me to change sync() behaviour quite
dramatically on filesystems where ->write_metadata is implemented.
That is, instead of leaving all the metadata dirty in memory and
just forcing the journal to stable storage, filesystems will be told
to also write back all their dirty metadata before sync() returns,
even though it is not necessary to provide correct sync()
semantics....

Mind you, writeback invocation is so convoluted now I could easily
be mis-interpretting this code, but it does seem to me like this
code is going to have some unintended behaviours....

Cheers,

Dave.
Josef Bacik Dec. 12, 2017, 6:05 p.m. UTC | #2
On Tue, Dec 12, 2017 at 10:36:19AM +1100, Dave Chinner wrote:
> On Mon, Dec 11, 2017 at 04:55:31PM -0500, Josef Bacik wrote:
> > From: Josef Bacik <jbacik@fb.com>
> > 
> > Now that we have metadata counters in the VM, we need to provide a way to kick
> > writeback on dirty metadata.  Introduce super_operations->write_metadata.  This
> > allows file systems to deal with writing back any dirty metadata we need based
> > on the writeback needs of the system.  Since there is no inode to key off of we
> > need a list in the bdi for dirty super blocks to be added.  From there we can
> > find any dirty sb's on the bdi we are currently doing writeback on and call into
> > their ->write_metadata callback.
> > 
> > Signed-off-by: Josef Bacik <jbacik@fb.com>
> > Reviewed-by: Jan Kara <jack@suse.cz>
> > Reviewed-by: Tejun Heo <tj@kernel.org>
> > ---
> >  fs/fs-writeback.c                | 72 ++++++++++++++++++++++++++++++++++++----
> >  fs/super.c                       |  6 ++++
> >  include/linux/backing-dev-defs.h |  2 ++
> >  include/linux/fs.h               |  4 +++
> >  mm/backing-dev.c                 |  2 ++
> >  5 files changed, 80 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> > index 987448ed7698..fba703dff678 100644
> > --- a/fs/fs-writeback.c
> > +++ b/fs/fs-writeback.c
> > @@ -1479,6 +1479,31 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
> >  	return pages;
> >  }
> >  
> > +static long writeback_sb_metadata(struct super_block *sb,
> > +				  struct bdi_writeback *wb,
> > +				  struct wb_writeback_work *work)
> > +{
> > +	struct writeback_control wbc = {
> > +		.sync_mode		= work->sync_mode,
> > +		.tagged_writepages	= work->tagged_writepages,
> > +		.for_kupdate		= work->for_kupdate,
> > +		.for_background		= work->for_background,
> > +		.for_sync		= work->for_sync,
> > +		.range_cyclic		= work->range_cyclic,
> > +		.range_start		= 0,
> > +		.range_end		= LLONG_MAX,
> > +	};
> > +	long write_chunk;
> > +
> > +	write_chunk = writeback_chunk_size(wb, work);
> > +	wbc.nr_to_write = write_chunk;
> > +	sb->s_op->write_metadata(sb, &wbc);
> > +	work->nr_pages -= write_chunk - wbc.nr_to_write;
> > +
> > +	return write_chunk - wbc.nr_to_write;
> 
> Ok, writeback_chunk_size() returns a page count. We've already gone
> through the "metadata is not page sized" dance on the dirty
> accounting side, so how are we supposed to use pages to account for
> metadata writeback?
> 

This is just one of those things that's going to be slightly shitty.  It's the
same for memory reclaim, all of those places use pages so we just take
METADATA_*_BYTES >> PAGE_SHIFT to get pages and figure it's close enough.

> And, from what I can tell, if work->sync_mode = WB_SYNC_ALL or
> work->tagged_writepages is set, this will basically tell us to flush
> the entire dirty metadata cache because write_chunk will get set to
> LONG_MAX.
> 
> IOWs, this would appear to me to change sync() behaviour quite
> dramatically on filesystems where ->write_metadata is implemented.
> That is, instead of leaving all the metadata dirty in memory and
> just forcing the journal to stable storage, filesystems will be told
> to also write back all their dirty metadata before sync() returns,
> even though it is not necessary to provide correct sync()
> semantics....

Well for btrfs that's exactly what we have currently since it's just backed by
an inode.  Obviously this is different for journaled fs'es, but I assumed that
in your case you would either not use this part of the infrastructure or simply
ignore WB_SYNC_ALL and use WB_SYNC_NONE as a way to be nice under memory
pressure or whatever.

> 
> Mind you, writeback invocation is so convoluted now I could easily
> be mis-interpretting this code, but it does seem to me like this
> code is going to have some unintended behaviours....
> 

I don't think so, because right now this behavior is exactly what btrfs has
currently with its inode setup.  I didn't really think the journaled use case
out since you guys are already rate limited by the journal.  If you would want
to start using this stuff what would you like to see done instead?  Thanks,

Josef
Dave Chinner Dec. 12, 2017, 10:20 p.m. UTC | #3
On Tue, Dec 12, 2017 at 01:05:35PM -0500, Josef Bacik wrote:
> On Tue, Dec 12, 2017 at 10:36:19AM +1100, Dave Chinner wrote:
> > On Mon, Dec 11, 2017 at 04:55:31PM -0500, Josef Bacik wrote:
> > > From: Josef Bacik <jbacik@fb.com>
> > > 
> > > Now that we have metadata counters in the VM, we need to provide a way to kick
> > > writeback on dirty metadata.  Introduce super_operations->write_metadata.  This
> > > allows file systems to deal with writing back any dirty metadata we need based
> > > on the writeback needs of the system.  Since there is no inode to key off of we
> > > need a list in the bdi for dirty super blocks to be added.  From there we can
> > > find any dirty sb's on the bdi we are currently doing writeback on and call into
> > > their ->write_metadata callback.
> > > 
> > > Signed-off-by: Josef Bacik <jbacik@fb.com>
> > > Reviewed-by: Jan Kara <jack@suse.cz>
> > > Reviewed-by: Tejun Heo <tj@kernel.org>
> > > ---
> > >  fs/fs-writeback.c                | 72 ++++++++++++++++++++++++++++++++++++----
> > >  fs/super.c                       |  6 ++++
> > >  include/linux/backing-dev-defs.h |  2 ++
> > >  include/linux/fs.h               |  4 +++
> > >  mm/backing-dev.c                 |  2 ++
> > >  5 files changed, 80 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> > > index 987448ed7698..fba703dff678 100644
> > > --- a/fs/fs-writeback.c
> > > +++ b/fs/fs-writeback.c
> > > @@ -1479,6 +1479,31 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
> > >  	return pages;
> > >  }
> > >  
> > > +static long writeback_sb_metadata(struct super_block *sb,
> > > +				  struct bdi_writeback *wb,
> > > +				  struct wb_writeback_work *work)
> > > +{
> > > +	struct writeback_control wbc = {
> > > +		.sync_mode		= work->sync_mode,
> > > +		.tagged_writepages	= work->tagged_writepages,
> > > +		.for_kupdate		= work->for_kupdate,
> > > +		.for_background		= work->for_background,
> > > +		.for_sync		= work->for_sync,
> > > +		.range_cyclic		= work->range_cyclic,
> > > +		.range_start		= 0,
> > > +		.range_end		= LLONG_MAX,
> > > +	};
> > > +	long write_chunk;
> > > +
> > > +	write_chunk = writeback_chunk_size(wb, work);
> > > +	wbc.nr_to_write = write_chunk;
> > > +	sb->s_op->write_metadata(sb, &wbc);
> > > +	work->nr_pages -= write_chunk - wbc.nr_to_write;
> > > +
> > > +	return write_chunk - wbc.nr_to_write;
> > 
> > Ok, writeback_chunk_size() returns a page count. We've already gone
> > through the "metadata is not page sized" dance on the dirty
> > accounting side, so how are we supposed to use pages to account for
> > metadata writeback?
> > 
> 
> This is just one of those things that's going to be slightly shitty.  It's the
> same for memory reclaim, all of those places use pages so we just take
> METADATA_*_BYTES >> PAGE_SHIFT to get pages and figure it's close enough.

Ok, so that isn't exactly easy to deal with, because all our
metadata writeback is based on log sequence number targets (i.e. how
far to push the tail of the log towards the current head). We've
actually got no idea how pages/bytes actually map to a LSN target
because while we might account a full buffer as dirty for memory
reclaim purposes (up to 64k in size), we might have only logged 128
bytes of it.

i.e. if we are asked to push 2MB of metadata and we treat that as
2MB of log space (i.e. push target of tail LSN + 2MB) we could have
logged several tens of megabytes of dirty metadata in that LSN
range and have to flush it all. OTOH, if the buffers are fully
logged, then that same target might only flush 1.5MB of metadata
once all the log overhead is taken into account.

So there's a fairly large disconnect between the "flush N bytes of
metadata" API and the "push to a target LSN" that XFS uses for
flushing metadata in aged order. I'm betting that extN and other
filesystems might have similar mismatches with their journal
flushing...

> > And, from what I can tell, if work->sync_mode = WB_SYNC_ALL or
> > work->tagged_writepages is set, this will basically tell us to flush
> > the entire dirty metadata cache because write_chunk will get set to
> > LONG_MAX.
> > 
> > IOWs, this would appear to me to change sync() behaviour quite
> > dramatically on filesystems where ->write_metadata is implemented.
> > That is, instead of leaving all the metadata dirty in memory and
> > just forcing the journal to stable storage, filesystems will be told
> > to also write back all their dirty metadata before sync() returns,
> > even though it is not necessary to provide correct sync()
> > semantics....
> 
> Well for btrfs that's exactly what we have currently since it's just backed by
> an inode.

Hmmmm. That explains a lot.

Seems to me that btrfs is the odd one out here, so I'm not sure a
mechanism primarily designed for btrfs is going to work
generically....

> Obviously this is different for journaled fs'es, but I assumed that
> in your case you would either not use this part of the infrastructure or simply
> ignore WB_SYNC_ALL and use WB_SYNC_NONE as a way to be nice under memory
> pressure or whatever.

I don't think that designing an interface with the assumption other
filesystems will abuse it until it works for them is a great process
to follow...

> > Mind you, writeback invocation is so convoluted now I could easily
> > be mis-interpretting this code, but it does seem to me like this
> > code is going to have some unintended behaviours....
> > 
> 
> I don't think so, because right now this behavior is exactly what btrfs has
> currently with its inode setup.  I didn't really think the journaled use case
> out since you guys are already rate limited by the journal.

We are?

XFS is rate limited by metadata writeback, not journal throughput.
Yes, journal space is limited by the metadata writeback rate, but
journalling itself is not the bottleneck.

> If you would want
> to start using this stuff what would you like to see done instead?  Thanks,

If this is all about reacting to memory pressure, then writeback is
not the mechanism that should drive this writeback. Reacting to
memory pressure is what shrinkers are for, and XFS already triggers
metadata writeback on memory pressure. Hence I don't see how this
writeback mechanism would help us if we have to abuse it to infer
"memory pressure occurring".

What I was hoping for was this interface to be a mechanism to drive
periodic background metadata writeback from the VFS so that when we
start to run out of memory the VFS has already started to ramp up
the rate of metadata writeback so we don't have huge amounts of dirty
metadata to write back during superblock shrinker based reclaim.

i.e. it works more like dirty background data writeback, gets the
amount of work to do from the amount of dirty metadata associated
with the bdi and doesn't actually do anything when operations like
sync() are run because there isn't a need to writeback metadata in
those operations.

IOWs, treating metadata like it's one great big data inode doesn't
seem to me to be the right abstraction to use for this - in most
filesystems it's a bunch of objects with a complex dependency tree
and unknown write ordering, not an inode full of data that can be
sequentially written.

Maybe we need multiple ops with well defined behaviours. e.g.
->writeback_metadata() for background writeback, ->sync_metadata() for
sync based operations. That way different filesystems can ignore the
parts they don't need simply by not implementing those operations,
and the writeback code doesn't need to try to cater for all
operations through the one op. The writeback code should be cleaner,
the filesystem code should be cleaner, and we can tailor the work
guidelines for each operation separately so there's less mismatch
between what writeback is asking and how filesystems track dirty
metadata...

Cheers,

Dave.
Josef Bacik Dec. 12, 2017, 11:59 p.m. UTC | #4
On Wed, Dec 13, 2017 at 09:20:04AM +1100, Dave Chinner wrote:
> On Tue, Dec 12, 2017 at 01:05:35PM -0500, Josef Bacik wrote:
> > On Tue, Dec 12, 2017 at 10:36:19AM +1100, Dave Chinner wrote:
> > > On Mon, Dec 11, 2017 at 04:55:31PM -0500, Josef Bacik wrote:
> > > > From: Josef Bacik <jbacik@fb.com>
> > > > 
> > > > Now that we have metadata counters in the VM, we need to provide a way to kick
> > > > writeback on dirty metadata.  Introduce super_operations->write_metadata.  This
> > > > allows file systems to deal with writing back any dirty metadata we need based
> > > > on the writeback needs of the system.  Since there is no inode to key off of we
> > > > need a list in the bdi for dirty super blocks to be added.  From there we can
> > > > find any dirty sb's on the bdi we are currently doing writeback on and call into
> > > > their ->write_metadata callback.
> > > > 
> > > > Signed-off-by: Josef Bacik <jbacik@fb.com>
> > > > Reviewed-by: Jan Kara <jack@suse.cz>
> > > > Reviewed-by: Tejun Heo <tj@kernel.org>
> > > > ---
> > > >  fs/fs-writeback.c                | 72 ++++++++++++++++++++++++++++++++++++----
> > > >  fs/super.c                       |  6 ++++
> > > >  include/linux/backing-dev-defs.h |  2 ++
> > > >  include/linux/fs.h               |  4 +++
> > > >  mm/backing-dev.c                 |  2 ++
> > > >  5 files changed, 80 insertions(+), 6 deletions(-)
> > > > 
> > > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> > > > index 987448ed7698..fba703dff678 100644
> > > > --- a/fs/fs-writeback.c
> > > > +++ b/fs/fs-writeback.c
> > > > @@ -1479,6 +1479,31 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
> > > >  	return pages;
> > > >  }
> > > >  
> > > > +static long writeback_sb_metadata(struct super_block *sb,
> > > > +				  struct bdi_writeback *wb,
> > > > +				  struct wb_writeback_work *work)
> > > > +{
> > > > +	struct writeback_control wbc = {
> > > > +		.sync_mode		= work->sync_mode,
> > > > +		.tagged_writepages	= work->tagged_writepages,
> > > > +		.for_kupdate		= work->for_kupdate,
> > > > +		.for_background		= work->for_background,
> > > > +		.for_sync		= work->for_sync,
> > > > +		.range_cyclic		= work->range_cyclic,
> > > > +		.range_start		= 0,
> > > > +		.range_end		= LLONG_MAX,
> > > > +	};
> > > > +	long write_chunk;
> > > > +
> > > > +	write_chunk = writeback_chunk_size(wb, work);
> > > > +	wbc.nr_to_write = write_chunk;
> > > > +	sb->s_op->write_metadata(sb, &wbc);
> > > > +	work->nr_pages -= write_chunk - wbc.nr_to_write;
> > > > +
> > > > +	return write_chunk - wbc.nr_to_write;
> > > 
> > > Ok, writeback_chunk_size() returns a page count. We've already gone
> > > through the "metadata is not page sized" dance on the dirty
> > > accounting side, so how are we supposed to use pages to account for
> > > metadata writeback?
> > > 
> > 
> > This is just one of those things that's going to be slightly shitty.  It's the
> > same for memory reclaim, all of those places use pages so we just take
> > METADATA_*_BYTES >> PAGE_SHIFT to get pages and figure it's close enough.
> 
> Ok, so that isn't exactly easy to deal with, because all our
> metadata writeback is based on log sequence number targets (i.e. how
> far to push the tail of the log towards the current head). We've
> actually got no idea how pages/bytes actually map to a LSN target
> because while we might account a full buffer as dirty for memory
> reclaim purposes (up to 64k in size), we might have only logged 128
> bytes of it.
> 
> i.e. if we are asked to push 2MB of metadata and we treat that as
> 2MB of log space (i.e. push target of tail LSN + 2MB) we could have
> logged several tens of megabytes of dirty metadata in that LSN
> range and have to flush it all. OTOH, if the buffers are fully
> logged, then that same target might only flush 1.5MB of metadata
> once all the log overhead is taken into account.
> 
> So there's a fairly large disconnect between the "flush N bytes of
> metadata" API and the "push to a target LSN" that XFS uses for
> flushing metadata in aged order. I'm betting that extN and other
> filesystems might have similar mismatches with their journal
> flushing...
> 

If there's not a correlation then there's no sense in xfs using this.  If btrfs
has 16gib of dirty metadata then that's exactly how much we have to write out,
which is what this is designed for.

> > > And, from what I can tell, if work->sync_mode = WB_SYNC_ALL or
> > > work->tagged_writepages is set, this will basically tell us to flush
> > > the entire dirty metadata cache because write_chunk will get set to
> > > LONG_MAX.
> > > 
> > > IOWs, this would appear to me to change sync() behaviour quite
> > > dramatically on filesystems where ->write_metadata is implemented.
> > > That is, instead of leaving all the metadata dirty in memory and
> > > just forcing the journal to stable storage, filesystems will be told
> > > to also write back all their dirty metadata before sync() returns,
> > > even though it is not necessary to provide correct sync()
> > > semantics....
> > 
> > Well for btrfs that's exactly what we have currently since it's just backed by
> > an inode.
> 
> Hmmmm. That explains a lot.
> 
> Seems to me that btrfs is the odd one out here, so I'm not sure a
> mechanism primarily designed for btrfs is going to work
> generically....
> 

The generic stuff is very lightweight specifically because we don't need a whole
lot, just a way to get all of the balance dirty pages logic without duplicating
it internally in btrfs.

> > Obviously this is different for journaled fs'es, but I assumed that
> > in your case you would either not use this part of the infrastructure or simply
> > ignore WB_SYNC_ALL and use WB_SYNC_NONE as a way to be nice under memory
> > pressure or whatever.
> 
> I don't think that designing an interface with the assumption other
> filesystems will abuse it until it works for them is a great process
> to follow...
> 

Again not really designing it with your stuff in mind.  ext* and xfs already
handle dirty metadata fine, btrfs is the odd man out so we need a little extra.
It would be cool to at least use the accounting part of it in xfs and ext* so we
could see how much of the system memory is in use by metadata, but I imagine the
dirty metadata tracking is going to be mostly useless for you guys.

> > > Mind you, writeback invocation is so convoluted now I could easily
> > > be mis-interpretting this code, but it does seem to me like this
> > > code is going to have some unintended behaviours....
> > > 
> > 
> > I don't think so, because right now this behavior is exactly what btrfs has
> > currently with its inode setup.  I didn't really think the journaled use case
> > out since you guys are already rate limited by the journal.
> 
> We are?
> 
> XFS is rate limited by metadata writeback, not journal throughput.
> Yes, journal space is limited by the metadata writeback rate, but
> journalling itself is not the bottleneck.
> 

I'm not saying "rate limited" as in xfs sucks because journal.  I'm saying your
dirty metadata foot print is limited by your journal size, so you aren't going
to have gigabytes of dirty metadata sitting around needing to be flushed (I
assume, I'm going on previous discussions with you about this.)

> > If you would want
> > to start using this stuff what would you like to see done instead?  Thanks,
> 
> If this is all about reacting to memory pressure, then writeback is
> not the mechanism that should drive this writeback. Reacting to
> memory pressure is what shrinkers are for, and XFS already triggers
> metadata writeback on memory pressure. Hence I don't see how this
> writeback mechanism would help us if we have to abuse it to infer
> "memory pressure occurring"
> 

This isn't reacting to memory pressure, it's reacting to dirty pressure.  Btrfs
is only limited by system memory for its metadata, so I want all the benefits of
years of work on balance_dirty_pages() without having to duplicate the effort
internally in btrfs.  This is how I'm going about doing it.

> What I was hoping for was this interface to be a mechanism to drive
> periodic background metadata writeback from the VFS so that when we
> start to run out of memory the VFS has already started to ramp up
> the rate of metadata writeback so we don't have huge amounts of dirty
> metadata to write back during superblock shrinker based reclaim.
> 
> i.e. it works more like dirty background data writeback, gets the
> amount of work to do from the amount of dirty metadata associated
> with the bdi and doesn't actually do anything when operations like
> sync() are run because there isn't a need to writeback metadata in
> those operations.
> 
> IOWs, treating metadata like it's one great big data inode doesn't
> seem to me to be the right abstraction to use for this - in most
> filesystems it's a bunch of objects with a complex dependency tree
> and unknown write ordering, not an inode full of data that can be
> sequentially written.

But this isn't dictating what to write out, just how much we need to undirty.
How the fs wants to write stuff out is completely up to the file system, I
specifically made it as generic as possible so we could do whatever we felt like
with the numbers we got.  This work gives you exactly what you want, a callback
when balance dirty pages is telling us that hey we have too much dirty memory in
use on the system.

> 
> Maybe we need multiple ops with well defined behaviours. e.g.
> ->writeback_metadata() for background writeback, ->sync_metadata() for
> sync based operations. That way different filesystems can ignore the
> parts they don't need simply by not implementing those operations,
> and the writeback code doesn't need to try to cater for all
> operations through the one op. The writeback code should be cleaner,
> the filesystem code should be cleaner, and we can tailor the work
> guidelines for each operation separately so there's less mismatch
> between what writeback is asking and how filesystems track dirty
> metadata...
> 

So I don't mind adding new things or changing around, but this is just getting
us the same behavior that I mentioned before, only at a higher level.  We want
the balance_dirty_pages() stuff to be able to dip into metadata writeback via
the method that I've implemented here.  Basically do data writeback, and if we
didn't do enough do some metadata writeback.  With what you've proposed we would
keep that and instead of doing ->write_metadata() when we have SYNC_ALL we'd
just do ->sync_metadata() and let the fs figure out what to do, which is what I
was suggesting fs'es do.

The problem is there's a disconnect between what btrfs and ext4 do with their
dirty metadata and what xfs does.  Ext4 is going to log entire blocks into the
journal, so there's a 1:1 mapping of dirty metadata to what's going to be
written out.  So telling it "write x pages" worth of metadata is going to be
somewhat useful.  That's not the case for xfs, and I'm not sure what a good way
to accommodate you would look like.  My first thought is a ratio, but man trying
to change how we dealt with slab ratios made me want to suck start a shotgun so I
don't really want to do something like that again.

How would you prefer to get information to act on from upper layers?  Personally
I feel like the generic writeback stuff already gives us enough info and we can
figure out what we want to do from there.  Thanks,

Josef

ps: I'm going to try and stay up for a while so we can hash this out now instead
of switching back and forth through our timezones.
Jan Kara Dec. 19, 2017, 12:07 p.m. UTC | #5
On Wed 13-12-17 09:20:04, Dave Chinner wrote:
> On Tue, Dec 12, 2017 at 01:05:35PM -0500, Josef Bacik wrote:
> > On Tue, Dec 12, 2017 at 10:36:19AM +1100, Dave Chinner wrote:
> > > On Mon, Dec 11, 2017 at 04:55:31PM -0500, Josef Bacik wrote:
> > > > From: Josef Bacik <jbacik@fb.com>
> > > > 
> > > > Now that we have metadata counters in the VM, we need to provide a way to kick
> > > > writeback on dirty metadata.  Introduce super_operations->write_metadata.  This
> > > > allows file systems to deal with writing back any dirty metadata we need based
> > > > on the writeback needs of the system.  Since there is no inode to key off of we
> > > > need a list in the bdi for dirty super blocks to be added.  From there we can
> > > > find any dirty sb's on the bdi we are currently doing writeback on and call into
> > > > their ->write_metadata callback.
> > > > 
> > > > Signed-off-by: Josef Bacik <jbacik@fb.com>
> > > > Reviewed-by: Jan Kara <jack@suse.cz>
> > > > Reviewed-by: Tejun Heo <tj@kernel.org>
> > > > ---
> > > >  fs/fs-writeback.c                | 72 ++++++++++++++++++++++++++++++++++++----
> > > >  fs/super.c                       |  6 ++++
> > > >  include/linux/backing-dev-defs.h |  2 ++
> > > >  include/linux/fs.h               |  4 +++
> > > >  mm/backing-dev.c                 |  2 ++
> > > >  5 files changed, 80 insertions(+), 6 deletions(-)
> > > > 
> > > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> > > > index 987448ed7698..fba703dff678 100644
> > > > --- a/fs/fs-writeback.c
> > > > +++ b/fs/fs-writeback.c
> > > > @@ -1479,6 +1479,31 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
> > > >  	return pages;
> > > >  }
> > > >  
> > > > +static long writeback_sb_metadata(struct super_block *sb,
> > > > +				  struct bdi_writeback *wb,
> > > > +				  struct wb_writeback_work *work)
> > > > +{
> > > > +	struct writeback_control wbc = {
> > > > +		.sync_mode		= work->sync_mode,
> > > > +		.tagged_writepages	= work->tagged_writepages,
> > > > +		.for_kupdate		= work->for_kupdate,
> > > > +		.for_background		= work->for_background,
> > > > +		.for_sync		= work->for_sync,
> > > > +		.range_cyclic		= work->range_cyclic,
> > > > +		.range_start		= 0,
> > > > +		.range_end		= LLONG_MAX,
> > > > +	};
> > > > +	long write_chunk;
> > > > +
> > > > +	write_chunk = writeback_chunk_size(wb, work);
> > > > +	wbc.nr_to_write = write_chunk;
> > > > +	sb->s_op->write_metadata(sb, &wbc);
> > > > +	work->nr_pages -= write_chunk - wbc.nr_to_write;
> > > > +
> > > > +	return write_chunk - wbc.nr_to_write;
> > > 
> > > Ok, writeback_chunk_size() returns a page count. We've already gone
> > > through the "metadata is not page sized" dance on the dirty
> > > accounting side, so how are we supposed to use pages to account for
> > > metadata writeback?
> > > 
> > 
> > This is just one of those things that's going to be slightly shitty.  It's the
> > same for memory reclaim, all of those places use pages so we just take
> > METADATA_*_BYTES >> PAGE_SHIFT to get pages and figure it's close enough.
> 
> Ok, so that isn't exactly easy to deal with, because all our
> metadata writeback is based on log sequence number targets (i.e. how
> far to push the tail of the log towards the current head). We've
> actually got no idea how pages/bytes actually map to a LSN target
> because while we might account a full buffer as dirty for memory
> reclaim purposes (up to 64k in size), we might have only logged 128
> bytes of it.
> 
> i.e. if we are asked to push 2MB of metadata and we treat that as
> 2MB of log space (i.e. push target of tail LSN + 2MB) we could have
> logged several tens of megabytes of dirty metadata in that LSN
> range and have to flush it all. OTOH, if the buffers are fully
> logged, then that same target might only flush 1.5MB of metadata
> once all the log overhead is taken into account.
> 
> So there's a fairly large disconnect between the "flush N bytes of
> metadata" API and the "push to a target LSN" that XFS uses for
> flushing metadata in aged order. I'm betting that extN and other
> filesystems might have similar mismatches with their journal
> flushing...

Well, for ext4 it isn't as bad since we do full block logging only. So if
we are asked to flush N pages, we can easily translate that to number of fs
blocks and flush that many from the oldest transaction.

Couldn't XFS just track how much it has cleaned (from reclaim perspective)
when pushing items from AIL (which is what I suppose XFS would do in
response to metadata writeback request) and just stop pushing when it has
cleaned as much as it was asked to?

> > > And, from what I can tell, if work->sync_mode = WB_SYNC_ALL or
> > > work->tagged_writepages is set, this will basically tell us to flush
> > > the entire dirty metadata cache because write_chunk will get set to
> > > LONG_MAX.
> > > 
> > > IOWs, this would appear to me to change sync() behaviour quite
> > > dramatically on filesystems where ->write_metadata is implemented.
> > > That is, instead of leaving all the metadata dirty in memory and
> > > just forcing the journal to stable storage, filesystems will be told
> > > to also write back all their dirty metadata before sync() returns,
> > > even though it is not necessary to provide correct sync()
> > > semantics....
> > 
> > Well for btrfs that's exactly what we have currently since it's just backed by
> > an inode.
> 
> Hmmmm. That explains a lot.
> 
> Seems to me that btrfs is the odd one out here, so I'm not sure a
> mechanism primarily designed for btrfs is going to work
> generically....

For record ext4 is currently behaving the way btrfs is as well (at least in
practice). We expose committed but not yet checkpointed transaction data
(equivalent of XFS's AIL AFAICT) in block device's page cache as dirty
buffers. Thus calls like sync_blockdev() which are called as part of
sync(2) will result in flushing all of those metadata buffers to disk
(although as you properly point out it is not strictly necessary for
correctness of sync(2)).

> > Obviously this is different for journaled fs'es, but I assumed that
> > in your case you would either not use this part of the infrastructure or simply
> > ignore WB_SYNC_ALL and use WB_SYNC_NONE as a way to be nice under memory
> > pressure or whatever.
> 
> I don't think that designing an interface with the assumption other
> filesystems will abuse it until it works for them is a great process
> to follow...
> 
> > > Mind you, writeback invocation is so convoluted now I could easily
> > > be mis-interpretting this code, but it does seem to me like this
> > > code is going to have some unintended behaviours....
> > > 
> > 
> > I don't think so, because right now this behavior is exactly what btrfs has
> > currently with it's inode setup.  I didn't really think the journaled use case
> > out since you guys are already rate limited by the journal.
> 
> We are?
> 
> XFS is rate limited by metadata writeback, not journal throughput.
> Yes, journal space is limited by the metadata writeback rate, but
> journalling itself is not the bottleneck.
> 
> > If you would want
> > to start using this stuff what would you like to see done instead?  Thanks,
> 
> If this is all about reacting to memory pressure, then writeback is
> not the mechanism that should drive this writeback. Reacting to
> memory pressure is what shrinkers are for, and XFS already triggers
> metadata writeback on memory pressure. Hence I don't see how this
> writeback mechanism would help us if we have to abuse it to infer
> "memory pressure occurring"
> 
> What I was hoping for was this interface to be a mechanism to drive
> periodic background metadata writeback from the VFS so that when we
> start to run out of memory the VFS has already started to ramp up
> the rate of metadata writeback so we don't have huge amounts of dirty
> metadata to write back during superblock shrinker based reclaim.

Yeah, that's where I'd like this patch set to end up as well and I believe
Josef is as well - he wants to prevent too much dirty metadata to get
accumulated after all and background writeback of metadata is a part of
that.

> i.e. it works more like dirty background data writeback, gets the
> amount of work to do from the amount of dirty metadata associated
> with the bdi and doesn't actually do anything when operations like
> sync() are run because there isn't a need to writeback metadata in
> those operations.
> 
> IOWs, treating metadata like it's one great big data inode doesn't
> seem to me to be the right abstraction to use for this - in most
> filesystems it's a bunch of objects with a complex dependency tree
> and unknown write ordering, not an inode full of data that can be
> sequentially written.
> 
> Maybe we need multiple ops with well defined behaviours. e.g.
> ->writeback_metadata() for background writeback, ->sync_metadata() for
> sync based operations. That way different filesystems can ignore the
> parts they don't need simply by not implementing those operations,
> and the writeback code doesn't need to try to cater for all
> operations through the one op. The writeback code should be cleaner,
> the filesystem code should be cleaner, and we can tailor the work
> guidelines for each operation separately so there's less mismatch
> between what writeback is asking and how filesystems track dirty
> metadata...

I agree that writeback for memory cleaning and writeback for data integrity
are two very different things especially for metadata. In fact for data
integrity writeback we already have ->sync_fs operation so there the
functionality gets duplicated. What we could do is that in
writeback_sb_inodes() we'd call ->write_metadata only when
work->for_kupdate or work->for_background is set. That way ->write_metadata
would be called only for memory cleaning purposes.

								Honza
Jan Kara Dec. 19, 2017, 12:21 p.m. UTC | #6
On Mon 11-12-17 16:55:31, Josef Bacik wrote:
> @@ -1621,12 +1647,18 @@ static long writeback_sb_inodes(struct super_block *sb,
>  		 * background threshold and other termination conditions.
>  		 */
>  		if (wrote) {
> -			if (time_is_before_jiffies(start_time + HZ / 10UL))
> -				break;
> -			if (work->nr_pages <= 0)
> +			if (time_is_before_jiffies(start_time + HZ / 10UL) ||
> +			    work->nr_pages <= 0) {
> +				done = true;
>  				break;
> +			}
>  		}
>  	}
> +	if (!done && sb->s_op->write_metadata) {
> +		spin_unlock(&wb->list_lock);
> +		wrote += writeback_sb_metadata(sb, wb, work);
> +		spin_lock(&wb->list_lock);
> +	}
>  	return wrote;
>  }

One thing I've notice when looking at this patch again: This duplicates the
metadata writeback done in __writeback_inodes_wb(). So you probably need a
new helper function like writeback_sb() that will call writeback_sb_inodes()
and handle metadata writeback and call that from wb_writeback() instead of
writeback_sb_inodes() directly.

								Honza

> @@ -1635,6 +1667,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
>  {
>  	unsigned long start_time = jiffies;
>  	long wrote = 0;
> +	bool done = false;
>  
>  	while (!list_empty(&wb->b_io)) {
>  		struct inode *inode = wb_inode(wb->b_io.prev);
> @@ -1654,12 +1687,39 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
>  
>  		/* refer to the same tests at the end of writeback_sb_inodes */
>  		if (wrote) {
> -			if (time_is_before_jiffies(start_time + HZ / 10UL))
> -				break;
> -			if (work->nr_pages <= 0)
> +			if (time_is_before_jiffies(start_time + HZ / 10UL) ||
> +			    work->nr_pages <= 0) {
> +				done = true;
>  				break;
> +			}
>  		}
>  	}
> +
> +	if (!done && wb_stat(wb, WB_METADATA_DIRTY_BYTES)) {
> +		LIST_HEAD(list);
> +
> +		spin_unlock(&wb->list_lock);
> +		spin_lock(&wb->bdi->sb_list_lock);
> +		list_splice_init(&wb->bdi->dirty_sb_list, &list);
> +		while (!list_empty(&list)) {
> +			struct super_block *sb;
> +
> +			sb = list_first_entry(&list, struct super_block,
> +					      s_bdi_dirty_list);
> +			list_move_tail(&sb->s_bdi_dirty_list,
> +				       &wb->bdi->dirty_sb_list);
> +			if (!sb->s_op->write_metadata)
> +				continue;
> +			if (!trylock_super(sb))
> +				continue;
> +			spin_unlock(&wb->bdi->sb_list_lock);
> +			wrote += writeback_sb_metadata(sb, wb, work);
> +			spin_lock(&wb->bdi->sb_list_lock);
> +			up_read(&sb->s_umount);
> +		}
> +		spin_unlock(&wb->bdi->sb_list_lock);
> +		spin_lock(&wb->list_lock);
> +	}
>  	/* Leave any unwritten inodes on b_io */
>  	return wrote;
>  }
Dave Chinner Dec. 19, 2017, 9:35 p.m. UTC | #7
On Tue, Dec 19, 2017 at 01:07:09PM +0100, Jan Kara wrote:
> On Wed 13-12-17 09:20:04, Dave Chinner wrote:
> > On Tue, Dec 12, 2017 at 01:05:35PM -0500, Josef Bacik wrote:
> > > On Tue, Dec 12, 2017 at 10:36:19AM +1100, Dave Chinner wrote:
> > > > On Mon, Dec 11, 2017 at 04:55:31PM -0500, Josef Bacik wrote:
> > > This is just one of those things that's going to be slightly shitty.  It's the
> > > same for memory reclaim, all of those places use pages so we just take
> > > METADATA_*_BYTES >> PAGE_SHIFT to get pages and figure it's close enough.
> > 
> > Ok, so that isn't exactly easy to deal with, because all our
> > metadata writeback is based on log sequence number targets (i.e. how
> > far to push the tail of the log towards the current head). We've
> > actually got no idea how pages/bytes actually map to a LSN target
> > because while we might account a full buffer as dirty for memory
> > reclaim purposes (up to 64k in size), we might have only logged 128
> > bytes of it.
> > 
> > i.e. if we are asked to push 2MB of metadata and we treat that as
> > 2MB of log space (i.e. push target of tail LSN + 2MB) we could have
> > logged several tens of megabytes of dirty metadata in that LSN
> > range and have to flush it all. OTOH, if the buffers are fully
> > logged, then that same target might only flush 1.5MB of metadata
> > once all the log overhead is taken into account.
> > 
> > So there's a fairly large disconnect between the "flush N bytes of
> > metadata" API and the "push to a target LSN" that XFS uses for
> > flushing metadata in aged order. I'm betting that extN and other
> > filesystems might have similar mismatches with their journal
> > flushing...
> 
> Well, for ext4 it isn't as bad since we do full block logging only. So if
> we are asked to flush N pages, we can easily translate that to number of fs
> blocks and flush that many from the oldest transaction.
> 
> Couldn't XFS just track how much it has cleaned (from reclaim perspective)
> when pushing items from AIL (which is what I suppose XFS would do in
> response to metadata writeback request) and just stop pushing when it has
> cleaned as much as it was asked to?

If only it were that simple :/

To start with, flushing the dirty objects (such as inodes) to their
backing buffers does not mean that the object is clean once the
writeback completes. XFS has decoupled in-memory objects with
logical object logging rather than logging physical buffers, and
so can be modified and dirtied while the inode buffer
is being written back. Hence if we just count things like "buffer
size written" it's not actually a correct account of the amount of
dirty metadata we've cleaned. If we don't get that right, it'll
result in accounting errors and incorrect behaviour.

The bigger problem, however, is that we have no channel to return
flush information from the AIL pushing to whatever caller asked for
the push. Pushing metadata is completely decoupled from every other
subsystem. i.e. the caller asked the xfsaild to push to a specific
LSN (e.g. to free up a certain amount of log space for new
transactions), and *nothing* has any idea of how much metadata we'll
need to write to push the tail of the log to that LSN.

It's also completely asynchronous - there's no mechanism for waiting
on a push to a specific LSN. Anything that needs a specific amount
of log space to be available waits in ordered ticket queues on the
log tail moving forwards. The only interfaces that have access to
the log tail ticket waiting is the transaction reservation
subsystem, which cannot be used during metadata writeback because
that's a guaranteed deadlock vector....

Saying "just account for bytes written" assumes directly connected,
synchronous dispatch metadata writeback infrastructure which we
simply don't have in XFS. "just clean this many bytes" doesn't
really fit at all because we have no way of referencing that to the
distance we need to push the tail of the log. An interface that
tells us "clean this percentage of dirty metadata" is much more
useful because we can map that easily to a log sequence number
based push target....

> > IOWs, treating metadata like it's one great big data inode doesn't
> > seem to me to be the right abstraction to use for this - in most
> > filesystems it's a bunch of objects with a complex dependency tree
> > and unknown write ordering, not an inode full of data that can be
> > sequentially written.
> > 
> > Maybe we need multiple ops with well defined behaviours. e.g.
> > ->writeback_metadata() for background writeback, ->sync_metadata() for
> > sync based operations. That way different filesystems can ignore the
> > parts they don't need simply by not implementing those operations,
> > and the writeback code doesn't need to try to cater for all
> > operations through the one op. The writeback code should be cleaner,
> > the filesystem code should be cleaner, and we can tailor the work
> > guidelines for each operation separately so there's less mismatch
> > between what writeback is asking and how filesystems track dirty
> > metadata...
> 
> I agree that writeback for memory cleaning and writeback for data integrity
> are two very different things especially for metadata. In fact for data
> integrity writeback we already have ->sync_fs operation so there the
> functionality gets duplicated. What we could do is that in
> writeback_sb_inodes() we'd call ->write_metadata only when
> work->for_kupdate or work->for_background is set. That way ->write_metadata
> would be called only for memory cleaning purposes.

That makes sense, but I still think we need a better indication of
how much writeback we need to do than just "writeback this chunk of
pages". That "writeback a chunk" interface is necessary to share
writeback bandwidth across numerous data inodes so that we don't
starve any one inode of writeback bandwidth. That's unnecessary for
metadata writeback on a superblock - we don't need to share that
bandwidth around hundreds or thousands of inodes. What we actually
need to know is how much writeback we need to do as a total of all
the dirty metadata on the superblock.

Sure, that's not ideal for btrfs and maybe ext4, but we can write a
simple generic helper that converts "flush X percent of dirty
metadata" to a page/byte chunk as the current code does. Doing it
this way allows filesystems to completely internalise the accounting
that needs to be done, rather than trying to hack around a
writeback accounting interface with large impedance mismatches to
how the filesystem accounts for dirty metadata and/or tracks
writeback progress.

Cheers,

Dave.
Jan Kara Dec. 20, 2017, 2:30 p.m. UTC | #8
On Wed 20-12-17 08:35:05, Dave Chinner wrote:
> On Tue, Dec 19, 2017 at 01:07:09PM +0100, Jan Kara wrote:
> > On Wed 13-12-17 09:20:04, Dave Chinner wrote:
> > > On Tue, Dec 12, 2017 at 01:05:35PM -0500, Josef Bacik wrote:
> > > > On Tue, Dec 12, 2017 at 10:36:19AM +1100, Dave Chinner wrote:
> > > > > On Mon, Dec 11, 2017 at 04:55:31PM -0500, Josef Bacik wrote:
> > > > This is just one of those things that's going to be slightly shitty.  It's the
> > > > same for memory reclaim, all of those places use pages so we just take
> > > > METADATA_*_BYTES >> PAGE_SHIFT to get pages and figure it's close enough.
> > > 
> > > Ok, so that isn't exactly easy to deal with, because all our
> > > metadata writeback is based on log sequence number targets (i.e. how
> > > far to push the tail of the log towards the current head). We've
> > > actually got no idea how pages/bytes actually map to a LSN target
> > > because while we might account a full buffer as dirty for memory
> > > reclaim purposes (up to 64k in size), we might have only logged 128
> > > bytes of it.
> > > 
> > > i.e. if we are asked to push 2MB of metadata and we treat that as
> > > 2MB of log space (i.e. push target of tail LSN + 2MB) we could have
> > > logged several tens of megabytes of dirty metadata in that LSN
> > > range and have to flush it all. OTOH, if the buffers are fully
> > > logged, then that same target might only flush 1.5MB of metadata
> > > once all the log overhead is taken into account.
> > > 
> > > So there's a fairly large disconnect between the "flush N bytes of
> > > metadata" API and the "push to a target LSN" that XFS uses for
> > > flushing metadata in aged order. I'm betting that extN and other
> > > filesystems might have similar mismatches with their journal
> > > flushing...
> > 
> > Well, for ext4 it isn't as bad since we do full block logging only. So if
> > we are asked to flush N pages, we can easily translate that to number of fs
> > blocks and flush that many from the oldest transaction.
> > 
> > Couldn't XFS just track how much it has cleaned (from reclaim perspective)
> > when pushing items from AIL (which is what I suppose XFS would do in
> > response to metadata writeback request) and just stop pushing when it has
> > cleaned as much as it was asked to?
> 
> If only it were that simple :/
> 
> To start with, flushing the dirty objects (such as inodes) to their
> backing buffers does not mean that the object is clean once the
> writeback completes. XFS has decoupled in-memory objects with
> logical object logging rather than logging physical buffers, and
> so can be modified and dirtied while the inode buffer
> is being written back. Hence if we just count things like "buffer
> size written" it's not actually a correct account of the amount of
> dirty metadata we've cleaned. If we don't get that right, it'll
> result in accounting errors and incorrect behaviour.
> 
> The bigger problem, however, is that we have no channel to return
> flush information from the AIL pushing to whatever caller asked for
> the push. Pushing metadata is completely decoupled from every other
> subsystem. i.e. the caller asked the xfsaild to push to a specific
> LSN (e.g. to free up a certain amount of log space for new
> transactions), and *nothing* has any idea of how much metadata we'll
> need to write to push the tail of the log to that LSN.
> 
> It's also completely asynchronous - there's no mechanism for waiting
> on a push to a specific LSN. Anything that needs a specific amount
> of log space to be available waits in ordered ticket queues on the
> log tail moving forwards. The only interfaces that have access to
> the log tail ticket waiting is the transaction reservation
> subsystem, which cannot be used during metadata writeback because
> that's a guaranteed deadlock vector....
> 
> Saying "just account for bytes written" assumes directly connected,
> synchronous dispatch metadata writeback infrastructure which we
> simply don't have in XFS. "just clean this many bytes" doesn't
> really fit at all because we have no way of referencing that to the
> distance we need to push the tail of the log. An interface that
> tells us "clean this percentage of dirty metadata" is much more
> useful because we can map that easily to a log sequence number
> based push target....

OK, understood.

> > > IOWs, treating metadata like it's one great big data inode doesn't
> > > seem to me to be the right abstraction to use for this - in most
> > > filesystems it's a bunch of objects with a complex dependency tree
> > > and unknown write ordering, not an inode full of data that can be
> > > sequentially written.
> > > 
> > > Maybe we need multiple ops with well defined behaviours. e.g.
> > > ->writeback_metadata() for background writeback, ->sync_metadata() for
> > > sync based operations. That way different filesystems can ignore the
> > > parts they don't need simply by not implementing those operations,
> > > and the writeback code doesn't need to try to cater for all
> > > operations through the one op. The writeback code should be cleaner,
> > > the filesystem code should be cleaner, and we can tailor the work
> > > guidelines for each operation separately so there's less mismatch
> > > between what writeback is asking and how filesystems track dirty
> > > metadata...
> > 
> > I agree that writeback for memory cleaning and writeback for data integrity
> > are two very different things especially for metadata. In fact for data
> > integrity writeback we already have ->sync_fs operation so there the
> > functionality gets duplicated. What we could do is that in
> > writeback_sb_inodes() we'd call ->write_metadata only when
> > work->for_kupdate or work->for_background is set. That way ->write_metadata
> > would be called only for memory cleaning purposes.
> 
> That makes sense, but I still think we need a better indication of
> how much writeback we need to do than just "writeback this chunk of
> pages". That "writeback a chunk" interface is necessary to share
> writeback bandwidth across numerous data inodes so that we don't
> starve any one inode of writeback bandwidth. That's unnecessary for
> metadata writeback on a superblock - we don't need to share that
> bandwidth around hundreds or thousands of inodes. What we actually
> need to know is how much writeback we need to do as a total of all
> the dirty metadata on the superblock.
> 
> Sure, that's not ideal for btrfs and maybe ext4, but we can write a
> simple generic helper that converts "flush X percent of dirty
> metadata" to a page/byte chunk as the current code does. Doing it
> this way allows filesystems to completely internalise the accounting
> that needs to be done, rather than trying to hack around a
> writeback accounting interface with large impedance mismatches to
> how the filesystem accounts for dirty metadata and/or tracks
> writeback progress.

Let me think loud on how we could tie this into how memory cleaning
writeback currently works - the one with for_background == 1 which is
generally used to get amount of dirty pages in the system under control.
We have a queue of inodes to write, we iterate over this queue and ask each
inode to write some amount (e.g. 64 M - exact amount depends on measured
writeback bandwidth etc.). Some amount from that inode gets written and we
continue with the next inode in the queue (put this one at the end of the
queue if it still has dirty pages). We do this until:

a) the number of dirty pages in the system is below background dirty limit
   and the number dirty pages for this device is below background dirty
   limit for this device.
b) run out of dirty inodes on this device
c) someone queues different type of writeback

And we need to somehow incorporate metadata writeback into this loop. I see
two questions here:

1) When / how often should we ask for metadata writeback?
2) How much to ask to write in one go?

The second question is especially tricky in the presence of completely
async metadata flushing in XFS - we can ask to write say half of dirty
metadata but then we have no idea whether the next observation of dirty
metadata counters is with that part of metadata already under writeback /
cleaned or whether xfsaild didn't even start working and pushing more has
no sense. Partly, this could be dealt with by telling the filesystem
"metadata dirty target" - i.e. "get your dirty metadata counters below X"
- and whether we communicate that in bytes, pages, or a fraction of
current dirty metadata counter value is a detail I don't have a strong
opinion on now. And the fact is the amount written by the filesystem
doesn't have to be very accurate anyway - we basically just want to make
some forward progress with writing metadata, don't want that to take too
long (so that other writeback from the thread isn't stalled), and if
writeback code is unhappy about the state of counters next time it looks,
it will ask the filesystem again...

This gets me directly to another problem with async nature of XFS metadata
writeback. That is that it could get writeback thread into busyloop - we
are supposed to terminate memory cleaning writeback only once dirty
counters are below limit and in case dirty metadata is causing counters to
be over limit, we would just ask in a loop XFS to get metadata below the
target. I suppose XFS could just return "nothing written" from its
->write_metadata operation and in such case we could sleep a bit before
going for another writeback loop (the same thing happens when filesystem
reports all inodes are locked / busy and it cannot writeback anything). But
it's getting a bit ugly and is it really better than somehow waiting inside
XFS for metadata writeback to occur?  Any idea Dave?

Regarding question 1). What Josef does is that once we went through all
queued inodes and wrote some amount from each one, we'd go and ask fs to
write some metadata. And then we'll again go to write inodes that are still
dirty. That is somewhat rough but I guess it is fine for now.

								Honza
Josef Bacik Jan. 2, 2018, 4:13 p.m. UTC | #9
On Wed, Dec 20, 2017 at 03:30:55PM +0100, Jan Kara wrote:
> On Wed 20-12-17 08:35:05, Dave Chinner wrote:
> > On Tue, Dec 19, 2017 at 01:07:09PM +0100, Jan Kara wrote:
> > > On Wed 13-12-17 09:20:04, Dave Chinner wrote:
> > > > On Tue, Dec 12, 2017 at 01:05:35PM -0500, Josef Bacik wrote:
> > > > > On Tue, Dec 12, 2017 at 10:36:19AM +1100, Dave Chinner wrote:
> > > > > > On Mon, Dec 11, 2017 at 04:55:31PM -0500, Josef Bacik wrote:
> > > > > This is just one of those things that's going to be slightly shitty.  It's the
> > > > > same for memory reclaim, all of those places use pages so we just take
> > > > > METADATA_*_BYTES >> PAGE_SHIFT to get pages and figure it's close enough.
> > > > 
> > > > Ok, so that isn't exactly easy to deal with, because all our
> > > > metadata writeback is based on log sequence number targets (i.e. how
> > > > far to push the tail of the log towards the current head). We've
> > > > actually got no idea how pages/bytes actually map to a LSN target
> > > > because while we might account a full buffer as dirty for memory
> > > > reclaim purposes (up to 64k in size), we might have only logged 128
> > > > bytes of it.
> > > > 
> > > > i.e. if we are asked to push 2MB of metadata and we treat that as
> > > > 2MB of log space (i.e. push target of tail LSN + 2MB) we could have
> > > > logged several tens of megabytes of dirty metadata in that LSN
> > > > range and have to flush it all. OTOH, if the buffers are fully
> > > > logged, then that same target might only flush 1.5MB of metadata
> > > > once all the log overhead is taken into account.
> > > > 
> > > > So there's a fairly large disconnect between the "flush N bytes of
> > > > metadata" API and the "push to a target LSN" that XFS uses for
> > > > flushing metadata in aged order. I'm betting that extN and other
> > > > filesystems might have similar mismatches with their journal
> > > > flushing...
> > > 
> > > Well, for ext4 it isn't as bad since we do full block logging only. So if
> > > we are asked to flush N pages, we can easily translate that to number of fs
> > > blocks and flush that many from the oldest transaction.
> > > 
> > > Couldn't XFS just track how much it has cleaned (from reclaim perspective)
> > > when pushing items from AIL (which is what I suppose XFS would do in
> > > response to metadata writeback request) and just stop pushing when it has
> > > cleaned as much as it was asked to?
> > 
> > If only it were that simple :/
> > 
> > To start with, flushing the dirty objects (such as inodes) to their
> > backing buffers does not mean that the object is clean once the
> > writeback completes. XFS has decoupled in-memory objects with
> > logical object logging rather than logging physical buffers, and
> > so can be modified and dirtied while the inode buffer
> > is being written back. Hence if we just count things like "buffer
> > size written" it's not actually a correct account of the amount of
> > dirty metadata we've cleaned. If we don't get that right, it'll
> > result in accounting errors and incorrect behaviour.
> > 
> > The bigger problem, however, is that we have no channel to return
> > flush information from the AIL pushing to whatever caller asked for
> > the push. Pushing metadata is completely decoupled from every other
> > subsystem. i.e. the caller asked the xfsaild to push to a specific
> > LSN (e.g. to free up a certain amount of log space for new
> > transactions), and *nothing* has any idea of how much metadata we'll
> > need to write to push the tail of the log to that LSN.
> > 
> > It's also completely asynchronous - there's no mechanism for waiting
> > on a push to a specific LSN. Anything that needs a specific amount
> > of log space to be available waits in ordered ticket queues on the
> > log tail moving forwards. The only interfaces that have access to
> > the log tail ticket waiting is the transaction reservation
> > subsystem, which cannot be used during metadata writeback because
> > that's a guaranteed deadlock vector....
> > 
> > Saying "just account for bytes written" assumes directly connected,
> > synchronous dispatch metadata writeback infrastructure which we
> > simply don't have in XFS. "just clean this many bytes" doesn't
> > really fit at all because we have no way of referencing that to the
> > distance we need to push the tail of the log. An interface that
> > tells us "clean this percentage of dirty metadata" is much more
> > useful because we can map that easily to a log sequence number
> > based push target....
> 
> OK, understood.
> 
> > > > IOWs, treating metadata like it's one great big data inode doesn't
> > > > seem to me to be the right abstraction to use for this - in most
> > > > fileystems it's a bunch of objects with a complex dependency tree
> > > > and unknown write ordering, not an inode full of data that can be
> > > > sequentially written.
> > > > 
> > > > Maybe we need multiple ops with well defined behaviours. e.g.
> > > > ->writeback_metadata() for background writeback, ->sync_metadata() for
> > > > sync based operations. That way different filesystems can ignore the
> > > > parts they don't need simply by not implementing those operations,
> > > > and the writeback code doesn't need to try to cater for all
> > > > operations through the one op. The writeback code should be cleaner,
> > > > the filesystem code should be cleaner, and we can tailor the work
> > > > guidelines for each operation separately so there's less mismatch
> > > > between what writeback is asking and how filesystems track dirty
> > > > metadata...
> > > 
> > > I agree that writeback for memory cleaning and writeback for data integrity
> > > are two very different things especially for metadata. In fact for data
> > > integrity writeback we already have ->sync_fs operation so there the
> > > functionality gets duplicated. What we could do is that in
> > > writeback_sb_inodes() we'd call ->write_metadata only when
> > > work->for_kupdate or work->for_background is set. That way ->write_metadata
> > > would be called only for memory cleaning purposes.
> > 
> > That makes sense, but I still think we need a better indication of
> > how much writeback we need to do than just "writeback this chunk of
> > pages". That "writeback a chunk" interface is necessary to share
> > writeback bandwidth across numerous data inodes so that we don't
> > starve any one inode of writeback bandwidth. That's unnecessary for
> > metadata writeback on a superblock - we don't need to share that
> > bandwidth around hundreds or thousands of inodes. What we actually
> > need to know is how much writeback we need to do as a total of all
> > the dirty metadata on the superblock.
> > 
> > Sure, that's not ideal for btrfs and maybe ext4, but we can write a
> > simple generic helper that converts "flush X percent of dirty
> > metadata" to a page/byte chunk as the current code does. Doing it
> > this way allows filesystems to completely internalise the accounting
> > that needs to be done, rather than trying to hack around a
> > writeback accounting interface with large impedance mismatches to
> > how the filesystem accounts for dirty metadata and/or tracks
> > writeback progress.
> 
> Let me think loud on how we could tie this into how memory cleaning
> writeback currently works - the one with for_background == 1 which is
> generally used to get amount of dirty pages in the system under control.
> We have a queue of inodes to write, we iterate over this queue and ask each
> inode to write some amount (e.g. 64 M - exact amount depends on measured
> writeback bandwidth etc.). Some amount from that inode gets written and we
> continue with the next inode in the queue (put this one at the end of the
> queue if it still has dirty pages). We do this until:
> 
> a) the number of dirty pages in the system is below background dirty limit
> >    and the number of dirty pages for this device is below background dirty
>    limit for this device.
> b) run out of dirty inodes on this device
> c) someone queues different type of writeback
> 
> And we need to somehow incorporate metadata writeback into this loop. I see
> two questions here:
> 
> 1) When / how often should we ask for metadata writeback?
> 2) How much to ask to write in one go?
> 
> The second question is especially tricky in the presence of completely
> async metadata flushing in XFS - we can ask to write say half of dirty
> metadata but then we have no idea whether the next observation of dirty
> metadata counters is with that part of metadata already under writeback /
> cleaned or whether xfsaild didn't even start working and pushing more has
> no sense. Partly, this could be dealt with by telling the filesystem
> "metadata dirty target" - i.e. "get your dirty metadata counters below X"
> - and whether we communicate that in bytes, pages, or a fraction of
> current dirty metadata counter value is a detail I don't have a strong
> opinion on now. And the fact is the amount written by the filesystem
> doesn't have to be very accurate anyway - we basically just want to make
> some forward progress with writing metadata, don't want that to take too
> long (so that other writeback from the thread isn't stalled), and if
> writeback code is unhappy about the state of counters next time it looks,
> it will ask the filesystem again...
> 
> This gets me directly to another problem with async nature of XFS metadata
> writeback. That is that it could get writeback thread into busyloop - we
> are supposed to terminate memory cleaning writeback only once dirty
> counters are below limit and in case dirty metadata is causing counters to
> be over limit, we would just ask in a loop XFS to get metadata below the
> target. I suppose XFS could just return "nothing written" from its
> ->write_metadata operation and in such case we could sleep a bit before
> going for another writeback loop (the same thing happens when filesystem
> reports all inodes are locked / busy and it cannot writeback anything). But
> it's getting a bit ugly and is it really better than somehow waiting inside
> XFS for metadata writeback to occur?  Any idea Dave?
> 
> Regarding question 1). What Josef does is that once we went through all
> queued inodes and wrote some amount from each one, we'd go and ask fs to
> write some metadata. And then we'll again go to write inodes that are still
> dirty. That is somewhat rough but I guess it is fine for now.
> 

Alright I'm back from vacation so am sufficiently hungover to try and figure
this out.  Btrfs and ext4 account their dirty metadata directly and reclaim it
like inodes, xfs doesn't.  Btrfs does do something similar to what xfs does with
delayed updates, but we just use the enospc logic to trigger when to update the
metadata blocks, and then those just get written out via the dirty balancing
stuff.  Since xfs doesn't have a direct way to tie that together, you'd rather
we'd have some sort of ratio so you know you need to flush dirty inodes, correct
Dave?

I don't think this is solvable for xfs.  The whole vm is around pages/bytes.
The only place we have this ratio thing is in slab reclaim, and we only have to
worry about actual memory pressure there because we have a nice external
trigger, we're out of pages.

For dirty throttling we have to know how much we're pushing and how much we need
to push, and that _requires_ bytes/pages.  And not like "we can only send you
bytes/pages to reclaim" but like the throttling stuff has all of its accounting
in bytes/pages, so putting in arbitrary object counts into this logic is not
going to be straightforward.  The system administrator sets their dirty limits
to absolute numbers or % of total memory.  If xfs can't account for its metadata
this way then I don't think it can use any sort of infrastructure we provide in
the current framework.  We'd have to completely overhaul the dirty throttling
stuff for it to work, and even then we still need bandwidth of the device which
means how many bytes we're writing out.

I have an alternative proposal.  We keep these patches the way they are and use
it for btrfs and ext4 since our actual metadata pool is tracked similarly to
inodes.

Then to accommodate xfs (and btrfs and ext4) we have a separate throttling system
that is fs object based.  I know xfs and btrfs bypass the mark_inode_dirty()
stuff, but there's no reason we couldn't strap some logic in there to account
for how many dirty objects we have lying around.  Then we need to come up with a
way we want to limit the objects.  I feel like this part is going to be very fs
specific, btrfs will have its enospc checks, ext4 and xfs will have log space
checks.  We add this into balance_dirty_pages(), so we benefit from the
per-process rate limiting, and we just have super->balance_metadata() and inside
balance_metadata() the fs takes into account the current dirty objects usage and
how much space we have left and then does its job for throttling stuff.  If
there's plenty of space or whatever it's just a no-op and returns.

This would couple nicely with what I've done in these patches, as the
balance_metadata() would simply move our in-memory updates into the buffers
themselves, making them dirty.  Then the actually dirty metadata block stuff
could see if it's time to do writeout.

And this would make xfs so it doesn't do writeout from the slab reclamation
path, which Facebook constantly has to patch out because it just makes xfs
unusable in our environments.

From here we could then extend the fs object dirty balancing to have more
generic logic for when we need to flush, but I feel like that's going to be a
much larger project than just providing a callback.  This doesn't get us
anything super new or fancy, but could even be used in the memory reclaim path
in a less critical area to flush dirty metadata rather than the slab reclaim
path that xfs currently uses.  And then we can build more complicated things
from there.  What do you think of this?

Josef
Dave Chinner Jan. 3, 2018, 2:32 a.m. UTC | #10
On Tue, Jan 02, 2018 at 11:13:06AM -0500, Josef Bacik wrote:
> On Wed, Dec 20, 2017 at 03:30:55PM +0100, Jan Kara wrote:
> > On Wed 20-12-17 08:35:05, Dave Chinner wrote:
> > > On Tue, Dec 19, 2017 at 01:07:09PM +0100, Jan Kara wrote:
> > > > On Wed 13-12-17 09:20:04, Dave Chinner wrote:
> > > > > IOWs, treating metadata like it's one great big data inode doesn't
> > > > > seem to me to be the right abstraction to use for this - in most
> > > > > fileystems it's a bunch of objects with a complex dependency tree
> > > > > and unknown write ordering, not an inode full of data that can be
> > > > > sequentially written.
> > > > > 
> > > > > Maybe we need multiple ops with well defined behaviours. e.g.
> > > > > ->writeback_metadata() for background writeback, ->sync_metadata() for
> > > > > sync based operations. That way different filesystems can ignore the
> > > > > parts they don't need simply by not implementing those operations,
> > > > > and the writeback code doesn't need to try to cater for all
> > > > > operations through the one op. The writeback code should be cleaner,
> > > > > the filesystem code should be cleaner, and we can tailor the work
> > > > > guidelines for each operation separately so there's less mismatch
> > > > > between what writeback is asking and how filesystems track dirty
> > > > > metadata...
> > > > 
> > > > I agree that writeback for memory cleaning and writeback for data integrity
> > > > are two very different things especially for metadata. In fact for data
> > > > integrity writeback we already have ->sync_fs operation so there the
> > > > functionality gets duplicated. What we could do is that in
> > > > writeback_sb_inodes() we'd call ->write_metadata only when
> > > > work->for_kupdate or work->for_background is set. That way ->write_metadata
> > > > would be called only for memory cleaning purposes.
> > > 
> > > That makes sense, but I still think we need a better indication of
> > > how much writeback we need to do than just "writeback this chunk of
> > > pages". That "writeback a chunk" interface is necessary to share
> > > writeback bandwidth across numerous data inodes so that we don't
> > > starve any one inode of writeback bandwidth. That's unnecessary for
> > > metadata writeback on a superblock - we don't need to share that
> > > bandwidth around hundreds or thousands of inodes. What we actually
> > > need to know is how much writeback we need to do as a total of all
> > > the dirty metadata on the superblock.
> > > 
> > > Sure, that's not ideal for btrfs and maybe ext4, but we can write a
> > > simple generic helper that converts "flush X percent of dirty
> > > metadata" to a page/byte chunk as the current code does. Doing it
> > > this way allows filesystems to completely internalise the accounting
> > > that needs to be done, rather than trying to hack around a
> > > writeback accounting interface with large impedance mismatches to
> > > how the filesystem accounts for dirty metadata and/or tracks
> > > writeback progress.
> > 
> > Let me think loud on how we could tie this into how memory cleaning
> > writeback currently works - the one with for_background == 1 which is
> > generally used to get amount of dirty pages in the system under control.
> > We have a queue of inodes to write, we iterate over this queue and ask each
> > inode to write some amount (e.g. 64 M - exact amount depends on measured

It's a maximum of 1024 pages per inode.

> > writeback bandwidth etc.). Some amount from that inode gets written and we
> > continue with the next inode in the queue (put this one at the end of the
> > queue if it still has dirty pages). We do this until:
> > 
> > a) the number of dirty pages in the system is below background dirty limit
> >    and the number of dirty pages for this device is below background dirty
> >    limit for this device.
> > b) run out of dirty inodes on this device
> > c) someone queues different type of writeback
> > 
> > And we need to somehow incorporate metadata writeback into this loop. I see
> > two questions here:
> > 
> > 1) When / how often should we ask for metadata writeback?
> > 2) How much to ask to write in one go?
> > 
> > The second question is especially tricky in the presence of completely
> > async metadata flushing in XFS - we can ask to write say half of dirty
> > metadata but then we have no idea whether the next observation of dirty
> > metadata counters is with that part of metadata already under writeback /
> > cleaned or whether xfsaild didn't even start working and pushing more has
> > no sense.

Well, like with ext4, we've also got to consider that a bunch of the
recently dirtied metadata (e.g. from delalloc, EOF updates on IO
completion, etc) is still pinned in memory because the
journal has not been flushed/checkpointed. Hence we should not be
attempting to write back metadata we've dirtied as a result of
writing data in the background writeback loop.

That greatly simplifies what we need to consider here. That is, we
just need to sample the ratio of dirty metadata to clean metadata
before we start data writeback, and we calculate the amount of
metadata writeback we should trigger from there. We only need to
do this *once* per background writeback scan for a superblock
as there is no need for sharing bandwidth between lots of data
inodes - there's only one metadata inode for ext4/btrfs, and XFS is
completely async....

> > Partly, this could be dealt with by telling the filesystem
> > "metadata dirty target" - i.e. "get your dirty metadata counters below X"
> > - and whether we communicate that in bytes, pages, or a fraction of
> > current dirty metadata counter value is a detail I don't have a strong
> > opinion on now. And the fact is the amount written by the filesystem
> > doesn't have to be very accurate anyway - we basically just want to make
> > some forward progress with writing metadata, don't want that to take too
> > long (so that other writeback from the thread isn't stalled), and if
> > writeback code is unhappy about the state of counters next time it looks,
> > it will ask the filesystem again...

Right. The problem is communicating "how much" to the filesystem in
a useful manner....

> > This gets me directly to another problem with async nature of XFS metadata
> > writeback. That is that it could get writeback thread into busyloop - we
> > are supposed to terminate memory cleaning writeback only once dirty
> > counters are below limit and in case dirty metadata is causing counters to
> > be over limit, we would just ask in a loop XFS to get metadata below the
> > target. I suppose XFS could just return "nothing written" from its
> > ->write_metadata operation and in such case we could sleep a bit before
> > going for another writeback loop (the same thing happens when filesystem
> > reports all inodes are locked / busy and it cannot writeback anything). But
> > it's getting a bit ugly and is it really better than somehow waiting inside
> > XFS for metadata writeback to occur?  Any idea Dave?

I tend to think that the whole point of background writeback is to
do it asynchronously and keep the IO pipe full by avoiding blocking
on any specific object. i.e. if we can't do writeback from this
object, then skip it and do it from the next....

I think we could probably block ->write_metadata if necessary via a
completion/wakeup style notification when a specific LSN is reached
by the log tail, but realistically if there's any amount of data
needing to be written it'll throttle data writes because the IO
pipeline is being kept full by background metadata writes....

> > Regarding question 1). What Josef does is that once we went through all
> > queued inodes and wrote some amount from each one, we'd go and ask fs to
> > write some metadata. And then we'll again go to write inodes that are still
> > dirty. That is somewhat rough but I guess it is fine for now.
> > 
> 
> Alright I'm back from vacation so am sufficiently hungover to try and figure
> this out.  Btrfs and ext4 account their dirty metadata directly and reclaim it
> like inodes, xfs doesn't.

Terminology: "reclaim" is not what we do when accounting for
writeback IO completion.

And we've already been through the accounting side of things - we
can add that to XFS once it's converted to byte-based accounting.

> Btrfs does do something similar to what xfs does with
> delayed updates, but we just use the enospc logic to trigger when to update the
> metadata blocks, and then those just get written out via the dirty balancing
> stuff.  Since xfs doesn't have a direct way to tie that together, you'd rather
> we'd have some sort of ratio so you know you need to flush dirty inodes, correct
> Dave?

Again, terminology: We don't "need to flush dirty inodes" in XFS,
we need to flush /metadata objects/.

> I don't think this is solvable for xfs.  The whole vm is around pages/bytes.
> The only place we have this ratio thing is in slab reclaim, and we only have to
> worry about actual memory pressure there because we have a nice external
> trigger, we're out of pages.

We don't need all of the complexity of slab reclaim, though. That's
a complete red herring.

All that is needed is for the writeback API to tell us "flush X% of
your dirty metadata".  We will have cached data and metadata in
bytes and dirty cached data and metadata in bytes at the generic
writeback level - it's not at all difficult to turn that into a
flush ratio. e.g. take the amount we are over the dirty metadata
background threshold, request writeback for that amount of metadata
as a percentage of the overall dirty metadata.

> For dirty throttling we have to know how much we're pushing and how much we need
> to push, and that _requires_ bytes/pages.

Dirty throttling does not need to know how much work you've asked
the filesystem to do. It does its own accounting of bytes/pages
being cleaned based on the accounting updates from the filesystem
metadata object IO completion routines. That is what needs to be in
bytes/pages for dirty throttling to work.

> And not like "we can only send you
> bytes/pages to reclaim" but like the throttling stuff has all of its accounting
> in bytes/pages, so putting in arbitrary object counts into this logic is not
> going to be straightforward.  The system administrator sets their dirty limits
> to absolute numbers or % of total memory.  If xfs can't account for its metadata
> this way then I don't think it can use any sort of infrastructure we provide in
> the current framework.

XFS will account for clean/dirty metadata in bytes, just like btrfs
and ext4 will do. We've already been over this and *solved that
problem*.

But really, though, I'm fed up with having to fight time and time
again over simple changes to core infrastructure that make it
generic rather than specifically tailored to the filesystem that
wants it first.  Merge whatever crap you need for btrfs and I'll
make it work for XFS later and leave what gets fed to btrfs
completely unchanged.

-Dave.
Jan Kara Jan. 3, 2018, 1:59 p.m. UTC | #11
On Wed 03-01-18 13:32:19, Dave Chinner wrote:
> On Tue, Jan 02, 2018 at 11:13:06AM -0500, Josef Bacik wrote:
> > On Wed, Dec 20, 2017 at 03:30:55PM +0100, Jan Kara wrote:
> > > On Wed 20-12-17 08:35:05, Dave Chinner wrote:
> > > > On Tue, Dec 19, 2017 at 01:07:09PM +0100, Jan Kara wrote:
> > > > > On Wed 13-12-17 09:20:04, Dave Chinner wrote:
> > > > > > IOWs, treating metadata like it's one great big data inode doesn't
> > > > > > seem to me to be the right abstraction to use for this - in most
> > > > > > fileystems it's a bunch of objects with a complex dependency tree
> > > > > > and unknown write ordering, not an inode full of data that can be
> > > > > > sequentially written.
> > > > > > 
> > > > > > Maybe we need multiple ops with well defined behaviours. e.g.
> > > > > > ->writeback_metadata() for background writeback, ->sync_metadata() for
> > > > > > sync based operations. That way different filesystems can ignore the
> > > > > > parts they don't need simply by not implementing those operations,
> > > > > > and the writeback code doesn't need to try to cater for all
> > > > > > operations through the one op. The writeback code should be cleaner,
> > > > > > the filesystem code should be cleaner, and we can tailor the work
> > > > > > guidelines for each operation separately so there's less mismatch
> > > > > > between what writeback is asking and how filesystems track dirty
> > > > > > metadata...
> > > > > 
> > > > > I agree that writeback for memory cleaning and writeback for data integrity
> > > > > are two very different things especially for metadata. In fact for data
> > > > > integrity writeback we already have ->sync_fs operation so there the
> > > > > functionality gets duplicated. What we could do is that in
> > > > > writeback_sb_inodes() we'd call ->write_metadata only when
> > > > > work->for_kupdate or work->for_background is set. That way ->write_metadata
> > > > > would be called only for memory cleaning purposes.
> > > > 
> > > > That makes sense, but I still think we need a better indication of
> > > > how much writeback we need to do than just "writeback this chunk of
> > > > pages". That "writeback a chunk" interface is necessary to share
> > > > writeback bandwidth across numerous data inodes so that we don't
> > > > starve any one inode of writeback bandwidth. That's unnecessary for
> > > > metadata writeback on a superblock - we don't need to share that
> > > > bandwidth around hundreds or thousands of inodes. What we actually
> > > > need to know is how much writeback we need to do as a total of all
> > > > the dirty metadata on the superblock.
> > > > 
> > > > Sure, that's not ideal for btrfs and maybe ext4, but we can write a
> > > > simple generic helper that converts "flush X percent of dirty
> > > > metadata" to a page/byte chunk as the current code does. Doing it
> > > > this way allows filesystems to completely internalise the accounting
> > > > that needs to be done, rather than trying to hack around a
> > > > writeback accounting interface with large impedance mismatches to
> > > > how the filesystem accounts for dirty metadata and/or tracks
> > > > writeback progress.
> > > 
> > > Let me think loud on how we could tie this into how memory cleaning
> > > writeback currently works - the one with for_background == 1 which is
> > > generally used to get amount of dirty pages in the system under control.
> > > We have a queue of inodes to write, we iterate over this queue and ask each
> > > inode to write some amount (e.g. 64 M - exact amount depends on measured
> 
> It's a maximum of 1024 pages per inode.

That's actually a minimum, not maximum, if I read the code in
writeback_chunk_size() right.

> > > writeback bandwidth etc.). Some amount from that inode gets written and we
> > > continue with the next inode in the queue (put this one at the end of the
> > > queue if it still has dirty pages). We do this until:
> > > 
> > > a) the number of dirty pages in the system is below background dirty limit
> > >    and the number of dirty pages for this device is below background dirty
> > >    limit for this device.
> > > b) run out of dirty inodes on this device
> > > c) someone queues different type of writeback
> > > 
> > > And we need to somehow incorporate metadata writeback into this loop. I see
> > > two questions here:
> > > 
> > > 1) When / how often should we ask for metadata writeback?
> > > 2) How much to ask to write in one go?
> > > 
> > > The second question is especially tricky in the presence of completely
> > > async metadata flushing in XFS - we can ask to write say half of dirty
> > > metadata but then we have no idea whether the next observation of dirty
> > > metadata counters is with that part of metadata already under writeback /
> > > cleaned or whether xfsaild didn't even start working and pushing more has
> > > no sense.
> 
> Well, like with ext4, we've also got to consider that a bunch of the
> recently dirtied metadata (e.g. from delalloc, EOF updates on IO
> completion, etc) is still pinned in memory because the
> journal has not been flushed/checkpointed. Hence we should not be
> attempting to write back metadata we've dirtied as a result of
> writing data in the background writeback loop.

Agreed. Actually for ext4 I would not expose 'pinned' buffers as dirty to
VM - the journalling layer currently already works that way and it works
well for us. But that's just a small technical detail and different
filesystems can decide differently.

> That greatly simplifies what we need to consider here. That is, we
> just need to sample the ratio of dirty metadata to clean metadata
> before we start data writeback, and we calculate the amount of
> metadata writeback we should trigger from there. We only need to
> do this *once* per background writeback scan for a superblock
> as there is no need for sharing bandwidth between lots of data
> inodes - there's only one metadata inode for ext4/btrfs, and XFS is
> completely async....

OK, agreed again.

> > > Partly, this could be dealt with by telling the filesystem
> > > "metadata dirty target" - i.e. "get your dirty metadata counters below X"
> > > - and whether we communicate that in bytes, pages, or a fraction of
> > > current dirty metadata counter value is a detail I don't have a strong
> > > opinion on now. And the fact is the amount written by the filesystem
> > > doesn't have to be very accurate anyway - we basically just want to make
> > > some forward progress with writing metadata, don't want that to take too
> > > long (so that other writeback from the thread isn't stalled), and if
> > > writeback code is unhappy about the state of counters next time it looks,
> > > it will ask the filesystem again...
> 
> Right. The problem is communicating "how much" to the filesystem in
> a useful manner....

Yep. I'm fine with communication in the form of 'write X% of your dirty
metadata'. That should be useful for XFS and as you mentioned in some
previous email, we can provide a helper function to compute number of pages
to write (including some reasonable upper limit to bound time spent in one
->write_metadata invocation) for ext4 and btrfs.

> > > This gets me directly to another problem with async nature of XFS metadata
> > > writeback. That is that it could get writeback thread into busyloop - we
> > > are supposed to terminate memory cleaning writeback only once dirty
> > > counters are below limit and in case dirty metadata is causing counters to
> > > be over limit, we would just ask in a loop XFS to get metadata below the
> > > target. I suppose XFS could just return "nothing written" from its
> > > ->write_metadata operation and in such case we could sleep a bit before
> > > going for another writeback loop (the same thing happens when filesystem
> > > reports all inodes are locked / busy and it cannot writeback anything). But
> > > it's getting a bit ugly and is it really better than somehow waiting inside
> > > XFS for metadata writeback to occur?  Any idea Dave?
> 
> I tend to think that the whole point of background writeback is to
> do it asynchronously and keep the IO pipe full by avoiding blocking
> on any specific object. i.e. if we can't do writeback from this
> object, then skip it and do it from the next....

Agreed.

> I think we could probably block ->write_metadata if necessary via a
> completion/wakeup style notification when a specific LSN is reached
> by the log tail, but realistically if there's any amount of data
> needing to be written it'll throttle data writes because the IO
> pipeline is being kept full by background metadata writes....

So the problem I'm concerned about is a corner case. Consider a situation
when you have no dirty data, only dirty metadata but enough of them to
trigger background writeback. How should metadata writeback behave for XFS
in this case? Who should be responsible that wb_writeback() just does not
loop invoking ->write_metadata() as fast as CPU allows until xfsaild makes
enough progress?

Thinking about this today, I think this looping prevention belongs to
wb_writeback(). Sadly we don't have much info to decide how long to sleep
before trying more writeback so we'd have to just sleep for
<some_magic_amount> if we found no writeback happened in the last writeback
round before going through the whole writeback loop again. And
->write_metadata() for XFS would need to always return 0 (as in "no progress
made") to make sure this busyloop avoidance logic in wb_writeback()
triggers. ext4 and btrfs would return number of bytes written from
->write_metadata (or just 1 would be enough to indicate some progress in
metadata writeback was made and busyloop avoidance is not needed).

So overall I think I have pretty clear idea on how this all should work to
make ->write_metadata useful for btrfs, XFS, and ext4 and we agree on the
plan.

								Honza
Josef Bacik Jan. 3, 2018, 3:49 p.m. UTC | #12
On Wed, Jan 03, 2018 at 02:59:21PM +0100, Jan Kara wrote:
> On Wed 03-01-18 13:32:19, Dave Chinner wrote:
> > On Tue, Jan 02, 2018 at 11:13:06AM -0500, Josef Bacik wrote:
> > > On Wed, Dec 20, 2017 at 03:30:55PM +0100, Jan Kara wrote:
> > > > On Wed 20-12-17 08:35:05, Dave Chinner wrote:
> > > > > On Tue, Dec 19, 2017 at 01:07:09PM +0100, Jan Kara wrote:
> > > > > > On Wed 13-12-17 09:20:04, Dave Chinner wrote:
> > > > > > > IOWs, treating metadata like it's one great big data inode doesn't
> > > > > > > seem to me to be the right abstraction to use for this - in most
> > > > > > > filesystems it's a bunch of objects with a complex dependency tree
> > > > > > > and unknown write ordering, not an inode full of data that can be
> > > > > > > sequentially written.
> > > > > > > 
> > > > > > > Maybe we need multiple ops with well defined behaviours. e.g.
> > > > > > > ->writeback_metadata() for background writeback, ->sync_metadata() for
> > > > > > > sync based operations. That way different filesystems can ignore the
> > > > > > > parts they don't need simply by not implementing those operations,
> > > > > > > and the writeback code doesn't need to try to cater for all
> > > > > > > operations through the one op. The writeback code should be cleaner,
> > > > > > > the filesystem code should be cleaner, and we can tailor the work
> > > > > > > guidelines for each operation separately so there's less mismatch
> > > > > > > between what writeback is asking and how filesystems track dirty
> > > > > > > metadata...
> > > > > > 
> > > > > > I agree that writeback for memory cleaning and writeback for data integrity
> > > > > > are two very different things especially for metadata. In fact for data
> > > > > > integrity writeback we already have ->sync_fs operation so there the
> > > > > > functionality gets duplicated. What we could do is that in
> > > > > > writeback_sb_inodes() we'd call ->write_metadata only when
> > > > > > work->for_kupdate or work->for_background is set. That way ->write_metadata
> > > > > > would be called only for memory cleaning purposes.
> > > > > 
> > > > > That makes sense, but I still think we need a better indication of
> > > > > how much writeback we need to do than just "writeback this chunk of
> > > > > pages". That "writeback a chunk" interface is necessary to share
> > > > > writeback bandwidth across numerous data inodes so that we don't
> > > > > starve any one inode of writeback bandwidth. That's unnecessary for
> > > > > metadata writeback on a superblock - we don't need to share that
> > > > > bandwidth around hundreds or thousands of inodes. What we actually
> > > > > need to know is how much writeback we need to do as a total of all
> > > > > the dirty metadata on the superblock.
> > > > > 
> > > > > Sure, that's not ideal for btrfs and maybe ext4, but we can write a
> > > > > simple generic helper that converts "flush X percent of dirty
> > > > > metadata" to a page/byte chunk as the current code does. Doing it
> > > > > this way allows filesystems to completely internalise the accounting
> > > > > that needs to be done, rather than trying to hack around a
> > > > > writeback accounting interface with large impedance mismatches to
> > > > > how the filesystem accounts for dirty metadata and/or tracks
> > > > > writeback progress.
> > > > 
> > > > Let me think loud on how we could tie this into how memory cleaning
> > > > writeback currently works - the one with for_background == 1 which is
> > > > generally used to get amount of dirty pages in the system under control.
> > > > We have a queue of inodes to write, we iterate over this queue and ask each
> > > > inode to write some amount (e.g. 64 M - exact amount depends on measured
> > 
> > It's a maximum of 1024 pages per inode.
> 
> That's actually a minimum, not maximum, if I read the code in
> writeback_chunk_size() right.
> 
> > > > writeback bandwidth etc.). Some amount from that inode gets written and we
> > > > continue with the next inode in the queue (put this one at the end of the
> > > > queue if it still has dirty pages). We do this until:
> > > > 
> > > > a) the number of dirty pages in the system is below background dirty limit
> > > >    and the number of dirty pages for this device is below background dirty
> > > >    limit for this device.
> > > > b) run out of dirty inodes on this device
> > > > c) someone queues different type of writeback
> > > > 
> > > > And we need to somehow incorporate metadata writeback into this loop. I see
> > > > two questions here:
> > > > 
> > > > 1) When / how often should we ask for metadata writeback?
> > > > 2) How much to ask to write in one go?
> > > > 
> > > > The second question is especially tricky in the presence of completely
> > > > async metadata flushing in XFS - we can ask to write say half of dirty
> > > > metadata but then we have no idea whether the next observation of dirty
> > > > metadata counters is with that part of metadata already under writeback /
> > > > cleaned or whether xfsaild didn't even start working and pushing more has
> > > > no sense.
> > 
> > Well, like with ext4, we've also got to consider that a bunch of the
> > recently dirtied metadata (e.g. from delalloc, EOF updates on IO
> > completion, etc) is still pinned in memory because the
> > journal has not been flushed/checkpointed. Hence we should not be
> > attempting to write back metadata we've dirtied as a result of
> > writing data in the background writeback loop.
> 
> Agreed. Actually for ext4 I would not expose 'pinned' buffers as dirty to
> VM - the journalling layer currently already works that way and it works
> well for us. But that's just a small technical detail and different
> filesystems can decide differently.
> 
> > That greatly simplifies what we need to consider here. That is, we
> > just need to sample the ratio of dirty metadata to clean metadata
> > before we start data writeback, and we calculate the amount of
> > metadata writeback we should trigger from there. We only need to
> > do this *once* per background writeback scan for a superblock
> > as there is no need for sharing bandwidth between lots of data
> > inodes - there's only one metadata inode for ext4/btrfs, and XFS is
> > completely async....
> 
> OK, agreed again.
> 
> > > > Partly, this could be dealt with by telling the filesystem
> > > > "metadata dirty target" - i.e. "get your dirty metadata counters below X"
> > > > - and whether we communicate that in bytes, pages, or a fraction of
> > > > current dirty metadata counter value is a detail I don't have a strong
> > > > opinion on now. And the fact is the amount written by the filesystem
> > > > doesn't have to be very accurate anyway - we basically just want to make
> > > > some forward progress with writing metadata, don't want that to take too
> > > > long (so that other writeback from the thread isn't stalled), and if
> > > > writeback code is unhappy about the state of counters next time it looks,
> > > > it will ask the filesystem again...
> > 
> > Right. The problem is communicating "how much" to the filesystem in
> > a useful manner....
> 
> Yep. I'm fine with communication in the form of 'write X% of your dirty
> metadata'. That should be useful for XFS and as you mentioned in some
> previous email, we can provide a helper function to compute number of pages
> to write (including some reasonable upper limit to bound time spent in one
> ->write_metadata invocation) for ext4 and btrfs.
> 
> > > > This gets me directly to another problem with async nature of XFS metadata
> > > > writeback. That is that it could get writeback thread into busyloop - we
> > > > are supposed to terminate memory cleaning writeback only once dirty
> > > > counters are below limit and in case dirty metadata is causing counters to
> > > > be over limit, we would just ask in a loop XFS to get metadata below the
> > > > target. I suppose XFS could just return "nothing written" from its
> > > > ->write_metadata operation and in such case we could sleep a bit before
> > > > going for another writeback loop (the same thing happens when filesystem
> > > > reports all inodes are locked / busy and it cannot writeback anything). But
> > > > it's getting a bit ugly and is it really better than somehow waiting inside
> > > > XFS for metadata writeback to occur?  Any idea Dave?
> > 
> > I tend to think that the whole point of background writeback is to
> > do it asynchronously and keep the IO pipe full by avoiding blocking
> > on any specific object. i.e. if we can't do writeback from this
> > object, then skip it and do it from the next....
> 
> Agreed.
> 
> > I think we could probably block ->write_metadata if necessary via a
> > completion/wakeup style notification when a specific LSN is reached
> > by the log tail, but realistically if there's any amount of data
> > needing to be written it'll throttle data writes because the IO
> > pipeline is being kept full by background metadata writes....
> 
> So the problem I'm concerned about is a corner case. Consider a situation
> when you have no dirty data, only dirty metadata but enough of them to
> trigger background writeback. How should metadata writeback behave for XFS
> in this case? Who should be responsible that wb_writeback() just does not
> loop invoking ->write_metadata() as fast as CPU allows until xfsaild makes
> enough progress?
> 
> Thinking about this today, I think this looping prevention belongs to
> wb_writeback(). Sadly we don't have much info to decide how long to sleep
> before trying more writeback so we'd have to just sleep for
> <some_magic_amount> if we found no writeback happened in the last writeback
> round before going through the whole writeback loop again. And
> ->write_metadata() for XFS would need to always return 0 (as in "no progress
> made") to make sure this busyloop avoidance logic in wb_writeback()
> triggers. ext4 and btrfs would return number of bytes written from
> ->write_metadata (or just 1 would be enough to indicate some progress in
> metadata writeback was made and busyloop avoidance is not needed).
> 
> So overall I think I have pretty clear idea on how this all should work to
> make ->write_metadata useful for btrfs, XFS, and ext4 and we agree on the
> plan.
> 

I'm glad you do, I'm still confused.  I'm totally fine with sending a % to the
fs to figure out what it wants, what I'm confused about is how to get that % for
xfs?  Since xfs doesn't mark its actual buffers dirty, so wouldn't use
account_metadata_dirtied and its family, how do we generate this % for xfs?  Or
am I misunderstanding and you do plan to use those helpers?  If you do plan to
use them, then we just need to figure out what we want the ratio to be of, and
then you'll be happy Dave?  I'm not trying to argue with you Dave, we're just in
that "talking past each other" stage of every email conversation we've ever had,
I'm trying to get to the "we both understand what we're both saying and are
happy again" stage.  Thanks,

Josef
Jan Kara Jan. 3, 2018, 4:26 p.m. UTC | #13
On Wed 03-01-18 10:49:33, Josef Bacik wrote:
> On Wed, Jan 03, 2018 at 02:59:21PM +0100, Jan Kara wrote:
> > On Wed 03-01-18 13:32:19, Dave Chinner wrote:
> > > On Tue, Jan 02, 2018 at 11:13:06AM -0500, Josef Bacik wrote:
> > > > On Wed, Dec 20, 2017 at 03:30:55PM +0100, Jan Kara wrote:
> > > > > On Wed 20-12-17 08:35:05, Dave Chinner wrote:
> > > > > > On Tue, Dec 19, 2017 at 01:07:09PM +0100, Jan Kara wrote:
> > > > > > > On Wed 13-12-17 09:20:04, Dave Chinner wrote:
> > > > > > > > IOWs, treating metadata like it's one great big data inode doesn't
> > > > > > > > seem to me to be the right abstraction to use for this - in most
> > > > > > > > filesystems it's a bunch of objects with a complex dependency tree
> > > > > > > > and unknown write ordering, not an inode full of data that can be
> > > > > > > > sequentially written.
> > > > > > > > 
> > > > > > > > Maybe we need multiple ops with well defined behaviours. e.g.
> > > > > > > > ->writeback_metadata() for background writeback, ->sync_metadata() for
> > > > > > > > sync based operations. That way different filesystems can ignore the
> > > > > > > > parts they don't need simply by not implementing those operations,
> > > > > > > > and the writeback code doesn't need to try to cater for all
> > > > > > > > operations through the one op. The writeback code should be cleaner,
> > > > > > > > the filesystem code should be cleaner, and we can tailor the work
> > > > > > > > guidelines for each operation separately so there's less mismatch
> > > > > > > > between what writeback is asking and how filesystems track dirty
> > > > > > > > metadata...
> > > > > > > 
> > > > > > > I agree that writeback for memory cleaning and writeback for data integrity
> > > > > > > are two very different things especially for metadata. In fact for data
> > > > > > > integrity writeback we already have ->sync_fs operation so there the
> > > > > > > functionality gets duplicated. What we could do is that in
> > > > > > > writeback_sb_inodes() we'd call ->write_metadata only when
> > > > > > > work->for_kupdate or work->for_background is set. That way ->write_metadata
> > > > > > > would be called only for memory cleaning purposes.
> > > > > > 
> > > > > > That makes sense, but I still think we need a better indication of
> > > > > > how much writeback we need to do than just "writeback this chunk of
> > > > > > pages". That "writeback a chunk" interface is necessary to share
> > > > > > writeback bandwidth across numerous data inodes so that we don't
> > > > > > starve any one inode of writeback bandwidth. That's unnecessary for
> > > > > > metadata writeback on a superblock - we don't need to share that
> > > > > > bandwidth around hundreds or thousands of inodes. What we actually
> > > > > > need to know is how much writeback we need to do as a total of all
> > > > > > the dirty metadata on the superblock.
> > > > > > 
> > > > > > Sure, that's not ideal for btrfs and maybe ext4, but we can write a
> > > > > > simple generic helper that converts "flush X percent of dirty
> > > > > > metadata" to a page/byte chunk as the current code does. Doing it
> > > > > > this way allows filesystems to completely internalise the accounting
> > > > > > that needs to be done, rather than trying to hack around a
> > > > > > writeback accounting interface with large impedance mismatches to
> > > > > > how the filesystem accounts for dirty metadata and/or tracks
> > > > > > writeback progress.
> > > > > 
> > > > > Let me think loud on how we could tie this into how memory cleaning
> > > > > writeback currently works - the one with for_background == 1 which is
> > > > > generally used to get amount of dirty pages in the system under control.
> > > > > We have a queue of inodes to write, we iterate over this queue and ask each
> > > > > inode to write some amount (e.g. 64 M - exact amount depends on measured
> > > 
> > > It's a maximum of 1024 pages per inode.
> > 
> > That's actually a minimum, not maximum, if I read the code in
> > writeback_chunk_size() right.
> > 
> > > > > writeback bandwidth etc.). Some amount from that inode gets written and we
> > > > > continue with the next inode in the queue (put this one at the end of the
> > > > > queue if it still has dirty pages). We do this until:
> > > > > 
> > > > > a) the number of dirty pages in the system is below background dirty limit
> > > > >    and the number of dirty pages for this device is below background dirty
> > > > >    limit for this device.
> > > > > b) run out of dirty inodes on this device
> > > > > c) someone queues different type of writeback
> > > > > 
> > > > > And we need to somehow incorporate metadata writeback into this loop. I see
> > > > > two questions here:
> > > > > 
> > > > > 1) When / how often should we ask for metadata writeback?
> > > > > 2) How much to ask to write in one go?
> > > > > 
> > > > > The second question is especially tricky in the presence of completely
> > > > > async metadata flushing in XFS - we can ask to write say half of dirty
> > > > > metadata but then we have no idea whether the next observation of dirty
> > > > > metadata counters is with that part of metadata already under writeback /
> > > > > cleaned or whether xfsaild didn't even start working and pushing more has
> > > > > no sense.
> > > 
> > > Well, like with ext4, we've also got to consider that a bunch of the
> > > recently dirtied metadata (e.g. from delalloc, EOF updates on IO
> > > completion, etc) is still pinned in memory because the
> > > journal has not been flushed/checkpointed. Hence we should not be
> > > attempting to write back metadata we've dirtied as a result of
> > > writing data in the background writeback loop.
> > 
> > Agreed. Actually for ext4 I would not expose 'pinned' buffers as dirty to
> > VM - the journalling layer currently already works that way and it works
> > well for us. But that's just a small technical detail and different
> > filesystems can decide differently.
> > 
> > > That greatly simplifies what we need to consider here. That is, we
> > > just need to sample the ratio of dirty metadata to clean metadata
> > > before we start data writeback, and we calculate the amount of
> > > metadata writeback we should trigger from there. We only need to
> > > do this *once* per background writeback scan for a superblock
> > > as there is no need for sharing bandwidth between lots of data
> > > inodes - there's only one metadata inode for ext4/btrfs, and XFS is
> > > completely async....
> > 
> > OK, agreed again.
> > 
> > > > > Partly, this could be dealt with by telling the filesystem
> > > > > "metadata dirty target" - i.e. "get your dirty metadata counters below X"
> > > > > - and whether we communicate that in bytes, pages, or a fraction of
> > > > > current dirty metadata counter value is a detail I don't have a strong
> > > > > opinion on now. And the fact is the amount written by the filesystem
> > > > > doesn't have to be very accurate anyway - we basically just want to make
> > > > > some forward progress with writing metadata, don't want that to take too
> > > > > long (so that other writeback from the thread isn't stalled), and if
> > > > > writeback code is unhappy about the state of counters next time it looks,
> > > > > it will ask the filesystem again...
> > > 
> > > Right. The problem is communicating "how much" to the filesystem in
> > > a useful manner....
> > 
> > Yep. I'm fine with communication in the form of 'write X% of your dirty
> > metadata'. That should be useful for XFS and as you mentioned in some
> > previous email, we can provide a helper function to compute number of pages
> > to write (including some reasonable upper limit to bound time spent in one
> > ->write_metadata invocation) for ext4 and btrfs.
> > 
> > > > > This gets me directly to another problem with async nature of XFS metadata
> > > > > writeback. That is that it could get writeback thread into busyloop - we
> > > > > are supposed to terminate memory cleaning writeback only once dirty
> > > > > counters are below limit and in case dirty metadata is causing counters to
> > > > > be over limit, we would just ask in a loop XFS to get metadata below the
> > > > > target. I suppose XFS could just return "nothing written" from its
> > > > > ->write_metadata operation and in such case we could sleep a bit before
> > > > > going for another writeback loop (the same thing happens when filesystem
> > > > > reports all inodes are locked / busy and it cannot writeback anything). But
> > > > > it's getting a bit ugly and is it really better than somehow waiting inside
> > > > > XFS for metadata writeback to occur?  Any idea Dave?
> > > 
> > > I tend to think that the whole point of background writeback is to
> > > do it asynchronously and keep the IO pipe full by avoiding blocking
> > > on any specific object. i.e. if we can't do writeback from this
> > > object, then skip it and do it from the next....
> > 
> > Agreed.
> > 
> > > I think we could probably block ->write_metadata if necessary via a
> > > completion/wakeup style notification when a specific LSN is reached
> > > by the log tail, but realistically if there's any amount of data
> > > needing to be written it'll throttle data writes because the IO
> > > pipeline is being kept full by background metadata writes....
> > 
> > So the problem I'm concerned about is a corner case. Consider a situation
> > when you have no dirty data, only dirty metadata but enough of them to
> > trigger background writeback. How should metadata writeback behave for XFS
> > in this case? Who should be responsible that wb_writeback() just does not
> > loop invoking ->write_metadata() as fast as CPU allows until xfsaild makes
> > enough progress?
> > 
> > Thinking about this today, I think this looping prevention belongs to
> > wb_writeback(). Sadly we don't have much info to decide how long to sleep
> > before trying more writeback so we'd have to just sleep for
> > <some_magic_amount> if we found no writeback happened in the last writeback
> > round before going through the whole writeback loop again. And
> > ->write_metadata() for XFS would need to always return 0 (as in "no progress
> > made") to make sure this busyloop avoidance logic in wb_writeback()
> > triggers. ext4 and btrfs would return number of bytes written from
> > ->write_metadata (or just 1 would be enough to indicate some progress in
> > metadata writeback was made and busyloop avoidance is not needed).
> > 
> > So overall I think I have pretty clear idea on how this all should work to
> > make ->write_metadata useful for btrfs, XFS, and ext4 and we agree on the
> > plan.
> > 
> 
> I'm glad you do, I'm still confused.  I'm totally fine with sending a % to the
> fs to figure out what it wants, what I'm confused about is how to get that % for
> xfs?  Since xfs doesn't mark its actual buffers dirty, so wouldn't use
> account_metadata_dirtied and its family, how do we generate this % for xfs?  Or
> am I misunderstanding and you do plan to use those helpers?

AFAIU he plans to use account_metadata_dirtied() & co. in XFS.

> If you do plan to use them, then we just need to figure out what we want
> the ratio to be of, and then you'll be happy Dave?

Reasonably natural dirty target would be dirty_background_ratio of total
metadata amount to be dirty. We would have to be somewhat creative if
dirty_background_bytes is actually set instead of dirty_background_ratio
and use ratio like dirty_background_bytes / (data + metadata amount) but
it's doable...

								Honza
Josef Bacik Jan. 3, 2018, 4:29 p.m. UTC | #14
On Wed, Jan 03, 2018 at 05:26:03PM +0100, Jan Kara wrote:
> On Wed 03-01-18 10:49:33, Josef Bacik wrote:
> > On Wed, Jan 03, 2018 at 02:59:21PM +0100, Jan Kara wrote:
> > > On Wed 03-01-18 13:32:19, Dave Chinner wrote:
> > > > On Tue, Jan 02, 2018 at 11:13:06AM -0500, Josef Bacik wrote:
> > > > > On Wed, Dec 20, 2017 at 03:30:55PM +0100, Jan Kara wrote:
> > > > > > On Wed 20-12-17 08:35:05, Dave Chinner wrote:
> > > > > > > On Tue, Dec 19, 2017 at 01:07:09PM +0100, Jan Kara wrote:
> > > > > > > > On Wed 13-12-17 09:20:04, Dave Chinner wrote:
> > > > > > > > > IOWs, treating metadata like it's one great big data inode doesn't
> > > > > > > > > seem to me to be the right abstraction to use for this - in most
> > > > > > > > > filesystems it's a bunch of objects with a complex dependency tree
> > > > > > > > > and unknown write ordering, not an inode full of data that can be
> > > > > > > > > sequentially written.
> > > > > > > > > 
> > > > > > > > > Maybe we need multiple ops with well defined behaviours. e.g.
> > > > > > > > > ->writeback_metadata() for background writeback, ->sync_metadata() for
> > > > > > > > > sync based operations. That way different filesystems can ignore the
> > > > > > > > > parts they don't need simply by not implementing those operations,
> > > > > > > > > and the writeback code doesn't need to try to cater for all
> > > > > > > > > operations through the one op. The writeback code should be cleaner,
> > > > > > > > > the filesystem code should be cleaner, and we can tailor the work
> > > > > > > > > guidelines for each operation separately so there's less mismatch
> > > > > > > > > between what writeback is asking and how filesystems track dirty
> > > > > > > > > metadata...
> > > > > > > > 
> > > > > > > > I agree that writeback for memory cleaning and writeback for data integrity
> > > > > > > > are two very different things especially for metadata. In fact for data
> > > > > > > > integrity writeback we already have ->sync_fs operation so there the
> > > > > > > > functionality gets duplicated. What we could do is that in
> > > > > > > > writeback_sb_inodes() we'd call ->write_metadata only when
> > > > > > > > work->for_kupdate or work->for_background is set. That way ->write_metadata
> > > > > > > > would be called only for memory cleaning purposes.
> > > > > > > 
> > > > > > > That makes sense, but I still think we need a better indication of
> > > > > > > how much writeback we need to do than just "writeback this chunk of
> > > > > > > pages". That "writeback a chunk" interface is necessary to share
> > > > > > > writeback bandwidth across numerous data inodes so that we don't
> > > > > > > starve any one inode of writeback bandwidth. That's unnecessary for
> > > > > > > metadata writeback on a superblock - we don't need to share that
> > > > > > > bandwidth around hundreds or thousands of inodes. What we actually
> > > > > > > need to know is how much writeback we need to do as a total of all
> > > > > > > the dirty metadata on the superblock.
> > > > > > > 
> > > > > > > Sure, that's not ideal for btrfs and maybe ext4, but we can write a
> > > > > > > simple generic helper that converts "flush X percent of dirty
> > > > > > > metadata" to a page/byte chunk as the current code does. Doing it
> > > > > > > this way allows filesystems to completely internalise the accounting
> > > > > > > that needs to be done, rather than trying to hack around a
> > > > > > > writeback accounting interface with large impedance mismatches to
> > > > > > > how the filesystem accounts for dirty metadata and/or tracks
> > > > > > > writeback progress.
> > > > > > 
> > > > > > Let me think loud on how we could tie this into how memory cleaning
> > > > > > writeback currently works - the one with for_background == 1 which is
> > > > > > generally used to get amount of dirty pages in the system under control.
> > > > > > We have a queue of inodes to write, we iterate over this queue and ask each
> > > > > > inode to write some amount (e.g. 64 M - exact amount depends on measured
> > > > 
> > > > It's a maximum of 1024 pages per inode.
> > > 
> > > That's actually a minimum, not maximum, if I read the code in
> > > writeback_chunk_size() right.
> > > 
> > > > > > writeback bandwidth etc.). Some amount from that inode gets written and we
> > > > > > continue with the next inode in the queue (put this one at the end of the
> > > > > > queue if it still has dirty pages). We do this until:
> > > > > > 
> > > > > > a) the number of dirty pages in the system is below background dirty limit
> > > > > >    and the number of dirty pages for this device is below background dirty
> > > > > >    limit for this device.
> > > > > > b) run out of dirty inodes on this device
> > > > > > c) someone queues different type of writeback
> > > > > > 
> > > > > > And we need to somehow incorporate metadata writeback into this loop. I see
> > > > > > two questions here:
> > > > > > 
> > > > > > 1) When / how often should we ask for metadata writeback?
> > > > > > 2) How much to ask to write in one go?
> > > > > > 
> > > > > > The second question is especially tricky in the presence of completely
> > > > > > async metadata flushing in XFS - we can ask to write say half of dirty
> > > > > > metadata but then we have no idea whether the next observation of dirty
> > > > > > metadata counters is with that part of metadata already under writeback /
> > > > > > cleaned or whether xfsaild didn't even start working and pushing more has
> > > > > > no sense.
> > > > 
> > > > Well, like with ext4, we've also got to consider that a bunch of the
> > > > recently dirtied metadata (e.g. from delalloc, EOF updates on IO
> > > > completion, etc) is still pinned in memory because the
> > > > journal has not been flushed/checkpointed. Hence we should not be
> > > > attempting to write back metadata we've dirtied as a result of
> > > > writing data in the background writeback loop.
> > > 
> > > Agreed. Actually for ext4 I would not expose 'pinned' buffers as dirty to
> > > VM - the journalling layer currently already works that way and it works
> > > well for us. But that's just a small technical detail and different
> > > filesystems can decide differently.
> > > 
> > > > That greatly simplifies what we need to consider here. That is, we
> > > > just need to sample the ratio of dirty metadata to clean metadata
> > > > before we start data writeback, and we calculate the amount of
> > > > metadata writeback we should trigger from there. We only need to
> > > > do this *once* per background writeback scan for a superblock
> > > > as there is no need for sharing bandwidth between lots of data
> > > > inodes - there's only one metadata inode for ext4/btrfs, and XFS is
> > > > completely async....
> > > 
> > > OK, agreed again.
> > > 
> > > > > > Partly, this could be dealt with by telling the filesystem
> > > > > > "metadata dirty target" - i.e. "get your dirty metadata counters below X"
> > > > > > - and whether we communicate that in bytes, pages, or a fraction of
> > > > > > current dirty metadata counter value is a detail I don't have a strong
> > > > > > opinion on now. And the fact is the amount written by the filesystem
> > > > > > doesn't have to be very accurate anyway - we basically just want to make
> > > > > > some forward progress with writing metadata, don't want that to take too
> > > > > > long (so that other writeback from the thread isn't stalled), and if
> > > > > > writeback code is unhappy about the state of counters next time it looks,
> > > > > > it will ask the filesystem again...
> > > > 
> > > > Right. The problem is communicating "how much" to the filesystem in
> > > > a useful manner....
> > > 
> > > Yep. I'm fine with communication in the form of 'write X% of your dirty
> > > metadata'. That should be useful for XFS and as you mentioned in some
> > > previous email, we can provide a helper function to compute number of pages
> > > to write (including some reasonable upper limit to bound time spent in one
> > > ->write_metadata invocation) for ext4 and btrfs.
> > > 
> > > > > > This gets me directly to another problem with async nature of XFS metadata
> > > > > > writeback. That is that it could get writeback thread into busyloop - we
> > > > > > are supposed to terminate memory cleaning writeback only once dirty
> > > > > > counters are below limit and in case dirty metadata is causing counters to
> > > > > > be over limit, we would just ask in a loop XFS to get metadata below the
> > > > > > target. I suppose XFS could just return "nothing written" from its
> > > > > > ->write_metadata operation and in such case we could sleep a bit before
> > > > > > going for another writeback loop (the same thing happens when filesystem
> > > > > > reports all inodes are locked / busy and it cannot writeback anything). But
> > > > > > it's getting a bit ugly and is it really better than somehow waiting inside
> > > > > > XFS for metadata writeback to occur?  Any idea Dave?
> > > > 
> > > > I tend to think that the whole point of background writeback is to
> > > > do it asynchronously and keep the IO pipe full by avoiding blocking
> > > > on any specific object. i.e. if we can't do writeback from this
> > > > object, then skip it and do it from the next....
> > > 
> > > Agreed.
> > > 
> > > > I think we could probably block ->write_metadata if necessary via a
> > > > completion/wakeup style notification when a specific LSN is reached
> > > > by the log tail, but realistically if there's any amount of data
> > > > needing to be written it'll throttle data writes because the IO
> > > > pipeline is being kept full by background metadata writes....
> > > 
> > > So the problem I'm concerned about is a corner case. Consider a situation
> > > when you have no dirty data, only dirty metadata but enough of them to
> > > trigger background writeback. How should metadata writeback behave for XFS
> > > in this case? Who should be responsible that wb_writeback() just does not
> > > loop invoking ->write_metadata() as fast as CPU allows until xfsaild makes
> > > enough progress?
> > > 
> > > Thinking about this today, I think this looping prevention belongs to
> > > wb_writeback(). Sadly we don't have much info to decide how long to sleep
> > > before trying more writeback so we'd have to just sleep for
> > > <some_magic_amount> if we found no writeback happened in the last writeback
> > > round before going through the whole writeback loop again. And
> > > ->write_metadata() for XFS would need to always return 0 (as in "no progress
> > > made") to make sure this busyloop avoidance logic in wb_writeback()
> > > triggers. ext4 and btrfs would return number of bytes written from
> > > ->write_metadata (or just 1 would be enough to indicate some progress in
> > > metadata writeback was made and busyloop avoidance is not needed).
> > > 
> > > So overall I think I have pretty clear idea on how this all should work to
> > > make ->write_metadata useful for btrfs, XFS, and ext4 and we agree on the
> > > plan.
> > > 
> > 
> > I'm glad you do, I'm still confused.  I'm totally fine with sending a % to the
> > fs to figure out what it wants, what I'm confused about is how to get that % for
> > xfs?  Since xfs doesn't mark its actual buffers dirty, it wouldn't use
> > account_metadata_dirtied and its family, how do we generate this % for xfs?  Or
> > am I misunderstanding and you do plan to use those helpers?
> 
> AFAIU he plans to use account_metadata_dirtied() & co. in XFS.
> 
> > If you do plan to use them, then we just need to figure out what we want
> > the ratio to be of, and then you'll be happy Dave?
> 
> Reasonably natural dirty target would be dirty_background_ratio of total
> metadata amount to be dirty. We would have to be somewhat creative if
> dirty_background_bytes is actually set instead of dirty_background_ratio
> and use ratio like dirty_background_bytes / (data + metadata amount) but
> it's doable...
> 

Oh ok well if that's the case then I'll fix this up to be a ratio, test
everything, and send it along probably early next week.  Thanks,

Josef
Dave Chinner Jan. 4, 2018, 1:32 a.m. UTC | #15
On Wed, Jan 03, 2018 at 02:59:21PM +0100, Jan Kara wrote:
> On Wed 03-01-18 13:32:19, Dave Chinner wrote:
> > I think we could probably block ->write_metadata if necessary via a
> > completion/wakeup style notification when a specific LSN is reached
> > by the log tail, but realistically if there's any amount of data
> > needing to be written it'll throttle data writes because the IO
> > pipeline is being kept full by background metadata writes....
> 
> So the problem I'm concerned about is a corner case. Consider a situation
> when you have no dirty data, only dirty metadata but enough of them to
> trigger background writeback. How should metadata writeback behave for XFS
> in this case? Who should be responsible that wb_writeback() just does not
> loop invoking ->write_metadata() as fast as CPU allows until xfsaild makes
> enough progress?
>
> Thinking about this today, I think this looping prevention belongs to
> wb_writeback().

Well, background data writeback can block in two ways. One is during
IO submission when the request queue is full, the other is when all
dirty inodes have had some work done on them and have all been moved
to b_more_io - wb_writeback waits for the __I_SYNC bit to be cleared
on the last(?) inode on that list, hence backing off before
submitting more IO.

IOWs, there's a "during writeback" blocking mechanism as well as a
"between cycles" block mechanism.

> Sadly we don't have much info to decide how long to sleep
> before trying more writeback so we'd have to just sleep for
> <some_magic_amount> if we found no writeback happened in the last writeback
> round before going through the whole writeback loop again.

Right - I don't think we can provide a generic "between cycles"
blocking mechanism for XFS, but I'm pretty sure we can emulate a
"during writeback" blocking mechanism to avoid busy looping inside
the XFS code.

e.g. if we get a writeback call that asks for 5% to be written,
and we already have a metadata writeback target of 5% in place,
that means we should block for a while. That would emulate request
queue blocking and prevent busy looping in this case....

> And
> ->write_metadata() for XFS would need to always return 0 (as in "no progress
> made") to make sure this busyloop avoidance logic in wb_writeback()
> triggers. ext4 and btrfs would return number of bytes written from
> ->write_metadata (or just 1 would be enough to indicate some progress in
> metadata writeback was made and busyloop avoidance is not needed).

Well, if we block for a little while, we can indicate that progress
has been made and this whole mess would go away, right?

Cheers,

Dave.
Jan Kara Jan. 4, 2018, 9:10 a.m. UTC | #16
On Thu 04-01-18 12:32:07, Dave Chinner wrote:
> On Wed, Jan 03, 2018 at 02:59:21PM +0100, Jan Kara wrote:
> > On Wed 03-01-18 13:32:19, Dave Chinner wrote:
> > > I think we could probably block ->write_metadata if necessary via a
> > > completion/wakeup style notification when a specific LSN is reached
> > > by the log tail, but realistically if there's any amount of data
> > > needing to be written it'll throttle data writes because the IO
> > > pipeline is being kept full by background metadata writes....
> > 
> > So the problem I'm concerned about is a corner case. Consider a situation
> > when you have no dirty data, only dirty metadata but enough of them to
> > trigger background writeback. How should metadata writeback behave for XFS
> > in this case? Who should be responsible that wb_writeback() just does not
> > loop invoking ->write_metadata() as fast as CPU allows until xfsaild makes
> > enough progress?
> >
> > Thinking about this today, I think this looping prevention belongs to
> > wb_writeback().
> 
> Well, background data writeback can block in two ways. One is during
> IO submission when the request queue is full, the other is when all
> dirty inodes have had some work done on them and have all been moved
> to b_more_io - wb_writeback waits for the __I_SYNC bit to be cleared
> on the last(?) inode on that list, hence backing off before
> submitting more IO.
> 
> IOWs, there's a "during writeback" blocking mechanism as well as a
> "between cycles" block mechanism.
> 
> > Sadly we don't have much info to decide how long to sleep
> > before trying more writeback so we'd have to just sleep for
> > <some_magic_amount> if we found no writeback happened in the last writeback
> > round before going through the whole writeback loop again.
> 
> Right - I don't think we can provide a generic "between cycles"
> blocking mechanism for XFS, but I'm pretty sure we can emulate a
> "during writeback" blocking mechanism to avoid busy looping inside
> the XFS code.
> 
> e.g. if we get a writeback call that asks for 5% to be written,
> and we already have a metadata writeback target of 5% in place,
> that means we should block for a while. That would emulate request
> queue blocking and prevent busy looping in this case....

If you can do this in XFS then fine, it saves some mess in the generic
code.

> > And
> > ->write_metadata() for XFS would need to always return 0 (as in "no progress
> > made") to make sure this busyloop avoidance logic in wb_writeback()
> > triggers. ext4 and btrfs would return number of bytes written from
> > ->write_metadata (or just 1 would be enough to indicate some progress in
> > metadata writeback was made and busyloop avoidance is not needed).
> 
> Well, if we block for a little while, we can indicate that progress
> has been made and this whole mess would go away, right?

Right. So let's just ignore the problem for the sake of Josef's patch set.
Once the patches land and when XFS starts using the infrastructure, we will
make sure this is handled properly.

								Honza
Chandan Rajendra Jan. 29, 2018, 9:06 a.m. UTC | #17
On Wednesday, January 3, 2018 9:59:24 PM IST Josef Bacik wrote:
> On Wed, Jan 03, 2018 at 05:26:03PM +0100, Jan Kara wrote:

> 
> Oh ok well if that's the case then I'll fix this up to be a ratio, test
> everything, and send it along probably early next week.  Thanks,
> 

Hi Josef,

Did you get a chance to work on the next version of this patchset?
Chandan Rajendra Sept. 28, 2018, 8:37 a.m. UTC | #18
On Monday, January 29, 2018 2:36:15 PM IST Chandan Rajendra wrote:
> On Wednesday, January 3, 2018 9:59:24 PM IST Josef Bacik wrote:
> > On Wed, Jan 03, 2018 at 05:26:03PM +0100, Jan Kara wrote:
> 
> > 
> > Oh ok well if that's the case then I'll fix this up to be a ratio, test
> > everything, and send it along probably early next week.  Thanks,
> > 
> 
> Hi Josef,
> 
> Did you get a chance to work on the next version of this patchset?
> 
> 
> 

Josef,  Any updates on this and the "Kill Btree inode" patchset?
diff mbox

Patch

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 987448ed7698..fba703dff678 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1479,6 +1479,31 @@  static long writeback_chunk_size(struct bdi_writeback *wb,
 	return pages;
 }
 
+static long writeback_sb_metadata(struct super_block *sb,
+				  struct bdi_writeback *wb,
+				  struct wb_writeback_work *work)
+{
+	struct writeback_control wbc = {
+		.sync_mode		= work->sync_mode,
+		.tagged_writepages	= work->tagged_writepages,
+		.for_kupdate		= work->for_kupdate,
+		.for_background		= work->for_background,
+		.for_sync		= work->for_sync,
+		.range_cyclic		= work->range_cyclic,
+		.range_start		= 0,
+		.range_end		= LLONG_MAX,
+	};
+	long write_chunk;
+
+	write_chunk = writeback_chunk_size(wb, work);
+	wbc.nr_to_write = write_chunk;
+	sb->s_op->write_metadata(sb, &wbc);
+	work->nr_pages -= write_chunk - wbc.nr_to_write;
+
+	return write_chunk - wbc.nr_to_write;
+}
+
+
 /*
  * Write a portion of b_io inodes which belong to @sb.
  *
@@ -1505,6 +1530,7 @@  static long writeback_sb_inodes(struct super_block *sb,
 	unsigned long start_time = jiffies;
 	long write_chunk;
 	long wrote = 0;  /* count both pages and inodes */
+	bool done = false;
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
@@ -1621,12 +1647,18 @@  static long writeback_sb_inodes(struct super_block *sb,
 		 * background threshold and other termination conditions.
 		 */
 		if (wrote) {
-			if (time_is_before_jiffies(start_time + HZ / 10UL))
-				break;
-			if (work->nr_pages <= 0)
+			if (time_is_before_jiffies(start_time + HZ / 10UL) ||
+			    work->nr_pages <= 0) {
+				done = true;
 				break;
+			}
 		}
 	}
+	if (!done && sb->s_op->write_metadata) {
+		spin_unlock(&wb->list_lock);
+		wrote += writeback_sb_metadata(sb, wb, work);
+		spin_lock(&wb->list_lock);
+	}
 	return wrote;
 }
 
@@ -1635,6 +1667,7 @@  static long __writeback_inodes_wb(struct bdi_writeback *wb,
 {
 	unsigned long start_time = jiffies;
 	long wrote = 0;
+	bool done = false;
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
@@ -1654,12 +1687,39 @@  static long __writeback_inodes_wb(struct bdi_writeback *wb,
 
 		/* refer to the same tests at the end of writeback_sb_inodes */
 		if (wrote) {
-			if (time_is_before_jiffies(start_time + HZ / 10UL))
-				break;
-			if (work->nr_pages <= 0)
+			if (time_is_before_jiffies(start_time + HZ / 10UL) ||
+			    work->nr_pages <= 0) {
+				done = true;
 				break;
+			}
 		}
 	}
+
+	if (!done && wb_stat(wb, WB_METADATA_DIRTY_BYTES)) {
+		LIST_HEAD(list);
+
+		spin_unlock(&wb->list_lock);
+		spin_lock(&wb->bdi->sb_list_lock);
+		list_splice_init(&wb->bdi->dirty_sb_list, &list);
+		while (!list_empty(&list)) {
+			struct super_block *sb;
+
+			sb = list_first_entry(&list, struct super_block,
+					      s_bdi_dirty_list);
+			list_move_tail(&sb->s_bdi_dirty_list,
+				       &wb->bdi->dirty_sb_list);
+			if (!sb->s_op->write_metadata)
+				continue;
+			if (!trylock_super(sb))
+				continue;
+			spin_unlock(&wb->bdi->sb_list_lock);
+			wrote += writeback_sb_metadata(sb, wb, work);
+			spin_lock(&wb->bdi->sb_list_lock);
+			up_read(&sb->s_umount);
+		}
+		spin_unlock(&wb->bdi->sb_list_lock);
+		spin_lock(&wb->list_lock);
+	}
 	/* Leave any unwritten inodes on b_io */
 	return wrote;
 }
diff --git a/fs/super.c b/fs/super.c
index 166c4ee0d0ed..2290bef486a3 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -214,6 +214,7 @@  static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	spin_lock_init(&s->s_inode_list_lock);
 	INIT_LIST_HEAD(&s->s_inodes_wb);
 	spin_lock_init(&s->s_inode_wblist_lock);
+	INIT_LIST_HEAD(&s->s_bdi_dirty_list);
 
 	if (list_lru_init_memcg(&s->s_dentry_lru))
 		goto fail;
@@ -446,6 +447,11 @@  void generic_shutdown_super(struct super_block *sb)
 	spin_unlock(&sb_lock);
 	up_write(&sb->s_umount);
 	if (sb->s_bdi != &noop_backing_dev_info) {
+		if (!list_empty(&sb->s_bdi_dirty_list)) {
+			spin_lock(&sb->s_bdi->sb_list_lock);
+			list_del_init(&sb->s_bdi_dirty_list);
+			spin_unlock(&sb->s_bdi->sb_list_lock);
+		}
 		bdi_put(sb->s_bdi);
 		sb->s_bdi = &noop_backing_dev_info;
 	}
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 78c65e2910dc..a961f9a51a38 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -176,6 +176,8 @@  struct backing_dev_info {
 
 	struct timer_list laptop_mode_wb_timer;
 
+	spinlock_t sb_list_lock;
+	struct list_head dirty_sb_list;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 339e73742e73..298a28eaed2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1440,6 +1440,8 @@  struct super_block {
 
 	spinlock_t		s_inode_wblist_lock;
 	struct list_head	s_inodes_wb;	/* writeback inodes */
+
+	struct list_head        s_bdi_dirty_list;
 } __randomize_layout;
 
 /* Helper functions so that in most cases filesystems will
@@ -1830,6 +1832,8 @@  struct super_operations {
 				  struct shrink_control *);
 	long (*free_cached_objects)(struct super_block *,
 				    struct shrink_control *);
+	void (*write_metadata)(struct super_block *sb,
+			       struct writeback_control *wbc);
 };
 
 /*
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0aad67bc0898..e3aa4e0dd15e 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -839,6 +839,8 @@  static int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->wb_list);
+	INIT_LIST_HEAD(&bdi->dirty_sb_list);
+	spin_lock_init(&bdi->sb_list_lock);
 	init_waitqueue_head(&bdi->wb_waitq);
 
 	ret = cgwb_bdi_init(bdi);