Message ID | 20250313233341.1675324-29-dhowells@redhat.com (mailing list archive) |
---|---|
State | New |
Series | ceph, rbd, netfs: Make ceph fully use netfslib |
On Thu, 2025-03-13 at 23:33 +0000, David Howells wrote:
> Make some adjustments to the handling of netfs groups so that ceph can use
> them for snap contexts:
>
> - Move netfs_get_group(), netfs_put_group() and netfs_put_group_many() to
>   linux/netfs.h so that ceph can build its snap context on netfs groups.
>
> - Move netfs_set_group() and __netfs_set_group() to linux/netfs.h so that
>   ceph_dirty_folio() can call them from inside of the locked section in
>   which it finds the snap context to attach.
>
> - Provide a netfs_writepages_group() that takes a group as a parameter and
>   attaches it to the request and make netfs_free_request() drop the ref on
>   it.  netfs_writepages() then becomes a wrapper that passes in a NULL
>   group.
>
> - In netfs_perform_write(), only consider a folio to have a conflicting
>   group if the folio's group pointer isn't NULL and if the folio is dirty.
>
> - In netfs_perform_write(), interject a small 10ms sleep after every 16
>   attempts to flush a folio within a single call.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Jeff Layton <jlayton@kernel.org>
> cc: Viacheslav Dubeyko <slava@dubeyko.com>
> cc: Alex Markuze <amarkuze@redhat.com>
> cc: Ilya Dryomov <idryomov@gmail.com>
> cc: ceph-devel@vger.kernel.org
> cc: linux-fsdevel@vger.kernel.org
> ---
>  fs/netfs/buffered_write.c | 25 ++++-------------
>  fs/netfs/internal.h       | 32 ---------------------
>  fs/netfs/objects.c        |  1 +
>  fs/netfs/write_issue.c    | 38 +++++++++++++++++++++----
>  include/linux/netfs.h     | 59 +++++++++++++++++++++++++++++++++++++++
>  5 files changed, 98 insertions(+), 57 deletions(-)
>
> diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
> index 0245449b93e3..12ddbe9bc78b 100644
> --- a/fs/netfs/buffered_write.c
> +++ b/fs/netfs/buffered_write.c
> @@ -11,26 +11,9 @@
>  #include <linux/pagemap.h>
>  #include <linux/slab.h>
>  #include <linux/pagevec.h>
> +#include <linux/delay.h>
>  #include "internal.h"
>
> -static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
> -{
> -        if (netfs_group)
> -                folio_attach_private(folio, netfs_get_group(netfs_group));
> -}
> -
> -static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
> -{
> -        void *priv = folio_get_private(folio);
> -
> -        if (unlikely(priv != netfs_group)) {
> -                if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
> -                        folio_attach_private(folio, netfs_get_group(netfs_group));
> -                else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
> -                        folio_detach_private(folio);
> -        }
> -}
> -
>  /*
>   * Grab a folio for writing and lock it.  Attempt to allocate as large a folio
>   * as possible to hold as much of the remaining length as possible in one go.
> @@ -113,6 +96,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
>          };
>          struct netfs_io_request *wreq = NULL;
>          struct folio *folio = NULL, *writethrough = NULL;
> +        unsigned int flush_counter = 0;
>          unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
>          ssize_t written = 0, ret, ret2;
>          loff_t i_size, pos = iocb->ki_pos;
> @@ -208,7 +192,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
>                  group = netfs_folio_group(folio);
>
>                  if (unlikely(group != netfs_group) &&
> -                    group != NETFS_FOLIO_COPY_TO_CACHE)
> +                    group != NETFS_FOLIO_COPY_TO_CACHE &&
> +                    (group || folio_test_dirty(folio)))

I am trying to follow this complex condition. Is there a possible case where
the folio is dirty but we don't flush the content?

>                          goto flush_content;
>
>                  if (folio_test_uptodate(folio)) {
> @@ -341,6 +326,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
>                  trace_netfs_folio(folio, netfs_flush_content);
>                  folio_unlock(folio);
>                  folio_put(folio);
> +                if ((++flush_counter & 0xf) == 0xf)
> +                        msleep(10);

Do we really need to use sleep? And why is it 10 ms? And even if we would
like to use sleep, then it would be better to introduce a named constant.
And what is the justification for 10 ms?

>                  ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
>                  if (ret < 0)
>                          goto error_folio_unlock;
> diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
> index eebb4f0f660e..2a6123c4da35 100644
> --- a/fs/netfs/internal.h
> +++ b/fs/netfs/internal.h
> @@ -261,38 +261,6 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
>  #endif
>  }
>
> -/*
> - * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
> - */
> -static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
> -{
> -        if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
> -                refcount_inc(&netfs_group->ref);
> -        return netfs_group;
> -}
> -
> -/*
> - * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
> - */
> -static inline void netfs_put_group(struct netfs_group *netfs_group)
> -{
> -        if (netfs_group &&
> -            netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
> -            refcount_dec_and_test(&netfs_group->ref))
> -                netfs_group->free(netfs_group);
> -}
> -
> -/*
> - * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
> - */
> -static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
> -{
> -        if (netfs_group &&
> -            netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
> -            refcount_sub_and_test(nr, &netfs_group->ref))
> -                netfs_group->free(netfs_group);
> -}
> -
>  /*
>   * Check to see if a buffer aligns with the crypto block size.  If it doesn't
>   * the crypto layer is going to copy all the data - in which case relying on
> diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
> index 52d6fce70837..7fdbaa5c5cab 100644
> --- a/fs/netfs/objects.c
> +++ b/fs/netfs/objects.c
> @@ -153,6 +153,7 @@ static void netfs_free_request(struct work_struct *work)
>                  kvfree(rreq->direct_bv);
>          }
>
> +        netfs_put_group(rreq->group);
>          rolling_buffer_clear(&rreq->buffer);
>          rolling_buffer_clear(&rreq->bounce);
>          if (test_bit(NETFS_RREQ_PUT_RMW_TAIL, &rreq->flags))
> diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
> index 93601033ba08..3921fcf4f859 100644
> --- a/fs/netfs/write_issue.c
> +++ b/fs/netfs/write_issue.c
> @@ -418,7 +418,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
>                  netfs_issue_write(wreq, upload);
>          } else if (fgroup != wreq->group) {
>                  /* We can't write this page to the server yet. */
> -                kdebug("wrong group");
> +                kdebug("wrong group %px != %px", fgroup, wreq->group);

I believe using %px is not very good practice. Do we really need to show the
real pointer?

>                  folio_redirty_for_writepage(wbc, folio);
>                  folio_unlock(folio);
>                  netfs_issue_write(wreq, upload);
> @@ -593,11 +593,19 @@ static void netfs_end_issue_write(struct netfs_io_request *wreq)
>                  netfs_wake_write_collector(wreq, false);
>  }
>
> -/*
> - * Write some of the pending data back to the server
> +/**
> + * netfs_writepages_group - Flush data from the pagecache for a file
> + * @mapping: The file to flush from
> + * @wbc: Details of what should be flushed
> + * @group: The write grouping to flush (or NULL)
> + *
> + * Start asynchronous write back operations to flush dirty data belonging to a
> + * particular group in a file's pagecache back to the server and to the local
> + * cache.
>   */
> -int netfs_writepages(struct address_space *mapping,
> -                     struct writeback_control *wbc)
> +int netfs_writepages_group(struct address_space *mapping,
> +                           struct writeback_control *wbc,
> +                           struct netfs_group *group)
>  {
>          struct netfs_inode *ictx = netfs_inode(mapping->host);
>          struct netfs_io_request *wreq = NULL;
> @@ -618,12 +626,15 @@ int netfs_writepages(struct address_space *mapping,
>          if (!folio)
>                  goto out;
>
> -        wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
> +        wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio),
> +                                      NETFS_WRITEBACK);
>          if (IS_ERR(wreq)) {
>                  error = PTR_ERR(wreq);
>                  goto couldnt_start;
>          }
>
> +        wreq->group = netfs_get_group(group);
> +
>          trace_netfs_write(wreq, netfs_write_trace_writeback);
>          netfs_stat(&netfs_n_wh_writepages);
>
> @@ -659,6 +670,21 @@ int netfs_writepages(struct address_space *mapping,
>          _leave(" = %d", error);
>          return error;
>  }
> +EXPORT_SYMBOL(netfs_writepages_group);
> +
> +/**
> + * netfs_writepages - Flush data from the pagecache for a file
> + * @mapping: The file to flush from
> + * @wbc: Details of what should be flushed
> + *
> + * Start asynchronous write back operations to flush dirty data in a file's
> + * pagecache back to the server and to the local cache.
> + */
> +int netfs_writepages(struct address_space *mapping,
> +                     struct writeback_control *wbc)
> +{
> +        return netfs_writepages_group(mapping, wbc, NULL);
> +}
>  EXPORT_SYMBOL(netfs_writepages);
>
>  /*
> diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> index a67297de8a20..69052ac47ab1 100644
> --- a/include/linux/netfs.h
> +++ b/include/linux/netfs.h
> @@ -457,6 +457,9 @@ int netfs_read_folio(struct file *, struct folio *);
>  int netfs_write_begin(struct netfs_inode *, struct file *,
>                  struct address_space *, loff_t pos, unsigned int len,
>                  struct folio **, void **fsdata);
> +int netfs_writepages_group(struct address_space *mapping,
> +                           struct writeback_control *wbc,
> +                           struct netfs_group *group);
>  int netfs_writepages(struct address_space *mapping,
>                       struct writeback_control *wbc);
>  bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
> @@ -597,4 +600,60 @@ static inline void netfs_wait_for_outstanding_io(struct inode *inode)
>          wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0);
>  }
>
> +/*
> + * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
> + */
> +static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
> +{
> +        if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)

netfs_group is a pointer. Is it correct to compare a pointer with the
NETFS_FOLIO_COPY_TO_CACHE constant?

> +                refcount_inc(&netfs_group->ref);
> +        return netfs_group;
> +}
> +
> +/*
> + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
> + */
> +static inline void netfs_put_group(struct netfs_group *netfs_group)
> +{
> +        if (netfs_group &&
> +            netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&

Ditto. The same question here.

> +            refcount_dec_and_test(&netfs_group->ref))
> +                netfs_group->free(netfs_group);
> +}
> +
> +/*
> + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
> + */
> +static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
> +{
> +        if (netfs_group &&
> +            netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&

Ditto.

Thanks,
Slava.

> +            refcount_sub_and_test(nr, &netfs_group->ref))
> +                netfs_group->free(netfs_group);
> +}
> +
> +/*
> + * Set the group pointer directly on a folio.
> + */
> +static inline void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
> +{
> +        if (netfs_group)
> +                folio_attach_private(folio, netfs_get_group(netfs_group));
> +}
> +
> +/*
> + * Set the group pointer on a folio or the folio info record.
> + */
> +static inline void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
> +{
> +        void *priv = folio_get_private(folio);
> +
> +        if (unlikely(priv != netfs_group)) {
> +                if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
> +                        folio_attach_private(folio, netfs_get_group(netfs_group));
> +                else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
> +                        folio_detach_private(folio);
> +        }
> +}
> +
>  #endif /* _LINUX_NETFS_H */
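For reference, the throttle arithmetic in the hunk questioned above:
(++flush_counter & 0xf) == 0xf is true once per 16 increments, so the write
loop sleeps after every 16th flush attempt. A named-constant form of the same
logic might look like the sketch below; the NETFS_FLUSH_* names are
hypothetical, not taken from the patch:

        #include <linux/delay.h>

        /* Hypothetical names for the magic numbers in the patch. The batch
         * size must be a power of two for the mask trick to work.
         */
        #define NETFS_FLUSH_RETRY_BATCH         16      /* Flush attempts per back-off */
        #define NETFS_FLUSH_RETRY_DELAY_MS      10      /* Back-off period */

                /* In netfs_perform_write(): after every batch of attempts to
                 * flush a folio with a conflicting group, back off briefly so
                 * that the loop cannot spin flat out.
                 */
                if ((++flush_counter & (NETFS_FLUSH_RETRY_BATCH - 1)) ==
                    (NETFS_FLUSH_RETRY_BATCH - 1))
                        msleep(NETFS_FLUSH_RETRY_DELAY_MS);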
Viacheslav Dubeyko <Slava.Dubeyko@ibm.com> wrote:

> >                  if (unlikely(group != netfs_group) &&
> > -                    group != NETFS_FOLIO_COPY_TO_CACHE)
> > +                    group != NETFS_FOLIO_COPY_TO_CACHE &&
> > +                    (group || folio_test_dirty(folio)))
>
> I am trying to follow this complex condition. Is there a possible case
> where the folio is dirty but we don't flush the content?

It's slightly complicated by fscache.

The way I have made local caching work for things that use netfslib fully is
that the writeback code copies the data to the cache.  We achieve this by
marking the pages dirty when we read them from the server.  However, so that
we don't *also* write the clean data back to the server, the writeback
group[*] field is set to a special value (NETFS_FOLIO_COPY_TO_CACHE) and we
make the assumption that the writeback group is only actually going to be
used by the filesystem if the page is actually modified - in which case the
writeback group field is overwritten.

[*] This is either folio->private or in a netfs_folio struct attached to
    folio->private.  Note that folio->private is set to be removed in the
    future.

In the event that a page is modified, it will be written back to the
server(s) and the cache, assuming there is a cache.

Also note the netfs_io_stream struct.  There are two in the netfs_io_request
struct and these are used to separately manage and divide up the writes to a
server and to the cache.  I've also left the possibility open that we can
have more than two streams in the event that we need to write the data to
multiple servers.

Further, another reason for making writeback write the data to both the
cache and the server is that if you are using content encryption, the data
is encrypted and then the ciphertext is written to both the server and the
cache.

> Is there a possible case where the folio is dirty but we don't flush the
> content?

Anyway, to answer the question more specifically, yes.  If the folio is
dirty and in the same writeback group (e.g. most recent ceph snap context),
then we can presumably keep modifying it.

And if the folio is marked dirty and is marked NETFS_FOLIO_COPY_TO_CACHE,
then we can just overwrite it, replace or clear the NETFS_FOLIO_COPY_TO_CACHE
mark and then it just becomes a regular dirty page.  It will get written to
fscache either way.

> > +                if ((++flush_counter & 0xf) == 0xf)
> > +                        msleep(10);
>
> Do we really need to use sleep? And why is it 10 ms? And even if we would
> like to use sleep, then it would be better to introduce a named constant.
> And what is the justification for 10 ms?

At the moment, debugging - and stopping it from running wild in a tight loop
when a mistake is made.  Remember: at this point, this is a WIP.

But in reality, we might see this if we're indulging in cache ping-pong
between two clients.  I'm not sure how this might be mitigated in the ceph
environment - if that's not already done.

> > -                kdebug("wrong group");
> > +                kdebug("wrong group %px != %px", fgroup, wreq->group);
>
> I believe using %px is not very good practice. Do we really need to show
> the real pointer?

At some point I need to test interference from someone cranking the snaps
and I'll probably need this then - though it might be better to make a
tracepoint for it.

> > +/*
> > + * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
> > + */
> > +static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
> > +{
> > +        if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
>
> netfs_group is a pointer. Is it correct to compare a pointer with the
> NETFS_FOLIO_COPY_TO_CACHE constant?

This constant?

        #define NETFS_FOLIO_COPY_TO_CACHE ((struct netfs_group *)0x356UL) /* Write to the cache only */

Yes.  See explanation above.

David
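To make the sentinel comparison concrete: NETFS_FOLIO_COPY_TO_CACHE is a
small fixed integer cast to the pointer type, so it can never alias a real
slab-allocated group and plain pointer equality against it is well defined.
Any helper that may be handed the sentinel must filter it out before touching
->ref. A minimal sketch follows; netfs_group_is_real() is a hypothetical
helper for illustration, not part of the patch:

        /* The sentinel, as quoted above from include/linux/netfs.h: a fixed
         * small value that no real allocation can ever occupy.
         */
        #define NETFS_FOLIO_COPY_TO_CACHE ((struct netfs_group *)0x356UL)

        /* Hypothetical predicate: true only for a real, dereferenceable
         * group; NULL and the copy-to-cache sentinel both fail it.  This is
         * the test that netfs_get_group(), netfs_put_group() and
         * netfs_put_group_many() each open-code before using ->ref.
         */
        static inline bool netfs_group_is_real(const struct netfs_group *group)
        {
                return group && group != NETFS_FOLIO_COPY_TO_CACHE;
        }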
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 0245449b93e3..12ddbe9bc78b 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -11,26 +11,9 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/pagevec.h>
+#include <linux/delay.h>
 #include "internal.h"
 
-static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
-{
-        if (netfs_group)
-                folio_attach_private(folio, netfs_get_group(netfs_group));
-}
-
-static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
-{
-        void *priv = folio_get_private(folio);
-
-        if (unlikely(priv != netfs_group)) {
-                if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
-                        folio_attach_private(folio, netfs_get_group(netfs_group));
-                else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
-                        folio_detach_private(folio);
-        }
-}
-
 /*
  * Grab a folio for writing and lock it.  Attempt to allocate as large a folio
  * as possible to hold as much of the remaining length as possible in one go.
@@ -113,6 +96,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
         };
         struct netfs_io_request *wreq = NULL;
         struct folio *folio = NULL, *writethrough = NULL;
+        unsigned int flush_counter = 0;
         unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
         ssize_t written = 0, ret, ret2;
         loff_t i_size, pos = iocb->ki_pos;
@@ -208,7 +192,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
                 group = netfs_folio_group(folio);
 
                 if (unlikely(group != netfs_group) &&
-                    group != NETFS_FOLIO_COPY_TO_CACHE)
+                    group != NETFS_FOLIO_COPY_TO_CACHE &&
+                    (group || folio_test_dirty(folio)))
                         goto flush_content;
 
                 if (folio_test_uptodate(folio)) {
@@ -341,6 +326,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
                 trace_netfs_folio(folio, netfs_flush_content);
                 folio_unlock(folio);
                 folio_put(folio);
+                if ((++flush_counter & 0xf) == 0xf)
+                        msleep(10);
                 ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
                 if (ret < 0)
                         goto error_folio_unlock;
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index eebb4f0f660e..2a6123c4da35 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -261,38 +261,6 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
 #endif
 }
 
-/*
- * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
- */
-static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
-{
-        if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
-                refcount_inc(&netfs_group->ref);
-        return netfs_group;
-}
-
-/*
- * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
- */
-static inline void netfs_put_group(struct netfs_group *netfs_group)
-{
-        if (netfs_group &&
-            netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
-            refcount_dec_and_test(&netfs_group->ref))
-                netfs_group->free(netfs_group);
-}
-
-/*
- * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
- */
-static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
-{
-        if (netfs_group &&
-            netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
-            refcount_sub_and_test(nr, &netfs_group->ref))
-                netfs_group->free(netfs_group);
-}
-
 /*
  * Check to see if a buffer aligns with the crypto block size.  If it doesn't
  * the crypto layer is going to copy all the data - in which case relying on
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 52d6fce70837..7fdbaa5c5cab 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -153,6 +153,7 @@ static void netfs_free_request(struct work_struct *work)
                 kvfree(rreq->direct_bv);
         }
 
+        netfs_put_group(rreq->group);
         rolling_buffer_clear(&rreq->buffer);
         rolling_buffer_clear(&rreq->bounce);
         if (test_bit(NETFS_RREQ_PUT_RMW_TAIL, &rreq->flags))
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 93601033ba08..3921fcf4f859 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -418,7 +418,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
                 netfs_issue_write(wreq, upload);
         } else if (fgroup != wreq->group) {
                 /* We can't write this page to the server yet. */
-                kdebug("wrong group");
+                kdebug("wrong group %px != %px", fgroup, wreq->group);
                 folio_redirty_for_writepage(wbc, folio);
                 folio_unlock(folio);
                 netfs_issue_write(wreq, upload);
@@ -593,11 +593,19 @@ static void netfs_end_issue_write(struct netfs_io_request *wreq)
                 netfs_wake_write_collector(wreq, false);
 }
 
-/*
- * Write some of the pending data back to the server
+/**
+ * netfs_writepages_group - Flush data from the pagecache for a file
+ * @mapping: The file to flush from
+ * @wbc: Details of what should be flushed
+ * @group: The write grouping to flush (or NULL)
+ *
+ * Start asynchronous write back operations to flush dirty data belonging to a
+ * particular group in a file's pagecache back to the server and to the local
+ * cache.
  */
-int netfs_writepages(struct address_space *mapping,
-                     struct writeback_control *wbc)
+int netfs_writepages_group(struct address_space *mapping,
+                           struct writeback_control *wbc,
+                           struct netfs_group *group)
 {
         struct netfs_inode *ictx = netfs_inode(mapping->host);
         struct netfs_io_request *wreq = NULL;
@@ -618,12 +626,15 @@ int netfs_writepages(struct address_space *mapping,
         if (!folio)
                 goto out;
 
-        wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
+        wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio),
+                                      NETFS_WRITEBACK);
         if (IS_ERR(wreq)) {
                 error = PTR_ERR(wreq);
                 goto couldnt_start;
         }
 
+        wreq->group = netfs_get_group(group);
+
         trace_netfs_write(wreq, netfs_write_trace_writeback);
         netfs_stat(&netfs_n_wh_writepages);
 
@@ -659,6 +670,21 @@ int netfs_writepages(struct address_space *mapping,
         _leave(" = %d", error);
         return error;
 }
+EXPORT_SYMBOL(netfs_writepages_group);
+
+/**
+ * netfs_writepages - Flush data from the pagecache for a file
+ * @mapping: The file to flush from
+ * @wbc: Details of what should be flushed
+ *
+ * Start asynchronous write back operations to flush dirty data in a file's
+ * pagecache back to the server and to the local cache.
+ */
+int netfs_writepages(struct address_space *mapping,
+                     struct writeback_control *wbc)
+{
+        return netfs_writepages_group(mapping, wbc, NULL);
+}
 EXPORT_SYMBOL(netfs_writepages);
 
 /*
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index a67297de8a20..69052ac47ab1 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -457,6 +457,9 @@ int netfs_read_folio(struct file *, struct folio *);
 int netfs_write_begin(struct netfs_inode *, struct file *,
                 struct address_space *, loff_t pos, unsigned int len,
                 struct folio **, void **fsdata);
+int netfs_writepages_group(struct address_space *mapping,
+                           struct writeback_control *wbc,
+                           struct netfs_group *group);
 int netfs_writepages(struct address_space *mapping,
                      struct writeback_control *wbc);
 bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
@@ -597,4 +600,60 @@ static inline void netfs_wait_for_outstanding_io(struct inode *inode)
         wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0);
 }
 
+/*
+ * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
+{
+        if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
+                refcount_inc(&netfs_group->ref);
+        return netfs_group;
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group(struct netfs_group *netfs_group)
+{
+        if (netfs_group &&
+            netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+            refcount_dec_and_test(&netfs_group->ref))
+                netfs_group->free(netfs_group);
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
+{
+        if (netfs_group &&
+            netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+            refcount_sub_and_test(nr, &netfs_group->ref))
+                netfs_group->free(netfs_group);
+}
+
+/*
+ * Set the group pointer directly on a folio.
+ */
+static inline void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+{
+        if (netfs_group)
+                folio_attach_private(folio, netfs_get_group(netfs_group));
+}
+
+/*
+ * Set the group pointer on a folio or the folio info record.
+ */
+static inline void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+{
+        void *priv = folio_get_private(folio);
+
+        if (unlikely(priv != netfs_group)) {
+                if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
+                        folio_attach_private(folio, netfs_get_group(netfs_group));
+                else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
+                        folio_detach_private(folio);
+        }
+}
+
 #endif /* _LINUX_NETFS_H */
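A sketch of how a netfslib filesystem's ->writepages might use the new entry
point; myfs_* and my_lookup_flush_group() are illustrative only, since ceph's
actual snap-context plumbing lands in later patches of this series:

        static int myfs_writepages(struct address_space *mapping,
                                   struct writeback_control *wbc)
        {
                /* Hypothetical lookup of the group (e.g. the oldest dirty
                 * snap context) that should be flushed first.
                 */
                struct netfs_group *group = my_lookup_flush_group(mapping);
                int ret;

                if (!group)
                        return netfs_writepages(mapping, wbc); /* Ungrouped flush */

                /* netfs_writepages_group() takes its own ref on @group and
                 * netfs_free_request() drops it, so the lookup ref can be
                 * put as soon as the call returns.
                 */
                ret = netfs_writepages_group(mapping, wbc, group);
                netfs_put_group(group);
                return ret;
        }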
Make some adjustments to the handling of netfs groups so that ceph can use
them for snap contexts:

 - Move netfs_get_group(), netfs_put_group() and netfs_put_group_many() to
   linux/netfs.h so that ceph can build its snap context on netfs groups.

 - Move netfs_set_group() and __netfs_set_group() to linux/netfs.h so that
   ceph_dirty_folio() can call them from inside of the locked section in
   which it finds the snap context to attach.

 - Provide a netfs_writepages_group() that takes a group as a parameter and
   attaches it to the request and make netfs_free_request() drop the ref on
   it.  netfs_writepages() then becomes a wrapper that passes in a NULL
   group.

 - In netfs_perform_write(), only consider a folio to have a conflicting
   group if the folio's group pointer isn't NULL and if the folio is dirty.

 - In netfs_perform_write(), interject a small 10ms sleep after every 16
   attempts to flush a folio within a single call.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: Viacheslav Dubeyko <slava@dubeyko.com>
cc: Alex Markuze <amarkuze@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: ceph-devel@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
---
 fs/netfs/buffered_write.c | 25 ++++-------------
 fs/netfs/internal.h       | 32 ---------------------
 fs/netfs/objects.c        |  1 +
 fs/netfs/write_issue.c    | 38 +++++++++++++++++++++----
 include/linux/netfs.h     | 59 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 98 insertions(+), 57 deletions(-)
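The second bullet is why netfs_set_group() had to move into the header: a
filesystem's dirty_folio handler needs to attach the group while still
holding the lock under which it found that group. A sketch of the intended
caller pattern follows; the locking, MYFS_I() and my_find_snap_group() are
illustrative assumptions, not part of this patch:

        static bool myfs_dirty_folio(struct address_space *mapping,
                                     struct folio *folio)
        {
                struct myfs_inode *mi = MYFS_I(mapping->host);
                struct netfs_group *group;

                spin_lock(&mi->lock);
                group = my_find_snap_group(mi); /* Hypothetical; returns a ref */
                netfs_set_group(folio, group);  /* Takes its own ref if it attaches */
                spin_unlock(&mi->lock);

                netfs_put_group(group);         /* Drop the lookup ref */
                return netfs_dirty_folio(mapping, folio);
        }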