diff mbox series

[v9,3/5] NFS: Convert buffered read paths to use netfs when fscache is enabled

Message ID 20221017105212.77588-4-dwysocha@redhat.com (mailing list archive)
State New, archived
Headers show
Series Convert NFS with fscache to the netfs API | expand

Commit Message

David Wysochanski Oct. 17, 2022, 10:52 a.m. UTC
Convert the NFS buffered read code paths to corresponding netfs APIs,
but only when fscache is configured and enabled.

The netfs API defines struct netfs_request_ops which must be filled
in by the network filesystem.  For NFS, we only need to define 5 of
the functions, the main one being the issue_read() function.
The issue_read() function is called by the netfs layer when a read
cannot be fulfilled locally, and must be sent to the server (either
the cache is not active, or it is active but the data is not available).
Once the read from the server is complete, netfs requires a call to
netfs_subreq_terminated() which conveys either how many bytes were read
successfully, or an error.  Note that issue_read() is called with a
structure, netfs_io_subrequest, which defines the IO requested, and
contains a start and a length (both in bytes), and assumes the underlying
netfs will return either an error on the whole region, or the number
of bytes successfully read.

The NFS IO path is page based and the main APIs are the pgio APIs defined
in pagelist.c.  For the pgio APIs, there is no way for the caller to
know how many RPCs will be sent and how the pages will be broken up
into underlying RPCs, each of which will have their own completion and
return code.  In contrast, netfs is subrequest based, a single
subrequest may contain multiple pages, and a single subrequest is
initiated with issue_read() and terminated with netfs_subreq_terminated().
Thus, to utilize the netfs APIs, NFS needs some way to accommodate
the netfs API requirement on the single response to the whole
subrequest, while also minimizing disruptive changes to the NFS
pgio layer.

The approach taken with this patch is to allocate a small structure
for each nfs_netfs_issue_read() call, store the final error and number
of bytes successfully transferred in the structure, and update these values
as each RPC completes.  The refcount on the structure is used as a marker
for the last RPC completion, is incremented in nfs_netfs_initiate_read(),
and decremented inside nfs_netfs_read_completion(), when a nfs_pgio_header
contains a valid pointer to the data.  On the final put (which signals
the final outstanding RPC is complete) in nfs_netfs_read_completion(),
call netfs_subreq_terminated() with either the final error value (if
one or more READs complete with an error) or the number of bytes
successfully transferred (if all RPCs complete successfully).  Note
that when all RPCs complete successfully, the number of bytes transferred
is capped to the length of the subrequest.  Capping the transferred length
to the subrequest length prevents "Subreq overread" warnings from netfs.
This is due to the "aligned_len" in nfs_pageio_add_page(), and the
corner case where NFS requests a full page at the end of the file,
even when i_size reflects only a partial page (NFS overread).

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfs/fscache.c         | 232 +++++++++++++++++++++++----------------
 fs/nfs/fscache.h         | 102 +++++++++++------
 fs/nfs/inode.c           |   2 +
 fs/nfs/internal.h        |   9 ++
 fs/nfs/pagelist.c        |  12 ++
 fs/nfs/read.c            |  50 ++++-----
 include/linux/nfs_page.h |   3 +
 include/linux/nfs_xdr.h  |   3 +
 8 files changed, 261 insertions(+), 152 deletions(-)

Comments

Trond Myklebust Oct. 27, 2022, 7:16 p.m. UTC | #1
On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> Convert the NFS buffered read code paths to corresponding netfs APIs,
> but only when fscache is configured and enabled.
> 
> The netfs API defines struct netfs_request_ops which must be filled
> in by the network filesystem.  For NFS, we only need to define 5 of
> the functions, the main one being the issue_read() function.
> The issue_read() function is called by the netfs layer when a read
> cannot be fulfilled locally, and must be sent to the server (either
> the cache is not active, or it is active but the data is not
> available).
> Once the read from the server is complete, netfs requires a call to
> netfs_subreq_terminated() which conveys either how many bytes were
> read
> successfully, or an error.  Note that issue_read() is called with a
> structure, netfs_io_subrequest, which defines the IO requested, and
> contains a start and a length (both in bytes), and assumes the
> underlying
> netfs will return a either an error on the whole region, or the
> number
> of bytes successfully read.
> 
> The NFS IO path is page based and the main APIs are the pgio APIs
> defined
> in pagelist.c.  For the pgio APIs, there is no way for the caller to
> know how many RPCs will be sent and how the pages will be broken up
> into underlying RPCs, each of which will have their own completion
> and
> return code.  In contrast, netfs is subrequest based, a single
> subrequest may contain multiple pages, and a single subrequest is
> initiated with issue_read() and terminated with
> netfs_subreq_terminated().
> Thus, to utilze the netfs APIs, NFS needs some way to accommodate
> the netfs API requirement on the single response to the whole
> subrequest, while also minimizing disruptive changes to the NFS
> pgio layer.
> 
> The approach taken with this patch is to allocate a small structure
> for each nfs_netfs_issue_read() call, store the final error and
> number
> of bytes successfully transferred in the structure, and update these
> values
> as each RPC completes.  The refcount on the structure is used as a
> marker
> for the last RPC completion, is incremented in
> nfs_netfs_read_initiate(),
> and decremented inside nfs_netfs_read_completion(), when a
> nfs_pgio_header
> contains a valid pointer to the data.  On the final put (which
> signals
> the final outstanding RPC is complete) in
> nfs_netfs_read_completion(),
> call netfs_subreq_terminated() with either the final error value (if
> one or more READs complete with an error) or the number of bytes
> successfully transferred (if all RPCs complete successfully).  Note
> that when all RPCs complete successfully, the number of bytes
> transferred
> is capped to the length of the subrequest.  Capping the transferred
> length
> to the subrequest length prevents "Subreq overread" warnings from
> netfs.
> This is due to the "aligned_len" in nfs_pageio_add_page(), and the
> corner case where NFS requests a full page at the end of the file,
> even when i_size reflects only a partial page (NFS overread).
> 
> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> Reviewed-by: Jeff Layton <jlayton@kernel.org>


This is not doing what I asked for, which was to separate out the
fscache functionality, so that we can call that if and when it is
available.

Instead, it is just wrapping the NFS requests inside netfs requests. As
it stands, that means it is just duplicating information, and adding
unnecessary overhead to the standard I/O path (extra allocations, extra
indirect calls, and extra bloat to the inode).

My expectation is that the standard I/O path should have minimal
overhead, and should certainly not increase the overhead that we
already have. Will this be addressed in future iterations of these
patches?
David Wysochanski Oct. 28, 2022, 11:50 a.m. UTC | #2
On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org> wrote:
>
> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > Convert the NFS buffered read code paths to corresponding netfs APIs,
> > but only when fscache is configured and enabled.
> >
> > The netfs API defines struct netfs_request_ops which must be filled
> > in by the network filesystem.  For NFS, we only need to define 5 of
> > the functions, the main one being the issue_read() function.
> > The issue_read() function is called by the netfs layer when a read
> > cannot be fulfilled locally, and must be sent to the server (either
> > the cache is not active, or it is active but the data is not
> > available).
> > Once the read from the server is complete, netfs requires a call to
> > netfs_subreq_terminated() which conveys either how many bytes were
> > read
> > successfully, or an error.  Note that issue_read() is called with a
> > structure, netfs_io_subrequest, which defines the IO requested, and
> > contains a start and a length (both in bytes), and assumes the
> > underlying
> > netfs will return a either an error on the whole region, or the
> > number
> > of bytes successfully read.
> >
> > The NFS IO path is page based and the main APIs are the pgio APIs
> > defined
> > in pagelist.c.  For the pgio APIs, there is no way for the caller to
> > know how many RPCs will be sent and how the pages will be broken up
> > into underlying RPCs, each of which will have their own completion
> > and
> > return code.  In contrast, netfs is subrequest based, a single
> > subrequest may contain multiple pages, and a single subrequest is
> > initiated with issue_read() and terminated with
> > netfs_subreq_terminated().
> > Thus, to utilze the netfs APIs, NFS needs some way to accommodate
> > the netfs API requirement on the single response to the whole
> > subrequest, while also minimizing disruptive changes to the NFS
> > pgio layer.
> >
> > The approach taken with this patch is to allocate a small structure
> > for each nfs_netfs_issue_read() call, store the final error and
> > number
> > of bytes successfully transferred in the structure, and update these
> > values
> > as each RPC completes.  The refcount on the structure is used as a
> > marker
> > for the last RPC completion, is incremented in
> > nfs_netfs_read_initiate(),
> > and decremented inside nfs_netfs_read_completion(), when a
> > nfs_pgio_header
> > contains a valid pointer to the data.  On the final put (which
> > signals
> > the final outstanding RPC is complete) in
> > nfs_netfs_read_completion(),
> > call netfs_subreq_terminated() with either the final error value (if
> > one or more READs complete with an error) or the number of bytes
> > successfully transferred (if all RPCs complete successfully).  Note
> > that when all RPCs complete successfully, the number of bytes
> > transferred
> > is capped to the length of the subrequest.  Capping the transferred
> > length
> > to the subrequest length prevents "Subreq overread" warnings from
> > netfs.
> > This is due to the "aligned_len" in nfs_pageio_add_page(), and the
> > corner case where NFS requests a full page at the end of the file,
> > even when i_size reflects only a partial page (NFS overread).
> >
> > Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > Reviewed-by: Jeff Layton <jlayton@kernel.org>
>
>
> This is not doing what I asked for, which was to separate out the
> fscache functionality, so that we can call that if and when it is
> available.
>
I must have misunderstood then.

The last feedback I have from you was that you wanted it to be
an opt-in feature, and it was a comment on a previous patch
to Kconfig.  I was proceeding the best I knew how, but
let me try to get back on track.

> Instead, it is just wrapping the NFS requests inside netfs requests. As
> it stands, that means it is just duplicating information, and adding
> unnecessary overhead to the standard I/O path (extra allocations, extra
> indirect calls, and extra bloat to the inode).
>
I think I understand what you're saying but I'm not sure.  Let me
ask some clarifying questions.

Are you objecting to the code when CONFIG_NFS_FSCACHE is
configured?  Or when it is not?  Or both?  I think you're objecting
when it's configured, but not enabled (we mount without 'fsc').
Am I right?

Also, are you objecting to the design that to use fscache we now
have to use netfs, specifically:
- call into netfs via either netfs_read_folio or netfs_readahead
- if fscache is enabled, then the IO can be satisfied from fscache
- if fscache is not enabled, or some of the IO cannot be satisfied
from the cache, then NFS is called back via netfs_issue_read
and we use the normal NFS read pageio interface.  This requires
we call netfs_subreq_terminated() when all the RPCs complete,
which is the reason for the small changes to pagelist.c

Can you be more specific as to the portions of the patch you don't like
so I can move it in the right direction?

This is from patch #2 which you didn't comment on.  I'm not sure you're
ok with it though, since you mention "extra bloat to the inode".
Do you object to this even though it's wrapped in an
#ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
extra size be added to nfs_inode?

@@ -204,9 +208,11 @@ struct nfs_inode {
        __u64 write_io;
        __u64 read_io;
 #ifdef CONFIG_NFS_FSCACHE
-       struct fscache_cookie   *fscache;
-#endif
+       struct netfs_inode      netfs; /* netfs context and VFS inode */
+#else
        struct inode            vfs_inode;
+#endif
+


Are you ok with the stub functions which are placed in fscache.h, and
when CONFIG_NFS_FSCACHE is not set, become either a no-op
or a 1-liner (nfs_netfs_readpage_release)?

 #else /* CONFIG_NFS_FSCACHE */
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
+static inline void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_readpage_release(struct nfs_page *req)
+{
+       unlock_page(req->wb_page);
+}
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
 static inline void nfs_fscache_init_inode(struct inode *inode) {}


Do you object to the below?  If so, then do you want
#ifdef CONFIG_NFS_FSCACHE here?

--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
 #ifdef CONFIG_NFS_V4_2
        nfsi->xattr_cache = NULL;
 #endif
+       nfs_netfs_inode_init(nfsi);
+
        return VFS_I(nfsi);
 }
 EXPORT_SYMBOL_GPL(nfs_alloc_inode);


Do you object to the changes in fs/nfs/read.c?  Specifically,
how about the below calls to netfs from nfs_read_folio and
nfs_readahead into equivalent netfs calls?  So when
CONFIG_NFS_FSCACHE is set, but fscache is not enabled
('fsc' not on mount), these netfs functions do immediately call
netfs_alloc_request().  But I wonder if we could simply add a
check to see if fscache is enabled on the mount, and skip
over to satisfy what you want.  Am I understanding what you
want?

@@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct folio *folio)
        if (NFS_STALE(inode))
                goto out_unlock;

+       ret = nfs_netfs_read_folio(file, folio);
+       if (!ret)
+               goto out;
+

@@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control *ractl)
        if (NFS_STALE(inode))
                goto out;

+       ret = nfs_netfs_readahead(ractl);
+       if (!ret)
+               goto out;
+


And how about these calls from different points in the read
path to the earlier mentioned stub functions?

@@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);

 static void nfs_readpage_release(struct nfs_page *req, int error)
 {
-       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
        struct page *page = req->wb_page;

-       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
-               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
-               (long long)req_offset(req));
-
        if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
                SetPageError(page);
-       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
-               if (PageUptodate(page))
-                       nfs_fscache_write_page(inode, page);
-               unlock_page(page);
-       }
+       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
+               nfs_netfs_readpage_release(req);
+
        nfs_release_request(req);
 }

@@ -177,6 +170,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
                nfs_list_remove_request(req);
                nfs_readpage_release(req, error);
        }
+       nfs_netfs_read_completion(hdr);
+
 out:
        hdr->release(hdr);
 }
@@ -187,6 +182,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
                              struct rpc_task_setup *task_setup_data, int how)
 {
        rpc_ops->read_setup(hdr, msg);
+       nfs_netfs_initiate_read(hdr);
        trace_nfs_initiate_read(hdr);
 }


Are you ok with these additions?  Something like this would
be required in the case of fscache configured and enabled,
because we could have some of the data in a read in
fscache, and some not.  That is the reason for the netfs
design, and why we need to be able to call the normal
NFS read IO path (netfs calls into issue_read, and we call
back via netfs_subreq_terminated)?

@@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
        struct pnfs_layout_segment *pg_lseg;
        struct nfs_io_completion *pg_io_completion;
        struct nfs_direct_req   *pg_dreq;
+#ifdef CONFIG_NFS_FSCACHE
+       void                    *pg_netfs;
+#endif

@@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
        const struct nfs_rw_ops *rw_ops;
        struct nfs_io_completion *io_completion;
        struct nfs_direct_req   *dreq;
+#ifdef CONFIG_NFS_FSCACHE
+       void                    *netfs;
+#endif


And these additions to pagelist.c?

@@ -68,6 +69,10 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
        hdr->good_bytes = mirror->pg_count;
        hdr->io_completion = desc->pg_io_completion;
        hdr->dreq = desc->pg_dreq;
+#ifdef CONFIG_NFS_FSCACHE
+       if (desc->pg_netfs)
+               hdr->netfs = desc->pg_netfs;
+#endif


@@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
        desc->pg_lseg = NULL;
        desc->pg_io_completion = NULL;
        desc->pg_dreq = NULL;
+#ifdef CONFIG_NFS_FSCACHE
+       desc->pg_netfs = NULL;
+#endif


@@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,

        desc->pg_io_completion = hdr->io_completion;
        desc->pg_dreq = hdr->dreq;
+#ifdef CONFIG_NFS_FSCACHE
+       desc->pg_netfs = hdr->netfs;
+#endif


> My expectation is that the standard I/O path should have minimal
> overhead, and should certainly not increase the overhead that we
> already have. Will this be addressed in future iterations of these
> patches?
>

I will do what I can to satisfy what you want, either by fixing up
this patch or follow-on patches.  Hopefully the above questions
will clarify the next steps.
Trond Myklebust Oct. 28, 2022, 4:59 p.m. UTC | #3
On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> wrote:
> > 
> > On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > Convert the NFS buffered read code paths to corresponding netfs
> > > APIs,
> > > but only when fscache is configured and enabled.
> > > 
> > > The netfs API defines struct netfs_request_ops which must be
> > > filled
> > > in by the network filesystem.  For NFS, we only need to define 5
> > > of
> > > the functions, the main one being the issue_read() function.
> > > The issue_read() function is called by the netfs layer when a
> > > read
> > > cannot be fulfilled locally, and must be sent to the server
> > > (either
> > > the cache is not active, or it is active but the data is not
> > > available).
> > > Once the read from the server is complete, netfs requires a call
> > > to
> > > netfs_subreq_terminated() which conveys either how many bytes
> > > were
> > > read
> > > successfully, or an error.  Note that issue_read() is called with
> > > a
> > > structure, netfs_io_subrequest, which defines the IO requested,
> > > and
> > > contains a start and a length (both in bytes), and assumes the
> > > underlying
> > > netfs will return a either an error on the whole region, or the
> > > number
> > > of bytes successfully read.
> > > 
> > > The NFS IO path is page based and the main APIs are the pgio APIs
> > > defined
> > > in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > to
> > > know how many RPCs will be sent and how the pages will be broken
> > > up
> > > into underlying RPCs, each of which will have their own
> > > completion
> > > and
> > > return code.  In contrast, netfs is subrequest based, a single
> > > subrequest may contain multiple pages, and a single subrequest is
> > > initiated with issue_read() and terminated with
> > > netfs_subreq_terminated().
> > > Thus, to utilze the netfs APIs, NFS needs some way to accommodate
> > > the netfs API requirement on the single response to the whole
> > > subrequest, while also minimizing disruptive changes to the NFS
> > > pgio layer.
> > > 
> > > The approach taken with this patch is to allocate a small
> > > structure
> > > for each nfs_netfs_issue_read() call, store the final error and
> > > number
> > > of bytes successfully transferred in the structure, and update
> > > these
> > > values
> > > as each RPC completes.  The refcount on the structure is used as
> > > a
> > > marker
> > > for the last RPC completion, is incremented in
> > > nfs_netfs_read_initiate(),
> > > and decremented inside nfs_netfs_read_completion(), when a
> > > nfs_pgio_header
> > > contains a valid pointer to the data.  On the final put (which
> > > signals
> > > the final outstanding RPC is complete) in
> > > nfs_netfs_read_completion(),
> > > call netfs_subreq_terminated() with either the final error value
> > > (if
> > > one or more READs complete with an error) or the number of bytes
> > > successfully transferred (if all RPCs complete successfully). 
> > > Note
> > > that when all RPCs complete successfully, the number of bytes
> > > transferred
> > > is capped to the length of the subrequest.  Capping the
> > > transferred
> > > length
> > > to the subrequest length prevents "Subreq overread" warnings from
> > > netfs.
> > > This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > the
> > > corner case where NFS requests a full page at the end of the
> > > file,
> > > even when i_size reflects only a partial page (NFS overread).
> > > 
> > > Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > 
> > 
> > This is not doing what I asked for, which was to separate out the
> > fscache functionality, so that we can call that if and when it is
> > available.
> > 
> I must have misunderstood then.
> 
> The last feedback I have from you was that you wanted it to be
> an opt-in feature, and it was a comment on a previous patch
> to Kconfig.  I was proceeding the best I knew how, but
> let me try to get back on track.
> 
> > Instead, it is just wrapping the NFS requests inside netfs
> > requests. As
> > it stands, that means it is just duplicating information, and
> > adding
> > unnecessary overhead to the standard I/O path (extra allocations,
> > extra
> > indirect calls, and extra bloat to the inode).
> > 
> I think I understand what you're saying but I'm not sure.  Let me
> ask some clarifying questions.
> 
> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> configured?  Or when it is not?  Or both?  I think you're objecting
> when it's configured, but not enabled (we mount without 'fsc').
> Am I right?
> 
> Also, are you objecting to the design that to use fcache we now
> have to use netfs, specifically:
> - call into netfs via either netfs_read_folio or netfs_readahead
> - if fscache is enabled, then the IO can be satisfied from fscache
> - if fscache is not enabled, or some of the IO cannot be satisfied
> from the cache, then NFS is called back via netfs_issue_read
> and we use the normal NFS read pageio interface.  This requires
> we call netfs_subreq_terminated() when all the RPCs complete,
> which is the reason for the small changes to pagelist.c

I'm objecting to any middle layer "solution" that adds overhead to the
NFS I/O paths.

I'm willing to consider solutions that are specific only to the fscache
use case (i.e. when the 'fsc' mount option is specified). However when
I perform a normal NFS mount, and do I/O, then I don't want to see
extra memory allocations, extra indirect calls and larger inode
footprints.

IOW: I want the code to optimise for the case of standard NFS, not for
the case of 'NFS with cachefs additions'.

> 
> Can you be more specific as to the portions of the patch you don't
> like
> so I can move it in the right direction?
> 
> This is from patch #2 which you didn't comment on.  I'm not sure
> you're
> ok with it though, since you mention "extra bloat to the inode".
> Do you object to this even though it's wrapped in an
> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> extra size be added to nfs_inode?
> 
> @@ -204,9 +208,11 @@ struct nfs_inode {
>         __u64 write_io;
>         __u64 read_io;
>  #ifdef CONFIG_NFS_FSCACHE
> -       struct fscache_cookie   *fscache;
> -#endif
> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> */
> +#else
>         struct inode            vfs_inode;
> +#endif
> +

Ideally, I'd prefer no extra size. I can live with it up to a certain
point, however for now NFS is not unconditionally opting into the netfs
project. If we're to ever do that, then I want to see streamlined code
for the standard I/O case.

> 
> 
> Are you ok with the stub functions which are placed in fscache.h, and
> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> or a 1-liner (nfs_netfs_readpage_release)?
> 
>  #else /* CONFIG_NFS_FSCACHE */
> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> *hdr) {}
> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> *hdr) {}
> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> +{
> +       unlock_page(req->wb_page);
> +}
>  static inline void nfs_fscache_release_super_cookie(struct
> super_block *sb) {}
>  static inline void nfs_fscache_init_inode(struct inode *inode) {}
> 
> 
> Do you object to the below?  If so, then do you want
> #ifdef CONFIG_NFS_FSCACHE here?
> 
> -- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> super_block *sb)
>  #ifdef CONFIG_NFS_V4_2
>         nfsi->xattr_cache = NULL;
>  #endif
> +       nfs_netfs_inode_init(nfsi);
> +
>         return VFS_I(nfsi);
>  }
>  EXPORT_SYMBOL_GPL(nfs_alloc_i
> node);
> 
> 
> Do you object to the changes in fs/nfs/read.c?  Specifically,
> how about the below calls to netfs from nfs_read_folio and
> nfs_readahead into equivalent netfs calls?  So when
> NFS_CONFIG_FSCACHE is set, but fscache is not enabled
> ('fsc' not on mount), these netfs functions do immediately call
> netfs_alloc_request().  But I wonder if we could simply add a
> check to see if fscache is enabled on the mount, and skip
> over to satisfy what you want.  Am I understanding what you
> want?

Quite frankly, I'd prefer that we just split out the functionality that
is needed from the netfs code so that it can be optimised. However I'm
not interested enough in the cachefs functionality to work on that
myself. ...and as I indicated above, I might be OK with opting into the
netfs project, once the overhead can be made to disappear.

> 
> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> folio *folio)
>         if (NFS_STALE(inode))
>                 goto out_unlock;
> 
> +       ret = nfs_netfs_read_folio(file, folio);
> +       if (!ret)
> +               goto out;
> +
> 
> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> *ractl)
>         if (NFS_STALE(inode))
>                 goto out;
> 
> +       ret = nfs_netfs_readahead(ractl);
> +       if (!ret)
> +               goto out;
> +
> 
> 
> And how about these calls from different points in the read
> path to the earlier mentioned stub functions?
> 
> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> 
>  static void nfs_readpage_release(struct nfs_page *req, int error)
>  {
> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
>         struct page *page = req->wb_page;
> 
> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> >s_id,
> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> -               (long long)req_offset(req));
> -
>         if (nfs_error_is_fatal_on_server(error) && error != -
> ETIMEDOUT)
>                 SetPageError(page);
> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> -               if (PageUptodate(page))
> -                       nfs_fscache_write_page(inode, page);
> -               unlock_page(page);
> -       }
> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> +               nfs_netfs_readpage_release(req);
> +

I'm not seeing the value of wrapping unlock_page(), no... That code is
going to need to change when we move it to use folios natively anyway.

>         nfs_release_request(req);
>  }
> 
> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> nfs_pgio_header *hdr)
>                 nfs_list_remove_request(req);
>                 nfs_readpage_release(req, error);
>         }
> +       nfs_netfs_read_completion(hdr);
> +
>  out:
>         hdr->release(hdr);
>  }
> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> nfs_pgio_header *hdr,
>                               struct rpc_task_setup *task_setup_data,
> int how)
>  {
>         rpc_ops->read_setup(hdr, msg);
> +       nfs_netfs_initiate_read(hdr);
>         trace_nfs_initiate_read(hdr);
>  }
> 
> 
> Are you ok with these additions?  Something like this would
> be required in the case of fscache configured and enabled,
> because we could have some of the data in a read in
> fscache, and some not.  That is the reason for the netfs
> design, and why we need to be able to call the normal
> NFS read IO path (netfs calls into issue_read, and we call
> back via netfs_subreq_terminated)?
> 
> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
>         struct pnfs_layout_segment *pg_lseg;
>         struct nfs_io_completion *pg_io_completion;
>         struct nfs_direct_req   *pg_dreq;
> +#ifdef CONFIG_NFS_FSCACHE
> +       void                    *pg_netfs;
> +#endif
> 
> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
>         const struct nfs_rw_ops *rw_ops;
>         struct nfs_io_completion *io_completion;
>         struct nfs_direct_req   *dreq;
> +#ifdef CONFIG_NFS_FSCACHE
> +       void                    *netfs;
> +#endif
> 
> 
> And these additions to pagelist.c?
> 
> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> nfs_pageio_descriptor *desc,
>         hdr->good_bytes = mirror->pg_count;
>         hdr->io_completion = desc->pg_io_completion;
>         hdr->dreq = desc->pg_dreq;
> +#ifdef CONFIG_NFS_FSCACHE
> +       if (desc->pg_netfs)
> +               hdr->netfs = desc->pg_netfs;
> +#endif

Why the conditional?

> 
> 
> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> *desc,
>         desc->pg_lseg = NULL;
>         desc->pg_io_completion = NULL;
>         desc->pg_dreq = NULL;
> +#ifdef CONFIG_NFS_FSCACHE
> +       desc->pg_netfs = NULL;
> +#endif
> 
> 
> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> nfs_pageio_descriptor *desc,
> 
>         desc->pg_io_completion = hdr->io_completion;
>         desc->pg_dreq = hdr->dreq;
> +#ifdef CONFIG_NFS_FSCACHE
> +       desc->pg_netfs = hdr->netfs;
> +#endif

Those all need wrapper functions instead of embedding #ifdefs.

> 
> 
> > My expectation is that the standard I/O path should have minimal
> > overhead, and should certainly not increase the overhead that we
> > already have. Will this be addressed in future iterations of these
> > patches?
> > 
> 
> I will do what I can to satisfy what you want, either by fixing up
> this patch or follow-on patches.  Hopefully the above questions
> will clarify the next steps.
>
David Wysochanski Oct. 29, 2022, 4:46 p.m. UTC | #4
On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
>
> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > wrote:
> > >
> > > On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > Convert the NFS buffered read code paths to corresponding netfs
> > > > APIs,
> > > > but only when fscache is configured and enabled.
> > > >
> > > > The netfs API defines struct netfs_request_ops which must be
> > > > filled
> > > > in by the network filesystem.  For NFS, we only need to define 5
> > > > of
> > > > the functions, the main one being the issue_read() function.
> > > > The issue_read() function is called by the netfs layer when a
> > > > read
> > > > cannot be fulfilled locally, and must be sent to the server
> > > > (either
> > > > the cache is not active, or it is active but the data is not
> > > > available).
> > > > Once the read from the server is complete, netfs requires a call
> > > > to
> > > > netfs_subreq_terminated() which conveys either how many bytes
> > > > were
> > > > read
> > > > successfully, or an error.  Note that issue_read() is called with
> > > > a
> > > > structure, netfs_io_subrequest, which defines the IO requested,
> > > > and
> > > > contains a start and a length (both in bytes), and assumes the
> > > > underlying
> > > > netfs will return a either an error on the whole region, or the
> > > > number
> > > > of bytes successfully read.
> > > >
> > > > The NFS IO path is page based and the main APIs are the pgio APIs
> > > > defined
> > > > in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > to
> > > > know how many RPCs will be sent and how the pages will be broken
> > > > up
> > > > into underlying RPCs, each of which will have their own
> > > > completion
> > > > and
> > > > return code.  In contrast, netfs is subrequest based, a single
> > > > subrequest may contain multiple pages, and a single subrequest is
> > > > initiated with issue_read() and terminated with
> > > > netfs_subreq_terminated().
> > > > Thus, to utilze the netfs APIs, NFS needs some way to accommodate
> > > > the netfs API requirement on the single response to the whole
> > > > subrequest, while also minimizing disruptive changes to the NFS
> > > > pgio layer.
> > > >
> > > > The approach taken with this patch is to allocate a small
> > > > structure
> > > > for each nfs_netfs_issue_read() call, store the final error and
> > > > number
> > > > of bytes successfully transferred in the structure, and update
> > > > these
> > > > values
> > > > as each RPC completes.  The refcount on the structure is used as
> > > > a
> > > > marker
> > > > for the last RPC completion, is incremented in
> > > > nfs_netfs_read_initiate(),
> > > > and decremented inside nfs_netfs_read_completion(), when a
> > > > nfs_pgio_header
> > > > contains a valid pointer to the data.  On the final put (which
> > > > signals
> > > > the final outstanding RPC is complete) in
> > > > nfs_netfs_read_completion(),
> > > > call netfs_subreq_terminated() with either the final error value
> > > > (if
> > > > one or more READs complete with an error) or the number of bytes
> > > > successfully transferred (if all RPCs complete successfully).
> > > > Note
> > > > that when all RPCs complete successfully, the number of bytes
> > > > transferred
> > > > is capped to the length of the subrequest.  Capping the
> > > > transferred
> > > > length
> > > > to the subrequest length prevents "Subreq overread" warnings from
> > > > netfs.
> > > > This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > the
> > > > corner case where NFS requests a full page at the end of the
> > > > file,
> > > > even when i_size reflects only a partial page (NFS overread).
> > > >
> > > > Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > >
> > >
> > > This is not doing what I asked for, which was to separate out the
> > > fscache functionality, so that we can call that if and when it is
> > > available.
> > >
> > I must have misunderstood then.
> >
> > The last feedback I have from you was that you wanted it to be
> > an opt-in feature, and it was a comment on a previous patch
> > to Kconfig.  I was proceeding the best I knew how, but
> > let me try to get back on track.
> >
> > > Instead, it is just wrapping the NFS requests inside netfs
> > > requests. As
> > > it stands, that means it is just duplicating information, and
> > > adding
> > > unnecessary overhead to the standard I/O path (extra allocations,
> > > extra
> > > indirect calls, and extra bloat to the inode).
> > >
> > I think I understand what you're saying but I'm not sure.  Let me
> > ask some clarifying questions.
> >
> > Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > configured?  Or when it is not?  Or both?  I think you're objecting
> > when it's configured, but not enabled (we mount without 'fsc').
> > Am I right?
> >
> > Also, are you objecting to the design that to use fcache we now
> > have to use netfs, specifically:
> > - call into netfs via either netfs_read_folio or netfs_readahead
> > - if fscache is enabled, then the IO can be satisfied from fscache
> > - if fscache is not enabled, or some of the IO cannot be satisfied
> > from the cache, then NFS is called back via netfs_issue_read
> > and we use the normal NFS read pageio interface.  This requires
> > we call netfs_subreq_terminated() when all the RPCs complete,
> > which is the reason for the small changes to pagelist.c
>
> I'm objecting to any middle layer "solution" that adds overhead to the
> NFS I/O paths.
>
Got it.

> I'm willing to consider solutions that are specific only to the fscache
> use case (i.e. when the 'fsc' mount option is specified). However when
> I perform a normal NFS mount, and do I/O, then I don't want to see
> extra memory allocations, extra indirect calls and larger inode
> footprints.
>
> IOW: I want the code to optimise for the case of standard NFS, not for
> the case of 'NFS with cachefs additions'.
>
I agree completely.  Are you seeing extra memory allocations
happen on mounts without 'fsc', or is it more a concern about how
some of the patches look?  We should not be calling any netfs or
fscache code if 'fsc' is not on the mount and I don't see any in my
testing. So either there's a misunderstanding here, or there's a
bug I'm missing.

If fscache is not configured, then nfs_netfs_read_folio() and
nfs_netfs_readahead() are wrappers that return -ENOBUFS.
If it's configured but not enabled, then the checks for
netfs_inode(inode)->cache should skip over any netfs code.
But maybe there's a non-obvious bug you're seeing and
somehow netfs is still getting called?  Because I cannot
see netfs getting called if 'fsc' is not on the mount in my
tests.

int nfs_netfs_read_folio(struct file *file, struct folio *folio)
{
        if (!netfs_inode(folio_inode(folio))->cache)
                return -ENOBUFS;

        return netfs_read_folio(file, folio);
}

int nfs_netfs_readahead(struct readahead_control *ractl)
{
        struct inode *inode = ractl->mapping->host;

        if (!netfs_inode(inode)->cache)
                return -ENOBUFS;

        netfs_readahead(ractl);
        return 0;
}


> >
> > Can you be more specific as to the portions of the patch you don't
> > like
> > so I can move it in the right direction?
> >
> > This is from patch #2 which you didn't comment on.  I'm not sure
> > you're
> > ok with it though, since you mention "extra bloat to the inode".
> > Do you object to this even though it's wrapped in an
> > #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > extra size be added to nfs_inode?
> >
> > @@ -204,9 +208,11 @@ struct nfs_inode {
> >         __u64 write_io;
> >         __u64 read_io;
> >  #ifdef CONFIG_NFS_FSCACHE
> > -       struct fscache_cookie   *fscache;
> > -#endif
> > +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > */
> > +#else
> >         struct inode            vfs_inode;
> > +#endif
> > +
>
> Ideally, I'd prefer no extra size. I can live with it up to a certain
> point, however for now NFS is not unconditionally opting into the netfs
> project. If we're to ever do that, then I want to see streamlined code
> for the standard I/O case.
>
Ok and understood about standard I/O case.

I was thinking how we might not increase the size, but I don't think
I can make it work.

I thought we could change to something like the below, without an
embedded struct inode:

@@ -204,9 +208,11 @@ struct nfs_inode {
        __u64 write_io;
        __u64 read_io;
 #ifdef CONFIG_NFS_FSCACHE
-       struct fscache_cookie   *fscache;
-#endif
+       struct netfs_inode      *netfs; /* netfs context and VFS inode */
+#else
        struct inode            vfs_inode;
+#endif
+

Then I would need to alloc/free a netfs_inode at the time of
nfs_inode initialization.  Unfortunately this has the issue that the
NFS_I() helper cannot work, because it requires an embedded
"struct inode" due to its use of "container_of":

+#ifdef CONFIG_NFS_FSCACHE
+static inline struct inode *VFS_I(struct nfs_inode *nfsi)
+{
+       return &nfsi->netfs.inode;
+}
+static inline struct nfs_inode *NFS_I(const struct inode *inode)
+{
+       return container_of(inode, struct nfs_inode, netfs.inode);
+}
+#else
+static inline struct inode *VFS_I(struct nfs_inode *nfsi)
+{
+       return &nfsi->vfs_inode;
+}
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
        return container_of(inode, struct nfs_inode, vfs_inode);
 }
+#endif



> >
> >
> > Are you ok with the stub functions which are placed in fscache.h, and
> > when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > or a 1-liner (nfs_netfs_readpage_release)?
> >
> >  #else /* CONFIG_NFS_FSCACHE */
> > +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > *hdr) {}
> > +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > *hdr) {}
> > +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > +{
> > +       unlock_page(req->wb_page);
> > +}
> >  static inline void nfs_fscache_release_super_cookie(struct
> > super_block *sb) {}
> >  static inline void nfs_fscache_init_inode(struct inode *inode) {}
> >
> >
> > Do you object to the below?  If so, then do you want
> > #ifdef CONFIG_NFS_FSCACHE here?
> >
> > -- a/fs/nfs/inode.c
> > +++ b/fs/nfs/inode.c
> > @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > super_block *sb)
> >  #ifdef CONFIG_NFS_V4_2
> >         nfsi->xattr_cache = NULL;
> >  #endif
> > +       nfs_netfs_inode_init(nfsi);
> > +
> >         return VFS_I(nfsi);
> >  }
> >  EXPORT_SYMBOL_GPL(nfs_alloc_i
> > node);
> >
> >
> > Do you object to the changes in fs/nfs/read.c?  Specifically,
> > how about the below calls to netfs from nfs_read_folio and
> > nfs_readahead into equivalent netfs calls?  So when
> > NFS_CONFIG_FSCACHE is set, but fscache is not enabled
> > ('fsc' not on mount), these netfs functions do immediately call
> > netfs_alloc_request().  But I wonder if we could simply add a
> > check to see if fscache is enabled on the mount, and skip
> > over to satisfy what you want.  Am I understanding what you
> > want?
>
> Quite frankly, I'd prefer that we just split out the functionality that
> is needed from the netfs code so that it can be optimised. However I'm
> not interested enough in the cachefs functionality to work on that
> myself. ...and as I indicated above, I might be OK with opting into the
> netfs project, once the overhead can be made to disappear.
>
Understood.

If you think it makes more sense, I can move some of the nfs_netfs_*
functions into a netfs.c file as a starting point.  Or that can maybe
be done in a future patchset?

For now I was equating netfs and fscache together so we can
move on from the much older and single-page limiting fscache
interface that is likely to go away soon.

> >
> > @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > folio *folio)
> >         if (NFS_STALE(inode))
> >                 goto out_unlock;
> >
> > +       ret = nfs_netfs_read_folio(file, folio);
> > +       if (!ret)
> > +               goto out;
> > +
> >
> > @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > *ractl)
> >         if (NFS_STALE(inode))
> >                 goto out;
> >
> > +       ret = nfs_netfs_readahead(ractl);
> > +       if (!ret)
> > +               goto out;
> > +
> >
The above wrappers should prevent any additional overhead when fscache
is not enabled.  As far as I know these work to avoid calling netfs
when 'fsc' is not on the mount.

> >
> > And how about these calls from different points in the read
> > path to the earlier mentioned stub functions?
> >
> > @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> >
> >  static void nfs_readpage_release(struct nfs_page *req, int error)
> >  {
> > -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> >         struct page *page = req->wb_page;
> >
> > -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > >s_id,
> > -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > -               (long long)req_offset(req));
> > -
> >         if (nfs_error_is_fatal_on_server(error) && error != -
> > ETIMEDOUT)
> >                 SetPageError(page);
> > -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > -               if (PageUptodate(page))
> > -                       nfs_fscache_write_page(inode, page);
> > -               unlock_page(page);
> > -       }
> > +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > +               nfs_netfs_readpage_release(req);
> > +
>
> I'm not seeing the value of wrapping unlock_page(), no... That code is
> going to need to change when we move it to use folios natively anyway.
>
Ok, how about I make it conditional on whether fscache is configured
and enabled then, similar to the nfs_netfs_read_folio() and
nfs_netfs_readahead()?  Below is what that would look like.
I could inline the code in nfs_netfs_readpage_release() if you
think it would be clearer.

static void nfs_readpage_release(struct nfs_page *req, int error)
{
        struct page *page = req->wb_page;

        if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
                SetPageError(page);
        if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
#ifndef CONFIG_NFS_FSCACHE
                unlock_page(req->wb_page);
#else
                nfs_netfs_readpage_release(req);
#endif
        nfs_release_request(req);
}


void nfs_netfs_readpage_release(struct nfs_page *req)
{
    struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);

    /*
     * If fscache is enabled, netfs will unlock pages.
     */
    if (netfs_inode(inode)->cache)
        return;

    unlock_page(req->wb_page);
}


> >         nfs_release_request(req);
> >  }
> >
> > @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > nfs_pgio_header *hdr)
> >                 nfs_list_remove_request(req);
> >                 nfs_readpage_release(req, error);
> >         }
> > +       nfs_netfs_read_completion(hdr);
> > +
> >  out:
> >         hdr->release(hdr);
> >  }
> > @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > nfs_pgio_header *hdr,
> >                               struct rpc_task_setup *task_setup_data,
> > int how)
> >  {
> >         rpc_ops->read_setup(hdr, msg);
> > +       nfs_netfs_initiate_read(hdr);
> >         trace_nfs_initiate_read(hdr);
> >  }
> >
> >
> > Are you ok with these additions?  Something like this would
> > be required in the case of fscache configured and enabled,
> > because we could have some of the data in a read in
> > fscache, and some not.  That is the reason for the netfs
> > design, and why we need to be able to call the normal
> > NFS read IO path (netfs calls into issue_read, and we call
> > back via netfs_subreq_terminated)?
> >
> > @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> >         struct pnfs_layout_segment *pg_lseg;
> >         struct nfs_io_completion *pg_io_completion;
> >         struct nfs_direct_req   *pg_dreq;
> > +#ifdef CONFIG_NFS_FSCACHE
> > +       void                    *pg_netfs;
> > +#endif
> >
> > @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> >         const struct nfs_rw_ops *rw_ops;
> >         struct nfs_io_completion *io_completion;
> >         struct nfs_direct_req   *dreq;
> > +#ifdef CONFIG_NFS_FSCACHE
> > +       void                    *netfs;
> > +#endif
> >
> >
> > And these additions to pagelist.c?
> >
> > @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > nfs_pageio_descriptor *desc,
> >         hdr->good_bytes = mirror->pg_count;
> >         hdr->io_completion = desc->pg_io_completion;
> >         hdr->dreq = desc->pg_dreq;
> > +#ifdef CONFIG_NFS_FSCACHE
> > +       if (desc->pg_netfs)
> > +               hdr->netfs = desc->pg_netfs;
> > +#endif
>
> Why the conditional?
>
Not really needed and I was thinking of removing it, so I'll do that.

> >
> >
> > @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > *desc,
> >         desc->pg_lseg = NULL;
> >         desc->pg_io_completion = NULL;
> >         desc->pg_dreq = NULL;
> > +#ifdef CONFIG_NFS_FSCACHE
> > +       desc->pg_netfs = NULL;
> > +#endif
> >
> >
> > @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > nfs_pageio_descriptor *desc,
> >
> >         desc->pg_io_completion = hdr->io_completion;
> >         desc->pg_dreq = hdr->dreq;
> > +#ifdef CONFIG_NFS_FSCACHE
> > +       desc->pg_netfs = hdr->netfs;
> > +#endif
>
> Those all need wrapper functions instead of embedding #ifdefs.
>
Ok.



> >
> >
> > > My expectation is that the standard I/O path should have minimal
> > > overhead, and should certainly not increase the overhead that we
> > > already have. Will this be addressed in future iterations of these
> > > patches?
> > >
> >
> > I will do what I can to satisfy what you want, either by fixing up
> > this patch or follow-on patches.  Hopefully the above questions
> > will clarify the next steps.
> >
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
>
>
David Wysochanski Oct. 30, 2022, 11:25 p.m. UTC | #5
On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
>
> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> >
> > On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > wrote:
> > > >
> > > > On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > Convert the NFS buffered read code paths to corresponding netfs
> > > > > APIs,
> > > > > but only when fscache is configured and enabled.
> > > > >
> > > > > The netfs API defines struct netfs_request_ops which must be
> > > > > filled
> > > > > in by the network filesystem.  For NFS, we only need to define 5
> > > > > of
> > > > > the functions, the main one being the issue_read() function.
> > > > > The issue_read() function is called by the netfs layer when a
> > > > > read
> > > > > cannot be fulfilled locally, and must be sent to the server
> > > > > (either
> > > > > the cache is not active, or it is active but the data is not
> > > > > available).
> > > > > Once the read from the server is complete, netfs requires a call
> > > > > to
> > > > > netfs_subreq_terminated() which conveys either how many bytes
> > > > > were
> > > > > read
> > > > > successfully, or an error.  Note that issue_read() is called with
> > > > > a
> > > > > structure, netfs_io_subrequest, which defines the IO requested,
> > > > > and
> > > > > contains a start and a length (both in bytes), and assumes the
> > > > > underlying
> > > > > netfs will return a either an error on the whole region, or the
> > > > > number
> > > > > of bytes successfully read.
> > > > >
> > > > > The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > defined
> > > > > in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > to
> > > > > know how many RPCs will be sent and how the pages will be broken
> > > > > up
> > > > > into underlying RPCs, each of which will have their own
> > > > > completion
> > > > > and
> > > > > return code.  In contrast, netfs is subrequest based, a single
> > > > > subrequest may contain multiple pages, and a single subrequest is
> > > > > initiated with issue_read() and terminated with
> > > > > netfs_subreq_terminated().
> > > > > Thus, to utilze the netfs APIs, NFS needs some way to accommodate
> > > > > the netfs API requirement on the single response to the whole
> > > > > subrequest, while also minimizing disruptive changes to the NFS
> > > > > pgio layer.
> > > > >
> > > > > The approach taken with this patch is to allocate a small
> > > > > structure
> > > > > for each nfs_netfs_issue_read() call, store the final error and
> > > > > number
> > > > > of bytes successfully transferred in the structure, and update
> > > > > these
> > > > > values
> > > > > as each RPC completes.  The refcount on the structure is used as
> > > > > a
> > > > > marker
> > > > > for the last RPC completion, is incremented in
> > > > > nfs_netfs_read_initiate(),
> > > > > and decremented inside nfs_netfs_read_completion(), when a
> > > > > nfs_pgio_header
> > > > > contains a valid pointer to the data.  On the final put (which
> > > > > signals
> > > > > the final outstanding RPC is complete) in
> > > > > nfs_netfs_read_completion(),
> > > > > call netfs_subreq_terminated() with either the final error value
> > > > > (if
> > > > > one or more READs complete with an error) or the number of bytes
> > > > > successfully transferred (if all RPCs complete successfully).
> > > > > Note
> > > > > that when all RPCs complete successfully, the number of bytes
> > > > > transferred
> > > > > is capped to the length of the subrequest.  Capping the
> > > > > transferred
> > > > > length
> > > > > to the subrequest length prevents "Subreq overread" warnings from
> > > > > netfs.
> > > > > This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > the
> > > > > corner case where NFS requests a full page at the end of the
> > > > > file,
> > > > > even when i_size reflects only a partial page (NFS overread).
> > > > >
> > > > > Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > >
> > > >
> > > > This is not doing what I asked for, which was to separate out the
> > > > fscache functionality, so that we can call that if and when it is
> > > > available.
> > > >
> > > I must have misunderstood then.
> > >
> > > The last feedback I have from you was that you wanted it to be
> > > an opt-in feature, and it was a comment on a previous patch
> > > to Kconfig.  I was proceeding the best I knew how, but
> > > let me try to get back on track.
> > >
> > > > Instead, it is just wrapping the NFS requests inside netfs
> > > > requests. As
> > > > it stands, that means it is just duplicating information, and
> > > > adding
> > > > unnecessary overhead to the standard I/O path (extra allocations,
> > > > extra
> > > > indirect calls, and extra bloat to the inode).
> > > >
> > > I think I understand what you're saying but I'm not sure.  Let me
> > > ask some clarifying questions.
> > >
> > > Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > configured?  Or when it is not?  Or both?  I think you're objecting
> > > when it's configured, but not enabled (we mount without 'fsc').
> > > Am I right?
> > >
> > > Also, are you objecting to the design that to use fcache we now
> > > have to use netfs, specifically:
> > > - call into netfs via either netfs_read_folio or netfs_readahead
> > > - if fscache is enabled, then the IO can be satisfied from fscache
> > > - if fscache is not enabled, or some of the IO cannot be satisfied
> > > from the cache, then NFS is called back via netfs_issue_read
> > > and we use the normal NFS read pageio interface.  This requires
> > > we call netfs_subreq_terminated() when all the RPCs complete,
> > > which is the reason for the small changes to pagelist.c
> >
> > I'm objecting to any middle layer "solution" that adds overhead to the
> > NFS I/O paths.
> >
> Got it.
>
> > I'm willing to consider solutions that are specific only to the fscache
> > use case (i.e. when the 'fsc' mount option is specified). However when
> > I perform a normal NFS mount, and do I/O, then I don't want to see
> > extra memory allocations, extra indirect calls and larger inode
> > footprints.
> >
> > IOW: I want the code to optimise for the case of standard NFS, not for
> > the case of 'NFS with cachefs additions'.
> >
> I agree completely.  Are you seeing extra memory allocations
> happen on mounts without 'fsc' or is it more a concern or how
> some of the patches look?  We should not be calling any netfs or
> fscache code if 'fsc' is not on the mount and I don't see any in my
> testing. So either there's a misunderstanding here, or there's a
> bug I'm missing.
>
> If fscache is not configured, then nfs_netfs_read_folio() and
> nfs_netfs_readahead() is a wrapper that returns -ENOBUFS.
> If it's configured but not enabled, then the checks for
> netfs_inode(inode)->cache should skip over any netfs code.
> But maybe there's a non-obvious bug you're seeing and
> somehow netfs is still getting called?  Because I cannot
> see netfs getting called if 'fsc' is not on the mount in my
> tests.
>
> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> {
>         if (!netfs_inode(folio_inode(folio))->cache)
>                 return -ENOBUFS;
>
>         return netfs_read_folio(file, folio);
> }
>
> int nfs_netfs_readahead(struct readahead_control *ractl)
> {
>         struct inode *inode = ractl->mapping->host;
>
>         if (!netfs_inode(inode)->cache)
>                 return -ENOBUFS;
>
>         netfs_readahead(ractl);
>         return 0;
> }
>
>
> > >
> > > Can you be more specific as to the portions of the patch you don't
> > > like
> > > so I can move it in the right direction?
> > >
> > > This is from patch #2 which you didn't comment on.  I'm not sure
> > > you're
> > > ok with it though, since you mention "extra bloat to the inode".
> > > Do you object to this even though it's wrapped in an
> > > #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > extra size be added to nfs_inode?
> > >
> > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > >         __u64 write_io;
> > >         __u64 read_io;
> > >  #ifdef CONFIG_NFS_FSCACHE
> > > -       struct fscache_cookie   *fscache;
> > > -#endif
> > > +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > */
> > > +#else
> > >         struct inode            vfs_inode;
> > > +#endif
> > > +
> >
> > Ideally, I'd prefer no extra size. I can live with it up to a certain
> > point, however for now NFS is not unconditionally opting into the netfs
> > project. If we're to ever do that, then I want to see streamlined code
> > for the standard I/O case.
> >
> Ok and understood about standard I/O case.
>
> I was thinking how we might not increase the size, but I don't think
> I can make it work.
>
> I thought we could change to something like the below, without an
> embedded struct inode:
>
> @@ -204,9 +208,11 @@ struct nfs_inode {
>         __u64 write_io;
>         __u64 read_io;
>  #ifdef CONFIG_NFS_FSCACHE
> -       struct fscache_cookie   *fscache;
> -#endif
> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> +#else
>         struct inode            vfs_inode;
> +#endif
> +
>
> Then I would need to alloc/free a netfs_inode at the time of
> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> macro cannot work, because it requires an embedded "struct inode"
> due to "container_of" use:
>
> +#ifdef CONFIG_NFS_FSCACHE
> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> +{
> +       return &nfsi->netfs.inode;
> +}
> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> +{
> +       return container_of(inode, struct nfs_inode, netfs.inode);
> +}
> +#else
> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> +{
> +       return &nfsi->vfs_inode;
> +}
>  static inline struct nfs_inode *NFS_I(const struct inode *inode)
>  {
>         return container_of(inode, struct nfs_inode, vfs_inode);
>  }
> +#endif
>
>

Actually Trond maybe we can achieve a "0 length increase" of
nfs_inode if dhowells would take a patch to modify the definition
of struct netfs_inode and netfs_inode_init(), something like the WIP
patch below.  What do you think?

I think maybe this could be a follow-on patch and if you/dhowells
think it's an ok idea I can try to work out what is needed across
the tree.  I thought about it more and I kinda agree that in the
case for NFS where fscache is "configured but not enabled",
then even though we're only adding 24 bytes to the nfs_inode
each time, it will add up so it is worth at least a discussion.

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index f2402ddeafbf..195714f1c355 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -118,11 +118,7 @@ enum netfs_io_source {
 typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
                                      bool was_async);

-/*
- * Per-inode context.  This wraps the VFS inode.
- */
-struct netfs_inode {
-       struct inode            inode;          /* The VFS inode */
+struct netfs_info {
        const struct netfs_request_ops *ops;
 #if IS_ENABLED(CONFIG_FSCACHE)
        struct fscache_cookie   *cache;
@@ -130,6 +126,14 @@ struct netfs_inode {
        loff_t                  remote_i_size;  /* Size of the remote file */
 };

+/*
+ * Per-inode context.  This wraps the VFS inode.
+ */
+struct netfs_inode {
+       struct inode            inode;          /* The VFS inode */
+       struct netfs_info       *netfs;         /* Rest of netfs data */
+};
+
 /*
  * Resources required to do operations on a cache.
  */
@@ -312,10 +316,12 @@ static inline struct netfs_inode
*netfs_inode(struct inode *inode)
 static inline void netfs_inode_init(struct netfs_inode *ctx,
                                    const struct netfs_request_ops *ops)
 {
-       ctx->ops = ops;
-       ctx->remote_i_size = i_size_read(&ctx->inode);
+       ctx->netfs = kzalloc(sizeof(struct netfs_info), GFP_KERNEL);
+       /* FIXME: Check for NULL */
+       ctx->netfs->ops = ops;
+       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
 #if IS_ENABLED(CONFIG_FSCACHE)
-       ctx->cache = NULL;
+       ctx->netfs->cache = NULL;
 #endif
 }



>
> > >
> > >
> > > Are you ok with the stub functions which are placed in fscache.h, and
> > > when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > or a 1-liner (nfs_netfs_readpage_release)?
> > >
> > >  #else /* CONFIG_NFS_FSCACHE */
> > > +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > *hdr) {}
> > > +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > *hdr) {}
> > > +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > +{
> > > +       unlock_page(req->wb_page);
> > > +}
> > >  static inline void nfs_fscache_release_super_cookie(struct
> > > super_block *sb) {}
> > >  static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > >
> > >
> > > Do you object to the below?  If so, then do you want
> > > #ifdef CONFIG_NFS_FSCACHE here?
> > >
> > > -- a/fs/nfs/inode.c
> > > +++ b/fs/nfs/inode.c
> > > @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > super_block *sb)
> > >  #ifdef CONFIG_NFS_V4_2
> > >         nfsi->xattr_cache = NULL;
> > >  #endif
> > > +       nfs_netfs_inode_init(nfsi);
> > > +
> > >         return VFS_I(nfsi);
> > >  }
> > >  EXPORT_SYMBOL_GPL(nfs_alloc_inode);
> > >
> > >
> > > Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > how about the below calls to netfs from nfs_read_folio and
> > > nfs_readahead into equivalent netfs calls?  So when
> > > CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > ('fsc' not on mount), these netfs functions do immediately call
> > > netfs_alloc_request().  But I wonder if we could simply add a
> > > check to see if fscache is enabled on the mount, and skip
> > > over to satisfy what you want.  Am I understanding what you
> > > want?
> >
> > Quite frankly, I'd prefer that we just split out the functionality that
> > is needed from the netfs code so that it can be optimised. However I'm
> > not interested enough in the cachefs functionality to work on that
> > myself. ...and as I indicated above, I might be OK with opting into the
> > netfs project, once the overhead can be made to disappear.
> >
> Understood.
>
> If you think it makes more sense, I can move some of the nfs_netfs_*
> functions into a netfs.c file as a starting point.  Or that can maybe
> be done in a future patchset?
>
> For now I was equating netfs and fscache together so we can
> move on from the much older and single-page limiting fscache
> interface that is likely to go away soon.
>
> > >
> > > @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > folio *folio)
> > >         if (NFS_STALE(inode))
> > >                 goto out_unlock;
> > >
> > > +       ret = nfs_netfs_read_folio(file, folio);
> > > +       if (!ret)
> > > +               goto out;
> > > +
> > >
> > > @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > *ractl)
> > >         if (NFS_STALE(inode))
> > >                 goto out;
> > >
> > > +       ret = nfs_netfs_readahead(ractl);
> > > +       if (!ret)
> > > +               goto out;
> > > +
> > >
> The above wrappers should prevent any additional overhead when fscache
> is not enabled.  As far as I know these work to avoid calling netfs
> when 'fsc' is not on the mount.
>
> > >
> > > And how about these calls from different points in the read
> > > path to the earlier mentioned stub functions?
> > >
> > > @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > >
> > >  static void nfs_readpage_release(struct nfs_page *req, int error)
> > >  {
> > > -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > >         struct page *page = req->wb_page;
> > >
> > > -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > >s_id,
> > > -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > -               (long long)req_offset(req));
> > > -
> > >         if (nfs_error_is_fatal_on_server(error) && error != -
> > > ETIMEDOUT)
> > >                 SetPageError(page);
> > > -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > -               if (PageUptodate(page))
> > > -                       nfs_fscache_write_page(inode, page);
> > > -               unlock_page(page);
> > > -       }
> > > +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > +               nfs_netfs_readpage_release(req);
> > > +
> >
> > I'm not seeing the value of wrapping unlock_page(), no... That code is
> > going to need to change when we move it to use folios natively anyway.
> >
> Ok, how about I make it conditional on whether fscache is configured
> and enabled then, similar to the nfs_netfs_read_folio() and
> nfs_netfs_readahead()?  Below is what that would look like.
> I could inline the code in nfs_netfs_readpage_release() if you
> think it would be clearer.
>
> static void nfs_readpage_release(struct nfs_page *req, int error)
> {
>         struct page *page = req->wb_page;
>
>         if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
>                 SetPageError(page);
>         if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> #ifndef CONFIG_NFS_FSCACHE
>                 unlock_page(req->wb_page);
> #else
>                 nfs_netfs_readpage_release(req);
> #endif
>         nfs_release_request(req);
> }
>
>
> void nfs_netfs_readpage_release(struct nfs_page *req)
> {
>     struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
>
>     /*
>      * If fscache is enabled, netfs will unlock pages.
>      */
>     if (netfs_inode(inode)->cache)
>         return;
>
>     unlock_page(req->wb_page);
> }
>
>
> > >         nfs_release_request(req);
> > >  }
> > >
> > > @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > nfs_pgio_header *hdr)
> > >                 nfs_list_remove_request(req);
> > >                 nfs_readpage_release(req, error);
> > >         }
> > > +       nfs_netfs_read_completion(hdr);
> > > +
> > >  out:
> > >         hdr->release(hdr);
> > >  }
> > > @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > nfs_pgio_header *hdr,
> > >                               struct rpc_task_setup *task_setup_data,
> > > int how)
> > >  {
> > >         rpc_ops->read_setup(hdr, msg);
> > > +       nfs_netfs_initiate_read(hdr);
> > >         trace_nfs_initiate_read(hdr);
> > >  }
> > >
> > >
> > > Are you ok with these additions?  Something like this would
> > > be required in the case of fscache configured and enabled,
> > > because we could have some of the data in a read in
> > > fscache, and some not.  That is the reason for the netfs
> > > design, and why we need to be able to call the normal
> > > NFS read IO path (netfs calls into issue_read, and we call
> > > back via netfs_subreq_terminated)?
> > >
> > > @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > >         struct pnfs_layout_segment *pg_lseg;
> > >         struct nfs_io_completion *pg_io_completion;
> > >         struct nfs_direct_req   *pg_dreq;
> > > +#ifdef CONFIG_NFS_FSCACHE
> > > +       void                    *pg_netfs;
> > > +#endif
> > >
> > > @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > >         const struct nfs_rw_ops *rw_ops;
> > >         struct nfs_io_completion *io_completion;
> > >         struct nfs_direct_req   *dreq;
> > > +#ifdef CONFIG_NFS_FSCACHE
> > > +       void                    *netfs;
> > > +#endif
> > >
> > >
> > > And these additions to pagelist.c?
> > >
> > > @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > nfs_pageio_descriptor *desc,
> > >         hdr->good_bytes = mirror->pg_count;
> > >         hdr->io_completion = desc->pg_io_completion;
> > >         hdr->dreq = desc->pg_dreq;
> > > +#ifdef CONFIG_NFS_FSCACHE
> > > +       if (desc->pg_netfs)
> > > +               hdr->netfs = desc->pg_netfs;
> > > +#endif
> >
> > Why the conditional?
> >
> Not really needed and I was thinking of removing it, so I'll do that.
>
> > >
> > >
> > > @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > *desc,
> > >         desc->pg_lseg = NULL;
> > >         desc->pg_io_completion = NULL;
> > >         desc->pg_dreq = NULL;
> > > +#ifdef CONFIG_NFS_FSCACHE
> > > +       desc->pg_netfs = NULL;
> > > +#endif
> > >
> > >
> > > @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > nfs_pageio_descriptor *desc,
> > >
> > >         desc->pg_io_completion = hdr->io_completion;
> > >         desc->pg_dreq = hdr->dreq;
> > > +#ifdef CONFIG_NFS_FSCACHE
> > > +       desc->pg_netfs = hdr->netfs;
> > > +#endif
> >
> > Those all need wrapper functions instead of embedding #ifdefs.
> >
> Ok.
>
>
>
> > >
> > >
> > > > My expectation is that the standard I/O path should have minimal
> > > > overhead, and should certainly not increase the overhead that we
> > > > already have. Will this be addressed in future iterations of these
> > > > patches?
> > > >
> > >
> > > I will do what I can to satisfy what you want, either by fixing up
> > > this patch or follow-on patches.  Hopefully the above questions
> > > will clarify the next steps.
> > >
> >
> > --
> > Trond Myklebust
> > Linux NFS client maintainer, Hammerspace
> > trond.myklebust@hammerspace.com
> >
> >
Benjamin Maynard Oct. 31, 2022, 5:42 p.m. UTC | #6
Just wanted to add that I am really keen to see some form of these
patches merged.

I am using FS-Cache for the NFS re-exporting use-case.

I (and a fair few others) are impacted heavily by the IO performance
bottleneck when using FS-Cache, so much so that we are currently stuck
on v5.16 of the kernel.

--
Ben

On Sun, 30 Oct 2022 at 23:26, David Wysochanski <dwysocha@redhat.com> wrote:
>
> On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> >
> > On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > >
> > > On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > wrote:
> > > > >
> > > > > On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > > Convert the NFS buffered read code paths to corresponding netfs
> > > > > > APIs,
> > > > > > but only when fscache is configured and enabled.
> > > > > >
> > > > > > The netfs API defines struct netfs_request_ops which must be
> > > > > > filled
> > > > > > in by the network filesystem.  For NFS, we only need to define 5
> > > > > > of
> > > > > > the functions, the main one being the issue_read() function.
> > > > > > The issue_read() function is called by the netfs layer when a
> > > > > > read
> > > > > > cannot be fulfilled locally, and must be sent to the server
> > > > > > (either
> > > > > > the cache is not active, or it is active but the data is not
> > > > > > available).
> > > > > > Once the read from the server is complete, netfs requires a call
> > > > > > to
> > > > > > netfs_subreq_terminated() which conveys either how many bytes
> > > > > > were
> > > > > > read
> > > > > > successfully, or an error.  Note that issue_read() is called with
> > > > > > a
> > > > > > structure, netfs_io_subrequest, which defines the IO requested,
> > > > > > and
> > > > > > contains a start and a length (both in bytes), and assumes the
> > > > > > underlying
> > > > > > > netfs will return either an error on the whole region, or the
> > > > > > number
> > > > > > of bytes successfully read.
> > > > > >
> > > > > > The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > > defined
> > > > > > in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > > to
> > > > > > know how many RPCs will be sent and how the pages will be broken
> > > > > > up
> > > > > > into underlying RPCs, each of which will have their own
> > > > > > completion
> > > > > > and
> > > > > > return code.  In contrast, netfs is subrequest based, a single
> > > > > > subrequest may contain multiple pages, and a single subrequest is
> > > > > > initiated with issue_read() and terminated with
> > > > > > netfs_subreq_terminated().
> > > > > > > Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > > the netfs API requirement on the single response to the whole
> > > > > > subrequest, while also minimizing disruptive changes to the NFS
> > > > > > pgio layer.
> > > > > >
> > > > > > The approach taken with this patch is to allocate a small
> > > > > > structure
> > > > > > for each nfs_netfs_issue_read() call, store the final error and
> > > > > > number
> > > > > > of bytes successfully transferred in the structure, and update
> > > > > > these
> > > > > > values
> > > > > > as each RPC completes.  The refcount on the structure is used as
> > > > > > a
> > > > > > marker
> > > > > > for the last RPC completion, is incremented in
> > > > > > nfs_netfs_read_initiate(),
> > > > > > and decremented inside nfs_netfs_read_completion(), when a
> > > > > > nfs_pgio_header
> > > > > > contains a valid pointer to the data.  On the final put (which
> > > > > > signals
> > > > > > the final outstanding RPC is complete) in
> > > > > > nfs_netfs_read_completion(),
> > > > > > call netfs_subreq_terminated() with either the final error value
> > > > > > (if
> > > > > > one or more READs complete with an error) or the number of bytes
> > > > > > successfully transferred (if all RPCs complete successfully).
> > > > > > Note
> > > > > > that when all RPCs complete successfully, the number of bytes
> > > > > > transferred
> > > > > > is capped to the length of the subrequest.  Capping the
> > > > > > transferred
> > > > > > length
> > > > > > to the subrequest length prevents "Subreq overread" warnings from
> > > > > > netfs.
> > > > > > This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > > the
> > > > > > corner case where NFS requests a full page at the end of the
> > > > > > file,
> > > > > > even when i_size reflects only a partial page (NFS overread).
> > > > > >
> > > > > > Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > >
> > > > >
> > > > > This is not doing what I asked for, which was to separate out the
> > > > > fscache functionality, so that we can call that if and when it is
> > > > > available.
> > > > >
> > > > I must have misunderstood then.
> > > >
> > > > The last feedback I have from you was that you wanted it to be
> > > > an opt-in feature, and it was a comment on a previous patch
> > > > to Kconfig.  I was proceeding the best I knew how, but
> > > > let me try to get back on track.
> > > >
> > > > > Instead, it is just wrapping the NFS requests inside netfs
> > > > > requests. As
> > > > > it stands, that means it is just duplicating information, and
> > > > > adding
> > > > > unnecessary overhead to the standard I/O path (extra allocations,
> > > > > extra
> > > > > indirect calls, and extra bloat to the inode).
> > > > >
> > > > I think I understand what you're saying but I'm not sure.  Let me
> > > > ask some clarifying questions.
> > > >
> > > > Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > configured?  Or when it is not?  Or both?  I think you're objecting
> > > > when it's configured, but not enabled (we mount without 'fsc').
> > > > Am I right?
> > > >
> > > > > Also, are you objecting to the design that to use fscache we now
> > > > have to use netfs, specifically:
> > > > - call into netfs via either netfs_read_folio or netfs_readahead
> > > > - if fscache is enabled, then the IO can be satisfied from fscache
> > > > - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > from the cache, then NFS is called back via netfs_issue_read
> > > > and we use the normal NFS read pageio interface.  This requires
> > > > we call netfs_subreq_terminated() when all the RPCs complete,
> > > > which is the reason for the small changes to pagelist.c
> > >
> > > I'm objecting to any middle layer "solution" that adds overhead to the
> > > NFS I/O paths.
> > >
> > Got it.
> >
> > > I'm willing to consider solutions that are specific only to the fscache
> > > use case (i.e. when the 'fsc' mount option is specified). However when
> > > I perform a normal NFS mount, and do I/O, then I don't want to see
> > > extra memory allocations, extra indirect calls and larger inode
> > > footprints.
> > >
> > > IOW: I want the code to optimise for the case of standard NFS, not for
> > > the case of 'NFS with cachefs additions'.
> > >
> > I agree completely.  Are you seeing extra memory allocations
> > happen on mounts without 'fsc' or is it more a concern or how
> > some of the patches look?  We should not be calling any netfs or
> > fscache code if 'fsc' is not on the mount and I don't see any in my
> > testing. So either there's a misunderstanding here, or there's a
> > bug I'm missing.
> >
> > If fscache is not configured, then nfs_netfs_read_folio() and
> > nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > If it's configured but not enabled, then the checks for
> > netfs_inode(inode)->cache should skip over any netfs code.
> > But maybe there's a non-obvious bug you're seeing and
> > somehow netfs is still getting called?  Because I cannot
> > see netfs getting called if 'fsc' is not on the mount in my
> > tests.
> >
> > int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > {
> >         if (!netfs_inode(folio_inode(folio))->cache)
> >                 return -ENOBUFS;
> >
> >         return netfs_read_folio(file, folio);
> > }
> >
> > int nfs_netfs_readahead(struct readahead_control *ractl)
> > {
> >         struct inode *inode = ractl->mapping->host;
> >
> >         if (!netfs_inode(inode)->cache)
> >                 return -ENOBUFS;
> >
> >         netfs_readahead(ractl);
> >         return 0;
> > }
> >
> >
> > > >
> > > > Can you be more specific as to the portions of the patch you don't
> > > > like
> > > > so I can move it in the right direction?
> > > >
> > > > This is from patch #2 which you didn't comment on.  I'm not sure
> > > > you're
> > > > ok with it though, since you mention "extra bloat to the inode".
> > > > Do you object to this even though it's wrapped in an
> > > > #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > extra size be added to nfs_inode?
> > > >
> > > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > > >         __u64 write_io;
> > > >         __u64 read_io;
> > > >  #ifdef CONFIG_NFS_FSCACHE
> > > > -       struct fscache_cookie   *fscache;
> > > > -#endif
> > > > +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > */
> > > > +#else
> > > >         struct inode            vfs_inode;
> > > > +#endif
> > > > +
> > >
> > > Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > point, however for now NFS is not unconditionally opting into the netfs
> > > project. If we're to ever do that, then I want to see streamlined code
> > > for the standard I/O case.
> > >
> > Ok and understood about standard I/O case.
> >
> > I was thinking how we might not increase the size, but I don't think
> > I can make it work.
> >
> > I thought we could change to something like the below, without an
> > embedded struct inode:
> >
> > @@ -204,9 +208,11 @@ struct nfs_inode {
> >         __u64 write_io;
> >         __u64 read_io;
> >  #ifdef CONFIG_NFS_FSCACHE
> > -       struct fscache_cookie   *fscache;
> > -#endif
> > +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > +#else
> >         struct inode            vfs_inode;
> > +#endif
> > +
> >
> > Then I would need to alloc/free a netfs_inode at the time of
> > nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > macro cannot work, because it requires an embedded "struct inode"
> > due to "container_of" use:
> >
> > +#ifdef CONFIG_NFS_FSCACHE
> > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > +{
> > +       return &nfsi->netfs.inode;
> > +}
> > +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > +{
> > +       return container_of(inode, struct nfs_inode, netfs.inode);
> > +}
> > +#else
> > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > +{
> > +       return &nfsi->vfs_inode;
> > +}
> >  static inline struct nfs_inode *NFS_I(const struct inode *inode)
> >  {
> >         return container_of(inode, struct nfs_inode, vfs_inode);
> >  }
> > +#endif
> >
> >
>
> Actually Trond maybe we can achieve a "0 length increase" of
> nfs_inode if dhowells would take a patch to modify the definition
> of struct netfs_inode and netfs_inode_init(), something like the WIP
> patch below.  What do you think?
>
> I think maybe this could be a follow-on patch and if you/dhowells
> think it's an ok idea I can try to work out what is needed across
> the tree.  I thought about it more and I kinda agree that in the
> case for NFS where fscache is "configured but not enabled",
> then even though we're only adding 24 bytes to the nfs_inode
> each time, it will add up so it is worth at least a discussion.
>
> diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> index f2402ddeafbf..195714f1c355 100644
> --- a/include/linux/netfs.h
> +++ b/include/linux/netfs.h
> @@ -118,11 +118,7 @@ enum netfs_io_source {
>  typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
>                                       bool was_async);
>
> -/*
> - * Per-inode context.  This wraps the VFS inode.
> - */
> -struct netfs_inode {
> -       struct inode            inode;          /* The VFS inode */
> +struct netfs_info {
>         const struct netfs_request_ops *ops;
>  #if IS_ENABLED(CONFIG_FSCACHE)
>         struct fscache_cookie   *cache;
> @@ -130,6 +126,14 @@ struct netfs_inode {
>         loff_t                  remote_i_size;  /* Size of the remote file */
>  };
>
> +/*
> + * Per-inode context.  This wraps the VFS inode.
> + */
> +struct netfs_inode {
> +       struct inode            inode;          /* The VFS inode */
> +       struct netfs_info       *netfs;         /* Rest of netfs data */
> +};
> +
>  /*
>   * Resources required to do operations on a cache.
>   */
> @@ -312,10 +316,12 @@ static inline struct netfs_inode
> *netfs_inode(struct inode *inode)
>  static inline void netfs_inode_init(struct netfs_inode *ctx,
>                                     const struct netfs_request_ops *ops)
>  {
> -       ctx->ops = ops;
> -       ctx->remote_i_size = i_size_read(&ctx->inode);
> +       ctx->netfs = kzalloc(sizeof(struct netfs_info), GFP_KERNEL);
> +       /* FIXME: Check for NULL */
> +       ctx->netfs->ops = ops;
> +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
>  #if IS_ENABLED(CONFIG_FSCACHE)
> -       ctx->cache = NULL;
> +       ctx->netfs->cache = NULL;
>  #endif
>  }
>
>
>
> >
> > > >
> > > >
> > > > Are you ok with the stub functions which are placed in fscache.h, and
> > > > when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > or a 1-liner (nfs_netfs_readpage_release)?
> > > >
> > > >  #else /* CONFIG_NFS_FSCACHE */
> > > > +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > *hdr) {}
> > > > +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > *hdr) {}
> > > > +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > +{
> > > > +       unlock_page(req->wb_page);
> > > > +}
> > > >  static inline void nfs_fscache_release_super_cookie(struct
> > > > super_block *sb) {}
> > > >  static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > >
> > > >
> > > > Do you object to the below?  If so, then do you want
> > > > #ifdef CONFIG_NFS_FSCACHE here?
> > > >
> > > > -- a/fs/nfs/inode.c
> > > > +++ b/fs/nfs/inode.c
> > > > @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > super_block *sb)
> > > >  #ifdef CONFIG_NFS_V4_2
> > > >         nfsi->xattr_cache = NULL;
> > > >  #endif
> > > > +       nfs_netfs_inode_init(nfsi);
> > > > +
> > > >         return VFS_I(nfsi);
> > > >  }
> > > > >  EXPORT_SYMBOL_GPL(nfs_alloc_inode);
> > > >
> > > >
> > > > Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > how about the below calls to netfs from nfs_read_folio and
> > > > nfs_readahead into equivalent netfs calls?  So when
> > > > > CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > ('fsc' not on mount), these netfs functions do immediately call
> > > > netfs_alloc_request().  But I wonder if we could simply add a
> > > > check to see if fscache is enabled on the mount, and skip
> > > > over to satisfy what you want.  Am I understanding what you
> > > > want?
> > >
> > > Quite frankly, I'd prefer that we just split out the functionality that
> > > is needed from the netfs code so that it can be optimised. However I'm
> > > not interested enough in the cachefs functionality to work on that
> > > myself. ...and as I indicated above, I might be OK with opting into the
> > > netfs project, once the overhead can be made to disappear.
> > >
> > Understood.
> >
> > If you think it makes more sense, I can move some of the nfs_netfs_*
> > functions into a netfs.c file as a starting point.  Or that can maybe
> > be done in a future patchset?
> >
> > For now I was equating netfs and fscache together so we can
> > move on from the much older and single-page limiting fscache
> > interface that is likely to go away soon.
> >
> > > >
> > > > @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > folio *folio)
> > > >         if (NFS_STALE(inode))
> > > >                 goto out_unlock;
> > > >
> > > > +       ret = nfs_netfs_read_folio(file, folio);
> > > > +       if (!ret)
> > > > +               goto out;
> > > > +
> > > >
> > > > @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > *ractl)
> > > >         if (NFS_STALE(inode))
> > > >                 goto out;
> > > >
> > > > +       ret = nfs_netfs_readahead(ractl);
> > > > +       if (!ret)
> > > > +               goto out;
> > > > +
> > > >
> > The above wrappers should prevent any additional overhead when fscache
> > is not enabled.  As far as I know these work to avoid calling netfs
> > when 'fsc' is not on the mount.
> >
> > > >
> > > > And how about these calls from different points in the read
> > > > path to the earlier mentioned stub functions?
> > > >
> > > > @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > >
> > > >  static void nfs_readpage_release(struct nfs_page *req, int error)
> > > >  {
> > > > -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > >         struct page *page = req->wb_page;
> > > >
> > > > -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > >s_id,
> > > > -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > -               (long long)req_offset(req));
> > > > -
> > > >         if (nfs_error_is_fatal_on_server(error) && error != -
> > > > ETIMEDOUT)
> > > >                 SetPageError(page);
> > > > -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > -               if (PageUptodate(page))
> > > > -                       nfs_fscache_write_page(inode, page);
> > > > -               unlock_page(page);
> > > > -       }
> > > > +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > +               nfs_netfs_readpage_release(req);
> > > > +
> > >
> > > I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > going to need to change when we move it to use folios natively anyway.
> > >
> > Ok, how about I make it conditional on whether fscache is configured
> > and enabled then, similar to the nfs_netfs_read_folio() and
> > nfs_netfs_readahead()?  Below is what that would look like.
> > I could inline the code in nfs_netfs_readpage_release() if you
> > think it would be clearer.
> >
> > static void nfs_readpage_release(struct nfs_page *req, int error)
> > {
> >         struct page *page = req->wb_page;
> >
> >         if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> >                 SetPageError(page);
> >         if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > #ifndef CONFIG_NFS_FSCACHE
> >                 unlock_page(req->wb_page);
> > #else
> >                 nfs_netfs_readpage_release(req);
> > #endif
> >         nfs_release_request(req);
> > }
> >
> >
> > void nfs_netfs_readpage_release(struct nfs_page *req)
> > {
> >     struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> >
> >     /*
> >      * If fscache is enabled, netfs will unlock pages.
> >      */
> >     if (netfs_inode(inode)->cache)
> >         return;
> >
> >     unlock_page(req->wb_page);
> > }
> >
> >
> > > >         nfs_release_request(req);
> > > >  }
> > > >
> > > > @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > nfs_pgio_header *hdr)
> > > >                 nfs_list_remove_request(req);
> > > >                 nfs_readpage_release(req, error);
> > > >         }
> > > > +       nfs_netfs_read_completion(hdr);
> > > > +
> > > >  out:
> > > >         hdr->release(hdr);
> > > >  }
> > > > @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > nfs_pgio_header *hdr,
> > > >                               struct rpc_task_setup *task_setup_data,
> > > > int how)
> > > >  {
> > > >         rpc_ops->read_setup(hdr, msg);
> > > > +       nfs_netfs_initiate_read(hdr);
> > > >         trace_nfs_initiate_read(hdr);
> > > >  }
> > > >
> > > >
> > > > Are you ok with these additions?  Something like this would
> > > > be required in the case of fscache configured and enabled,
> > > > because we could have some of the data in a read in
> > > > fscache, and some not.  That is the reason for the netfs
> > > > design, and why we need to be able to call the normal
> > > > NFS read IO path (netfs calls into issue_read, and we call
> > > > back via netfs_subreq_terminated)?
> > > >
> > > > @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > >         struct pnfs_layout_segment *pg_lseg;
> > > >         struct nfs_io_completion *pg_io_completion;
> > > >         struct nfs_direct_req   *pg_dreq;
> > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > +       void                    *pg_netfs;
> > > > +#endif
> > > >
> > > > @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > >         const struct nfs_rw_ops *rw_ops;
> > > >         struct nfs_io_completion *io_completion;
> > > >         struct nfs_direct_req   *dreq;
> > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > +       void                    *netfs;
> > > > +#endif
> > > >
> > > >
> > > > And these additions to pagelist.c?
> > > >
> > > > @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > nfs_pageio_descriptor *desc,
> > > >         hdr->good_bytes = mirror->pg_count;
> > > >         hdr->io_completion = desc->pg_io_completion;
> > > >         hdr->dreq = desc->pg_dreq;
> > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > +       if (desc->pg_netfs)
> > > > +               hdr->netfs = desc->pg_netfs;
> > > > +#endif
> > >
> > > Why the conditional?
> > >
> > Not really needed and I was thinking of removing it, so I'll do that.
> >
> > > >
> > > >
> > > > @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > *desc,
> > > >         desc->pg_lseg = NULL;
> > > >         desc->pg_io_completion = NULL;
> > > >         desc->pg_dreq = NULL;
> > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > +       desc->pg_netfs = NULL;
> > > > +#endif
> > > >
> > > >
> > > > @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > nfs_pageio_descriptor *desc,
> > > >
> > > >         desc->pg_io_completion = hdr->io_completion;
> > > >         desc->pg_dreq = hdr->dreq;
> > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > +       desc->pg_netfs = hdr->netfs;
> > > > +#endif
> > >
> > > Those all need wrapper functions instead of embedding #ifdefs.
> > >
> > Ok.
> >
> >
> >
> > > >
> > > >
> > > > > My expectation is that the standard I/O path should have minimal
> > > > > overhead, and should certainly not increase the overhead that we
> > > > > already have. Will this be addressed in future iterations of these
> > > > > patches?
> > > > >
> > > >
> > > > I will do what I can to satisfy what you want, either by fixing up
> > > > this patch or follow-on patches.  Hopefully the above questions
> > > > will clarify the next steps.
> > > >
> > >
> > > --
> > > Trond Myklebust
> > > Linux NFS client maintainer, Hammerspace
> > > trond.myklebust@hammerspace.com
> > >
> > >
>
Benjamin Maynard Nov. 12, 2022, 12:46 p.m. UTC | #7
Hi all,

I've been doing some more testing with these patches, I applied all of
the patches (v10 from
https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.

I have the following setup:

Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.

I have a 500GB file on the Source NFS Server, which I am then copying
to the NFS Client via the Re-Export Server.

On the first copy, I see heavy writes to /var/cache/fscache on the
re-export server, and once the file copy completes I see that
/var/cache/fscache is approximately 500GB in size. All good so far.

I then deleted that file from the NFS Client, and dropped the caches
just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).

I then performed another copy of the 500GB file on the NFS Client,
again via the Re-Export Server. What I expected would happen is that I
would see heavy reads from the /var/cache/fscache volume as the file
should be served from FS-Cache.

However what I actually saw was no reads whatsoever, FS-Cache seems to
be ignored and the file is pulled from the Source NFS Filer again. I
also see heavy writes to /var/cache/fscache, so it appears that
FS-Cache is overwriting its existing cache, and never using it.

I only have 104GB of memory on the Re-Export Server (with FS-Cache) so
it is not possible that the file is being served from the page cache.

We saw this behaviour before on an older set of the patches when our
mount between the Re-Export Server and the Source NFS Filer was using
the "sync" option, but we are now using the "async" option and the
same is happening.

Mount options:

Source NFS Server <-- Re-Export Server (with FS-Cache):

10.0.0.49:/files /srv/nfs/files nfs
rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49

Re-Export Server (with FS-Cache) <-- NFS Client:

10.0.0.3:/files /mnt/nfs nfs
rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3

It is also worth noting this behaviour is not unique to the re-export
use case. I see FS-Cache not being used with the following setup:

Source NFS Server <-- Client (with FS-Cache).

Thanks,
Ben


Kind Regards

Benjamin Maynard

Customer Engineer

benmaynard@google.com

Google, Inc.




On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
>
>
>
> > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> >
> > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> >>
> >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> >>>
> >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> >>>> wrote:
> >>>>>
> >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> >>>>>> APIs,
> >>>>>> but only when fscache is configured and enabled.
> >>>>>>
> >>>>>> The netfs API defines struct netfs_request_ops which must be
> >>>>>> filled
> >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> >>>>>> of
> >>>>>> the functions, the main one being the issue_read() function.
> >>>>>> The issue_read() function is called by the netfs layer when a
> >>>>>> read
> >>>>>> cannot be fulfilled locally, and must be sent to the server
> >>>>>> (either
> >>>>>> the cache is not active, or it is active but the data is not
> >>>>>> available).
> >>>>>> Once the read from the server is complete, netfs requires a call
> >>>>>> to
> >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> >>>>>> were
> >>>>>> read
> >>>>>> successfully, or an error.  Note that issue_read() is called with
> >>>>>> a
> >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> >>>>>> and
> >>>>>> contains a start and a length (both in bytes), and assumes the
> >>>>>> underlying
> >>>>>> netfs will return either an error on the whole region, or the
> >>>>>> number
> >>>>>> of bytes successfully read.
> >>>>>>
> >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> >>>>>> defined
> >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> >>>>>> to
> >>>>>> know how many RPCs will be sent and how the pages will be broken
> >>>>>> up
> >>>>>> into underlying RPCs, each of which will have their own
> >>>>>> completion
> >>>>>> and
> >>>>>> return code.  In contrast, netfs is subrequest based, a single
> >>>>>> subrequest may contain multiple pages, and a single subrequest is
> >>>>>> initiated with issue_read() and terminated with
> >>>>>> netfs_subreq_terminated().
> >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> >>>>>> the netfs API requirement on the single response to the whole
> >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> >>>>>> pgio layer.
> >>>>>>
> >>>>>> The approach taken with this patch is to allocate a small
> >>>>>> structure
> >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> >>>>>> number
> >>>>>> of bytes successfully transferred in the structure, and update
> >>>>>> these
> >>>>>> values
> >>>>>> as each RPC completes.  The refcount on the structure is used as
> >>>>>> a
> >>>>>> marker
> >>>>>> for the last RPC completion, is incremented in
> >>>>>> nfs_netfs_initiate_read(),
> >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> >>>>>> nfs_pgio_header
> >>>>>> contains a valid pointer to the data.  On the final put (which
> >>>>>> signals
> >>>>>> the final outstanding RPC is complete) in
> >>>>>> nfs_netfs_read_completion(),
> >>>>>> call netfs_subreq_terminated() with either the final error value
> >>>>>> (if
> >>>>>> one or more READs complete with an error) or the number of bytes
> >>>>>> successfully transferred (if all RPCs complete successfully).
> >>>>>> Note
> >>>>>> that when all RPCs complete successfully, the number of bytes
> >>>>>> transferred
> >>>>>> is capped to the length of the subrequest.  Capping the
> >>>>>> transferred
> >>>>>> length
> >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> >>>>>> netfs.
> >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> >>>>>> the
> >>>>>> corner case where NFS requests a full page at the end of the
> >>>>>> file,
> >>>>>> even when i_size reflects only a partial page (NFS overread).
> >>>>>>
> >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> >>>>>
> >>>>>
> >>>>> This is not doing what I asked for, which was to separate out the
> >>>>> fscache functionality, so that we can call that if and when it is
> >>>>> available.
> >>>>>
> >>>> I must have misunderstood then.
> >>>>
> >>>> The last feedback I have from you was that you wanted it to be
> >>>> an opt-in feature, and it was a comment on a previous patch
> >>>> to Kconfig.  I was proceeding the best I knew how, but
> >>>> let me try to get back on track.
> >>>>
> >>>>> Instead, it is just wrapping the NFS requests inside netfs
> >>>>> requests. As
> >>>>> it stands, that means it is just duplicating information, and
> >>>>> adding
> >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> >>>>> extra
> >>>>> indirect calls, and extra bloat to the inode).
> >>>>>
> >>>> I think I understand what you're saying but I'm not sure.  Let me
> >>>> ask some clarifying questions.
> >>>>
> >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> >>>> when it's configured, but not enabled (we mount without 'fsc').
> >>>> Am I right?
> >>>>
> >>>> Also, are you objecting to the design that to use fscache we now
> >>>> have to use netfs, specifically:
> >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> >>>> from the cache, then NFS is called back via netfs_issue_read
> >>>> and we use the normal NFS read pageio interface.  This requires
> >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> >>>> which is the reason for the small changes to pagelist.c
> >>>
> >>> I'm objecting to any middle layer "solution" that adds overhead to the
> >>> NFS I/O paths.
> >>>
> >> Got it.
> >>
> >>> I'm willing to consider solutions that are specific only to the fscache
> >>> use case (i.e. when the 'fsc' mount option is specified). However when
> >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> >>> extra memory allocations, extra indirect calls and larger inode
> >>> footprints.
> >>>
> >>> IOW: I want the code to optimise for the case of standard NFS, not for
> >>> the case of 'NFS with cachefs additions'.
> >>>
> >> I agree completely.  Are you seeing extra memory allocations
> >> happen on mounts without 'fsc' or is it more a concern of how
> >> some of the patches look?  We should not be calling any netfs or
> >> fscache code if 'fsc' is not on the mount and I don't see any in my
> >> testing. So either there's a misunderstanding here, or there's a
> >> bug I'm missing.
> >>
> >> If fscache is not configured, then nfs_netfs_read_folio() and
> >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> >> If it's configured but not enabled, then the checks for
> >> netfs_inode(inode)->cache should skip over any netfs code.
> >> But maybe there's a non-obvious bug you're seeing and
> >> somehow netfs is still getting called?  Because I cannot
> >> see netfs getting called if 'fsc' is not on the mount in my
> >> tests.
> >>
> >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> >> {
> >>       if (!netfs_inode(folio_inode(folio))->cache)
> >>               return -ENOBUFS;
> >>
> >>       return netfs_read_folio(file, folio);
> >> }
> >>
> >> int nfs_netfs_readahead(struct readahead_control *ractl)
> >> {
> >>       struct inode *inode = ractl->mapping->host;
> >>
> >>       if (!netfs_inode(inode)->cache)
> >>               return -ENOBUFS;
> >>
> >>       netfs_readahead(ractl);
> >>       return 0;
> >> }
> >>
> >>
> >>>>
> >>>> Can you be more specific as to the portions of the patch you don't
> >>>> like
> >>>> so I can move it in the right direction?
> >>>>
> >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> >>>> you're
> >>>> ok with it though, since you mention "extra bloat to the inode".
> >>>> Do you object to this even though it's wrapped in an
> >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> >>>> extra size be added to nfs_inode?
> >>>>
> >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> >>>>       __u64 write_io;
> >>>>       __u64 read_io;
> >>>> #ifdef CONFIG_NFS_FSCACHE
> >>>> -       struct fscache_cookie   *fscache;
> >>>> -#endif
> >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> >>>> */
> >>>> +#else
> >>>>       struct inode            vfs_inode;
> >>>> +#endif
> >>>> +
> >>>
> >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> >>> point, however for now NFS is not unconditionally opting into the netfs
> >>> project. If we're to ever do that, then I want to see streamlined code
> >>> for the standard I/O case.
> >>>
> >> Ok and understood about standard I/O case.
> >>
> >> I was thinking how we might not increase the size, but I don't think
> >> I can make it work.
> >>
> >> I thought we could change to something like the below, without an
> >> embedded struct inode:
> >>
> >> @@ -204,9 +208,11 @@ struct nfs_inode {
> >>       __u64 write_io;
> >>       __u64 read_io;
> >> #ifdef CONFIG_NFS_FSCACHE
> >> -       struct fscache_cookie   *fscache;
> >> -#endif
> >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> >> +#else
> >>       struct inode            vfs_inode;
> >> +#endif
> >> +
> >>
> >> Then I would need to alloc/free a netfs_inode at the time of
> >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> >> macro cannot work, because it requires an embedded "struct inode"
> >> due to "container_of" use:
> >>
> >> +#ifdef CONFIG_NFS_FSCACHE
> >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> >> +{
> >> +       return &nfsi->netfs.inode;
> >> +}
> >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> >> +{
> >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> >> +}
> >> +#else
> >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> >> +{
> >> +       return &nfsi->vfs_inode;
> >> +}
> >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> >> {
> >>       return container_of(inode, struct nfs_inode, vfs_inode);
> >> }
> >> +#endif
> >>
> >>
> >
> > Actually Trond maybe we can achieve a "0 length increase" of
> > nfs_inode if dhowells would take a patch to modify the definition
> > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > patch below.  What do you think?
>
> That works for me.
>
> >
> > I think maybe this could be a follow-on patch and if you/dhowells
> > think it's an ok idea I can try to work out what is needed across
> > the tree.  I thought about it more and I kinda agree that in the
> > case for NFS where fscache is "configured but not enabled",
> > then even though we're only adding 24 bytes to the nfs_inode
> > each time, it will add up so it is worth at least a discussion.
> >
> > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > index f2402ddeafbf..195714f1c355 100644
> > --- a/include/linux/netfs.h
> > +++ b/include/linux/netfs.h
> > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> >                                     bool was_async);
> >
> > -/*
> > - * Per-inode context.  This wraps the VFS inode.
> > - */
> > -struct netfs_inode {
> > -       struct inode            inode;          /* The VFS inode */
> > +struct netfs_info {
> >       const struct netfs_request_ops *ops;
> > #if IS_ENABLED(CONFIG_FSCACHE)
> >       struct fscache_cookie   *cache;
> > @@ -130,6 +126,14 @@ struct netfs_inode {
> >       loff_t                  remote_i_size;  /* Size of the remote file */
> > };
> >
> > +/*
> > + * Per-inode context.  This wraps the VFS inode.
> > + */
> > +struct netfs_inode {
> > +       struct inode            inode;          /* The VFS inode */
> > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > +};
> > +
> > /*
> > * Resources required to do operations on a cache.
> > */
> > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > *netfs_inode(struct inode *inode)
> > static inline void netfs_inode_init(struct netfs_inode *ctx,
> >                                   const struct netfs_request_ops *ops)
> > {
> > -       ctx->ops = ops;
> > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > +       /* FIXME: Check for NULL */
> > +       ctx->netfs->ops = ops;
> > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > #if IS_ENABLED(CONFIG_FSCACHE)
> > -       ctx->cache = NULL;
> > +       ctx->netfs->cache = NULL;
> > #endif
> > }
> >
> >
> >
> >>
> >>>>
> >>>>
> >>>> Are you ok with the stub functions which are placed in fscache.h, and
> >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> >>>> or a 1-liner (nfs_netfs_readpage_release)?
> >>>>
> >>>> #else /* CONFIG_NFS_FSCACHE */
> >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> >>>> *hdr) {}
> >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> >>>> *hdr) {}
> >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> >>>> +{
> >>>> +       unlock_page(req->wb_page);
> >>>> +}
> >>>> static inline void nfs_fscache_release_super_cookie(struct
> >>>> super_block *sb) {}
> >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> >>>>
> >>>>
> >>>> Do you object to the below?  If so, then do you want
> >>>> #ifdef CONFIG_NFS_FSCACHE here?
> >>>>
> >>>> -- a/fs/nfs/inode.c
> >>>> +++ b/fs/nfs/inode.c
> >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> >>>> super_block *sb)
> >>>> #ifdef CONFIG_NFS_V4_2
> >>>>       nfsi->xattr_cache = NULL;
> >>>> #endif
> >>>> +       nfs_netfs_inode_init(nfsi);
> >>>> +
> >>>>       return VFS_I(nfsi);
> >>>> }
> >>>> EXPORT_SYMBOL_GPL(nfs_alloc_inode);
> >>>>
> >>>>
> >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> >>>> how about the below calls to netfs from nfs_read_folio and
> >>>> nfs_readahead into equivalent netfs calls?  So when
> >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> >>>> ('fsc' not on mount), these netfs functions do immediately call
> >>>> netfs_alloc_request().  But I wonder if we could simply add a
> >>>> check to see if fscache is enabled on the mount, and skip
> >>>> over to satisfy what you want.  Am I understanding what you
> >>>> want?
> >>>
> >>> Quite frankly, I'd prefer that we just split out the functionality that
> >>> is needed from the netfs code so that it can be optimised. However I'm
> >>> not interested enough in the cachefs functionality to work on that
> >>> myself. ...and as I indicated above, I might be OK with opting into the
> >>> netfs project, once the overhead can be made to disappear.
> >>>
> >> Understood.
> >>
> >> If you think it makes more sense, I can move some of the nfs_netfs_*
> >> functions into a netfs.c file as a starting point.  Or that can maybe
> >> be done in a future patchset?
> >>
> >> For now I was equating netfs and fscache together so we can
> >> move on from the much older and single-page limiting fscache
> >> interface that is likely to go away soon.
> >>
> >>>>
> >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> >>>> folio *folio)
> >>>>       if (NFS_STALE(inode))
> >>>>               goto out_unlock;
> >>>>
> >>>> +       ret = nfs_netfs_read_folio(file, folio);
> >>>> +       if (!ret)
> >>>> +               goto out;
> >>>> +
> >>>>
> >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> >>>> *ractl)
> >>>>       if (NFS_STALE(inode))
> >>>>               goto out;
> >>>>
> >>>> +       ret = nfs_netfs_readahead(ractl);
> >>>> +       if (!ret)
> >>>> +               goto out;
> >>>> +
> >>>>
> >> The above wrappers should prevent any additional overhead when fscache
> >> is not enabled.  As far as I know these work to avoid calling netfs
> >> when 'fsc' is not on the mount.
> >>
> >>>>
> >>>> And how about these calls from different points in the read
> >>>> path to the earlier mentioned stub functions?
> >>>>
> >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> >>>>
> >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> >>>> {
> >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> >>>>       struct page *page = req->wb_page;
> >>>>
> >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> >>>>> s_id,
> >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> >>>> -               (long long)req_offset(req));
> >>>> -
> >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> >>>> ETIMEDOUT)
> >>>>               SetPageError(page);
> >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> >>>> -               if (PageUptodate(page))
> >>>> -                       nfs_fscache_write_page(inode, page);
> >>>> -               unlock_page(page);
> >>>> -       }
> >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> >>>> +               nfs_netfs_readpage_release(req);
> >>>> +
> >>>
> >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> >>> going to need to change when we move it to use folios natively anyway.
> >>>
> >> Ok, how about I make it conditional on whether fscache is configured
> >> and enabled then, similar to the nfs_netfs_read_folio() and
> >> nfs_netfs_readahead()?  Below is what that would look like.
> >> I could inline the code in nfs_netfs_readpage_release() if you
> >> think it would be clearer.
> >>
> >> static void nfs_readpage_release(struct nfs_page *req, int error)
> >> {
> >>       struct page *page = req->wb_page;
> >>
> >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> >>               SetPageError(page);
> >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> >> #ifndef CONFIG_NFS_FSCACHE
> >>               unlock_page(req->wb_page);
> >> #else
> >>               nfs_netfs_readpage_release(req);
> >> #endif
> >>       nfs_release_request(req);
> >> }
> >>
> >>
> >> void nfs_netfs_readpage_release(struct nfs_page *req)
> >> {
> >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> >>
> >>   /*
> >>    * If fscache is enabled, netfs will unlock pages.
> >>    */
> >>   if (netfs_inode(inode)->cache)
> >>       return;
> >>
> >>   unlock_page(req->wb_page);
> >> }
> >>
> >>
> >>>>       nfs_release_request(req);
> >>>> }
> >>>>
> >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> >>>> nfs_pgio_header *hdr)
> >>>>               nfs_list_remove_request(req);
> >>>>               nfs_readpage_release(req, error);
> >>>>       }
> >>>> +       nfs_netfs_read_completion(hdr);
> >>>> +
> >>>> out:
> >>>>       hdr->release(hdr);
> >>>> }
> >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> >>>> nfs_pgio_header *hdr,
> >>>>                             struct rpc_task_setup *task_setup_data,
> >>>> int how)
> >>>> {
> >>>>       rpc_ops->read_setup(hdr, msg);
> >>>> +       nfs_netfs_initiate_read(hdr);
> >>>>       trace_nfs_initiate_read(hdr);
> >>>> }
> >>>>
> >>>>
> >>>> Are you ok with these additions?  Something like this would
> >>>> be required in the case of fscache configured and enabled,
> >>>> because we could have some of the data in a read in
> >>>> fscache, and some not.  That is the reason for the netfs
> >>>> design, and why we need to be able to call the normal
> >>>> NFS read IO path (netfs calls into issue_read, and we call
> >>>> back via netfs_subreq_terminated)?
> >>>>
> >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> >>>>       struct pnfs_layout_segment *pg_lseg;
> >>>>       struct nfs_io_completion *pg_io_completion;
> >>>>       struct nfs_direct_req   *pg_dreq;
> >>>> +#ifdef CONFIG_NFS_FSCACHE
> >>>> +       void                    *pg_netfs;
> >>>> +#endif
> >>>>
> >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> >>>>       const struct nfs_rw_ops *rw_ops;
> >>>>       struct nfs_io_completion *io_completion;
> >>>>       struct nfs_direct_req   *dreq;
> >>>> +#ifdef CONFIG_NFS_FSCACHE
> >>>> +       void                    *netfs;
> >>>> +#endif
> >>>>
> >>>>
> >>>> And these additions to pagelist.c?
> >>>>
> >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> >>>> nfs_pageio_descriptor *desc,
> >>>>       hdr->good_bytes = mirror->pg_count;
> >>>>       hdr->io_completion = desc->pg_io_completion;
> >>>>       hdr->dreq = desc->pg_dreq;
> >>>> +#ifdef CONFIG_NFS_FSCACHE
> >>>> +       if (desc->pg_netfs)
> >>>> +               hdr->netfs = desc->pg_netfs;
> >>>> +#endif
> >>>
> >>> Why the conditional?
> >>>
> >> Not really needed and I was thinking of removing it, so I'll do that.
> >>
> >>>>
> >>>>
> >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> >>>> *desc,
> >>>>       desc->pg_lseg = NULL;
> >>>>       desc->pg_io_completion = NULL;
> >>>>       desc->pg_dreq = NULL;
> >>>> +#ifdef CONFIG_NFS_FSCACHE
> >>>> +       desc->pg_netfs = NULL;
> >>>> +#endif
> >>>>
> >>>>
> >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> >>>> nfs_pageio_descriptor *desc,
> >>>>
> >>>>       desc->pg_io_completion = hdr->io_completion;
> >>>>       desc->pg_dreq = hdr->dreq;
> >>>> +#ifdef CONFIG_NFS_FSCACHE
> >>>> +       desc->pg_netfs = hdr->netfs;
> >>>> +#endif
> >>>
> >>> Those all need wrapper functions instead of embedding #ifdefs.
> >>>
> >> Ok.
> >>
> >>
> >>
> >>>>
> >>>>
> >>>>> My expectation is that the standard I/O path should have minimal
> >>>>> overhead, and should certainly not increase the overhead that we
> >>>>> already have. Will this be addressed in future iterations of these
> >>>>> patches?
> >>>>>
> >>>>
> >>>> I will do what I can to satisfy what you want, either by fixing up
> >>>> this patch or follow-on patches.  Hopefully the above questions
> >>>> will clarify the next steps.
> >>>>
> >>>
> >>> --
> >>> Trond Myklebust
> >>> Linux NFS client maintainer, Hammerspace
> >>> trond.myklebust@hammerspace.com
>
>
>
> Trond Myklebust
> CTO, Hammerspace Inc
> 1900 S Norfolk St, Suite 350 - #45
> San Mateo, CA 94403
>
> www.hammer.space
>
>
David Wysochanski Nov. 14, 2022, 10:41 a.m. UTC | #8
Hi Ben,

Thanks for testing these patches.  More below.

On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
>
> Hi all,
>
> I've been doing some more testing with these patches, I applied all of
> the patches (v10 from
> https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
>
> I have the following setup:
>
> Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
>
> I have a 500Gb file on the Source NFS Server, which I am then copying
> to the NFS Client via the Re-Export Server.
>
> On the first copy, I see heavy writes to /var/cache/fscache on the
> re-export server, and once the file copy completes I see that
> /var/cache/fscache is approximately 500Gb in size. All good so far.
>
> I then deleted that file from the NFS Client, and dropped the caches
> just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
>
> I then performed another copy of the 500Gb file on the NFS Client,
> again via the Re-Export Server. What I expected would happen is that I
> would see heavy reads from the /var/cache/fscache volume as the file
> should be served from FS-Cache.
>
> However what I actually saw was no reads whatsoever, FS-Cache seems to
> be ignored and the file is pulled from the Source NFS Filer again. I
> also see heavy writes to /var/cache/fscache, so it appears that
> FS-Cache is overwriting its existing cache, and never using it.
>
Due to use of "drop_caches" this is almost certainly the known issue #1
I mentioned in the opening post of this series:
https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/

The above issue will be fixed with the following patch which has not
been merged yet:
https://www.mail-archive.com/linux-cachefs@redhat.com/msg03043.html

Do you have time to do another test to verify that is the case?
If so, I can re-post that patch on top of the first 5 patches in this series,
as well as a second patch that allows NFS to use it.


> I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> it is not possible that the file is being served from the page cache.
>
> We saw this behaviour before on an older set of the patches when our
> mount between the Re-Export Server and the Source NFS Filer was using
> the "sync" option, but we are now using the "async" option and the
> same is happening.
>
> Mount options:
>
> Source NFS Server <-- Re-Export Server (with FS-Cache):
>
> 10.0.0.49:/files /srv/nfs/files nfs
> rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
>
> Re-Export Server (with FS-Cache) <-- NFS Client:
>
> 10.0.0.3:/files /mnt/nfs nfs
> rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
>
> It is also worth noting this behaviour is not unique to the re-export
> use case. I see FS-Cache not being used with the following setup:
>
> Source NFS Server <-- Client (with FS-Cache).
>
> Thanks,
> Ben
>
>
> Kind Regards
>
> Benjamin Maynard
>
> Customer Engineer
>
> benmaynard@google.com
>
> Google, Inc.
>
>
>
>
> On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> >
> >
> >
> > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > >
> > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > >>
> > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > >>>
> > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > >>>> wrote:
> > >>>>>
> > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > >>>>>> APIs,
> > >>>>>> but only when fscache is configured and enabled.
> > >>>>>>
> > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > >>>>>> filled
> > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > >>>>>> of
> > >>>>>> the functions, the main one being the issue_read() function.
> > >>>>>> The issue_read() function is called by the netfs layer when a
> > >>>>>> read
> > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > >>>>>> (either
> > >>>>>> the cache is not active, or it is active but the data is not
> > >>>>>> available).
> > >>>>>> Once the read from the server is complete, netfs requires a call
> > >>>>>> to
> > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > >>>>>> were
> > >>>>>> read
> > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > >>>>>> a
> > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > >>>>>> and
> > >>>>>> contains a start and a length (both in bytes), and assumes the
> > >>>>>> underlying
> > >>>>>> netfs will return either an error on the whole region, or the
> > >>>>>> number
> > >>>>>> of bytes successfully read.
> > >>>>>>
> > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > >>>>>> defined
> > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > >>>>>> to
> > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > >>>>>> up
> > >>>>>> into underlying RPCs, each of which will have their own
> > >>>>>> completion
> > >>>>>> and
> > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > >>>>>> initiated with issue_read() and terminated with
> > >>>>>> netfs_subreq_terminated().
> > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > >>>>>> the netfs API requirement on the single response to the whole
> > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > >>>>>> pgio layer.
> > >>>>>>
> > >>>>>> The approach taken with this patch is to allocate a small
> > >>>>>> structure
> > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > >>>>>> number
> > >>>>>> of bytes successfully transferred in the structure, and update
> > >>>>>> these
> > >>>>>> values
> > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > >>>>>> a
> > >>>>>> marker
> > >>>>>> for the last RPC completion, is incremented in
> > >>>>>> nfs_netfs_initiate_read(),
> > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > >>>>>> nfs_pgio_header
> > >>>>>> contains a valid pointer to the data.  On the final put (which
> > >>>>>> signals
> > >>>>>> the final outstanding RPC is complete) in
> > >>>>>> nfs_netfs_read_completion(),
> > >>>>>> call netfs_subreq_terminated() with either the final error value
> > >>>>>> (if
> > >>>>>> one or more READs complete with an error) or the number of bytes
> > >>>>>> successfully transferred (if all RPCs complete successfully).
> > >>>>>> Note
> > >>>>>> that when all RPCs complete successfully, the number of bytes
> > >>>>>> transferred
> > >>>>>> is capped to the length of the subrequest.  Capping the
> > >>>>>> transferred
> > >>>>>> length
> > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > >>>>>> netfs.
> > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > >>>>>> the
> > >>>>>> corner case where NFS requests a full page at the end of the
> > >>>>>> file,
> > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > >>>>>>
> > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > >>>>>
> > >>>>>
> > >>>>> This is not doing what I asked for, which was to separate out the
> > >>>>> fscache functionality, so that we can call that if and when it is
> > >>>>> available.
> > >>>>>
> > >>>> I must have misunderstood then.
> > >>>>
> > >>>> The last feedback I have from you was that you wanted it to be
> > >>>> an opt-in feature, and it was a comment on a previous patch
> > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > >>>> let me try to get back on track.
> > >>>>
> > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > >>>>> requests. As
> > >>>>> it stands, that means it is just duplicating information, and
> > >>>>> adding
> > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > >>>>> extra
> > >>>>> indirect calls, and extra bloat to the inode).
> > >>>>>
> > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > >>>> ask some clarifying questions.
> > >>>>
> > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > >>>> Am I right?
> > >>>>
> > >>>> Also, are you objecting to the design that to use fscache we now
> > >>>> have to use netfs, specifically:
> > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > >>>> from the cache, then NFS is called back via netfs_issue_read
> > >>>> and we use the normal NFS read pageio interface.  This requires
> > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > >>>> which is the reason for the small changes to pagelist.c
> > >>>
> > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > >>> NFS I/O paths.
> > >>>
> > >> Got it.
> > >>
> > >>> I'm willing to consider solutions that are specific only to the fscache
> > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > >>> extra memory allocations, extra indirect calls and larger inode
> > >>> footprints.
> > >>>
> > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > >>> the case of 'NFS with cachefs additions'.
> > >>>
> > >> I agree completely.  Are you seeing extra memory allocations
> > >> happen on mounts without 'fsc' or is it more a concern of how
> > >> some of the patches look?  We should not be calling any netfs or
> > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > >> testing. So either there's a misunderstanding here, or there's a
> > >> bug I'm missing.
> > >>
> > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > >> If it's configured but not enabled, then the checks for
> > >> netfs_inode(inode)->cache should skip over any netfs code.
> > >> But maybe there's a non-obvious bug you're seeing and
> > >> somehow netfs is still getting called?  Because I cannot
> > >> see netfs getting called if 'fsc' is not on the mount in my
> > >> tests.
> > >>
> > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > >> {
> > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > >>               return -ENOBUFS;
> > >>
> > >>       return netfs_read_folio(file, folio);
> > >> }
> > >>
> > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > >> {
> > >>       struct inode *inode = ractl->mapping->host;
> > >>
> > >>       if (!netfs_inode(inode)->cache)
> > >>               return -ENOBUFS;
> > >>
> > >>       netfs_readahead(ractl);
> > >>       return 0;
> > >> }
> > >>
> > >>
> > >>>>
> > >>>> Can you be more specific as to the portions of the patch you don't
> > >>>> like
> > >>>> so I can move it in the right direction?
> > >>>>
> > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > >>>> you're
> > >>>> ok with it though, since you mention "extra bloat to the inode".
> > >>>> Do you object to this even though it's wrapped in an
> > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > >>>> extra size be added to nfs_inode?
> > >>>>
> > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > >>>>       __u64 write_io;
> > >>>>       __u64 read_io;
> > >>>> #ifdef CONFIG_NFS_FSCACHE
> > >>>> -       struct fscache_cookie   *fscache;
> > >>>> -#endif
> > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > >>>> */
> > >>>> +#else
> > >>>>       struct inode            vfs_inode;
> > >>>> +#endif
> > >>>> +
> > >>>
> > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > >>> point, however for now NFS is not unconditionally opting into the netfs
> > >>> project. If we're to ever do that, then I want to see streamlined code
> > >>> for the standard I/O case.
> > >>>
> > >> Ok and understood about standard I/O case.
> > >>
> > >> I was thinking how we might not increase the size, but I don't think
> > >> I can make it work.
> > >>
> > >> I thought we could change to something like the below, without an
> > >> embedded struct inode:
> > >>
> > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > >>       __u64 write_io;
> > >>       __u64 read_io;
> > >> #ifdef CONFIG_NFS_FSCACHE
> > >> -       struct fscache_cookie   *fscache;
> > >> -#endif
> > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > >> +#else
> > >>       struct inode            vfs_inode;
> > >> +#endif
> > >> +
> > >>
> > >> Then I would need to alloc/free a netfs_inode at the time of
> > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > >> macro cannot work, because it requires an embedded "struct inode"
> > >> due to "container_of" use:
> > >>
> > >> +#ifdef CONFIG_NFS_FSCACHE
> > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > >> +{
> > >> +       return &nfsi->netfs.inode;
> > >> +}
> > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > >> +{
> > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > >> +}
> > >> +#else
> > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > >> +{
> > >> +       return &nfsi->vfs_inode;
> > >> +}
> > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > >> {
> > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > >> }
> > >> +#endif
> > >>
> > >>
> > >
> > > Actually Trond maybe we can achieve a "0 length increase" of
> > > nfs_inode if dhowells would take a patch to modify the definition
> > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > patch below.  What do you think?
> >
> > That works for me.
> >
> > >
> > > I think maybe this could be a follow-on patch and if you/dhowells
> > > think it's an ok idea I can try to work out what is needed across
> > > the tree.  I thought about it more and I kinda agree that in the
> > > case for NFS where fscache is "configured but not enabled",
> > > then even though we're only adding 24 bytes to the nfs_inode
> > > each time, it will add up so it is worth at least a discussion.
> > >
> > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > index f2402ddeafbf..195714f1c355 100644
> > > --- a/include/linux/netfs.h
> > > +++ b/include/linux/netfs.h
> > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > >                                     bool was_async);
> > >
> > > -/*
> > > - * Per-inode context.  This wraps the VFS inode.
> > > - */
> > > -struct netfs_inode {
> > > -       struct inode            inode;          /* The VFS inode */
> > > +struct netfs_info {
> > >       const struct netfs_request_ops *ops;
> > > #if IS_ENABLED(CONFIG_FSCACHE)
> > >       struct fscache_cookie   *cache;
> > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > };
> > >
> > > +/*
> > > + * Per-inode context.  This wraps the VFS inode.
> > > + */
> > > +struct netfs_inode {
> > > +       struct inode            inode;          /* The VFS inode */
> > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > +};
> > > +
> > > /*
> > > * Resources required to do operations on a cache.
> > > */
> > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > *netfs_inode(struct inode *inode)
> > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > >                                   const struct netfs_request_ops *ops)
> > > {
> > > -       ctx->ops = ops;
> > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > +       /* FIXME: Check for NULL */
> > > +       ctx->netfs->ops = ops;
> > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > -       ctx->cache = NULL;
> > > +       ctx->netfs->cache = NULL;
> > > #endif
> > > }
> > >
> > >
> > >
> > >>
> > >>>>
> > >>>>
> > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > >>>>
> > >>>> #else /* CONFIG_NFS_FSCACHE */
> > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > >>>> *hdr) {}
> > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > >>>> *hdr) {}
> > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > >>>> +{
> > >>>> +       unlock_page(req->wb_page);
> > >>>> +}
> > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > >>>> super_block *sb) {}
> > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > >>>>
> > >>>>
> > >>>> Do you object to the below?  If so, then do you want
> > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > >>>>
> > >>>> -- a/fs/nfs/inode.c
> > >>>> +++ b/fs/nfs/inode.c
> > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > >>>> super_block *sb)
> > >>>> #ifdef CONFIG_NFS_V4_2
> > >>>>       nfsi->xattr_cache = NULL;
> > >>>> #endif
> > >>>> +       nfs_netfs_inode_init(nfsi);
> > >>>> +
> > >>>>       return VFS_I(nfsi);
> > >>>> }
> > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_i
> > >>>> node);
> > >>>>
> > >>>>
> > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > >>>> how about the below calls to netfs from nfs_read_folio and
> > >>>> nfs_readahead into equivalent netfs calls?  So when
> > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > >>>> check to see if fscache is enabled on the mount, and skip
> > >>>> over to satisfy what you want.  Am I understanding what you
> > >>>> want?
> > >>>
> > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > >>> is needed from the netfs code so that it can be optimised. However I'm
> > >>> not interested enough in the cachefs functionality to work on that
> > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > >>> netfs project, once the overhead can be made to disappear.
> > >>>
> > >> Understood.
> > >>
> > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > >> be done in a future patchset?
> > >>
> > >> For now I was equating netfs and fscache together so we can
> > >> move on from the much older and single-page limiting fscache
> > >> interface that is likely to go away soon.
> > >>
> > >>>>
> > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > >>>> folio *folio)
> > >>>>       if (NFS_STALE(inode))
> > >>>>               goto out_unlock;
> > >>>>
> > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > >>>> +       if (!ret)
> > >>>> +               goto out;
> > >>>> +
> > >>>>
> > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > >>>> *ractl)
> > >>>>       if (NFS_STALE(inode))
> > >>>>               goto out;
> > >>>>
> > >>>> +       ret = nfs_netfs_readahead(ractl);
> > >>>> +       if (!ret)
> > >>>> +               goto out;
> > >>>> +
> > >>>>
> > >> The above wrappers should prevent any additional overhead when fscache
> > >> is not enabled.  As far as I know these work to avoid calling netfs
> > >> when 'fsc' is not on the mount.
> > >>
> > >>>>
> > >>>> And how about these calls from different points in the read
> > >>>> path to the earlier mentioned stub functions?
> > >>>>
> > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > >>>>
> > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > >>>> {
> > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > >>>>       struct page *page = req->wb_page;
> > >>>>
> > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > >>>>> s_id,
> > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > >>>> -               (long long)req_offset(req));
> > >>>> -
> > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > >>>> ETIMEDOUT)
> > >>>>               SetPageError(page);
> > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > >>>> -               if (PageUptodate(page))
> > >>>> -                       nfs_fscache_write_page(inode, page);
> > >>>> -               unlock_page(page);
> > >>>> -       }
> > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > >>>> +               nfs_netfs_readpage_release(req);
> > >>>> +
> > >>>
> > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > >>> going to need to change when we move it to use folios natively anyway.
> > >>>
> > >> Ok, how about I make it conditional on whether fscache is configured
> > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > >> nfs_netfs_readahead()?  Below is what that would look like.
> > >> I could inline the code in nfs_netfs_readpage_release() if you
> > >> think it would be clearer.
> > >>
> > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > >> {
> > >>       struct page *page = req->wb_page;
> > >>
> > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > >>               SetPageError(page);
> > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > >> #ifndef CONFIG_NFS_FSCACHE
> > >>               unlock_page(req->wb_page);
> > >> #else
> > >>               nfs_netfs_readpage_release(req);
> > >> #endif
> > >>       nfs_release_request(req);
> > >> }
> > >>
> > >>
> > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > >> {
> > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > >>
> > >>   /*
> > >>    * If fscache is enabled, netfs will unlock pages.
> > >>    */
> > >>   if (netfs_inode(inode)->cache)
> > >>       return;
> > >>
> > >>   unlock_page(req->wb_page);
> > >> }
> > >>
> > >>
> > >>>>       nfs_release_request(req);
> > >>>> }
> > >>>>
> > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > >>>> nfs_pgio_header *hdr)
> > >>>>               nfs_list_remove_request(req);
> > >>>>               nfs_readpage_release(req, error);
> > >>>>       }
> > >>>> +       nfs_netfs_read_completion(hdr);
> > >>>> +
> > >>>> out:
> > >>>>       hdr->release(hdr);
> > >>>> }
> > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > >>>> nfs_pgio_header *hdr,
> > >>>>                             struct rpc_task_setup *task_setup_data,
> > >>>> int how)
> > >>>> {
> > >>>>       rpc_ops->read_setup(hdr, msg);
> > >>>> +       nfs_netfs_initiate_read(hdr);
> > >>>>       trace_nfs_initiate_read(hdr);
> > >>>> }
> > >>>>
> > >>>>
> > >>>> Are you ok with these additions?  Something like this would
> > >>>> be required in the case of fscache configured and enabled,
> > >>>> because we could have some of the data in a read in
> > >>>> fscache, and some not.  That is the reason for the netfs
> > >>>> design, and why we need to be able to call the normal
> > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > >>>> back via netfs_subreq_terminated)?
> > >>>>
> > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > >>>>       struct pnfs_layout_segment *pg_lseg;
> > >>>>       struct nfs_io_completion *pg_io_completion;
> > >>>>       struct nfs_direct_req   *pg_dreq;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       void                    *pg_netfs;
> > >>>> +#endif
> > >>>>
> > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > >>>>       const struct nfs_rw_ops *rw_ops;
> > >>>>       struct nfs_io_completion *io_completion;
> > >>>>       struct nfs_direct_req   *dreq;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       void                    *netfs;
> > >>>> +#endif
> > >>>>
> > >>>>
> > >>>> And these additions to pagelist.c?
> > >>>>
> > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > >>>> nfs_pageio_descriptor *desc,
> > >>>>       hdr->good_bytes = mirror->pg_count;
> > >>>>       hdr->io_completion = desc->pg_io_completion;
> > >>>>       hdr->dreq = desc->pg_dreq;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       if (desc->pg_netfs)
> > >>>> +               hdr->netfs = desc->pg_netfs;
> > >>>> +#endif
> > >>>
> > >>> Why the conditional?
> > >>>
> > >> Not really needed and I was thinking of removing it, so I'll do that.
> > >>
> > >>>>
> > >>>>
> > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > >>>> *desc,
> > >>>>       desc->pg_lseg = NULL;
> > >>>>       desc->pg_io_completion = NULL;
> > >>>>       desc->pg_dreq = NULL;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       desc->pg_netfs = NULL;
> > >>>> +#endif
> > >>>>
> > >>>>
> > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > >>>> nfs_pageio_descriptor *desc,
> > >>>>
> > >>>>       desc->pg_io_completion = hdr->io_completion;
> > >>>>       desc->pg_dreq = hdr->dreq;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       desc->pg_netfs = hdr->netfs;
> > >>>> +#endif
> > >>>
> > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > >>>
> > >> Ok.
> > >>
> > >>
> > >>
> > >>>>
> > >>>>
> > >>>>> My expectation is that the standard I/O path should have minimal
> > >>>>> overhead, and should certainly not increase the overhead that we
> > >>>>> already have. Will this be addressed in future iterations of these
> > >>>>> patches?
> > >>>>>
> > >>>>
> > >>>> I will do what I can to satisfy what you want, either by fixing up
> > >>>> this patch or follow-on patches.  Hopefully the above questions
> > >>>> will clarify the next steps.
> > >>>>
> > >>>
> > >>> --
> > >>> Trond Myklebust
> > >>> Linux NFS client maintainer, Hammerspace
> > >>> trond.myklebust@hammerspace.com
> >
> >
> >
> > Trond Myklebust
> > CTO, Hammerspace Inc
> > 1900 S Norfolk St, Suite 350 - #45
> > San Mateo, CA 94403
> >
> > www.hammer.space
> >
> >
>
Benjamin Maynard Nov. 14, 2022, 12:42 p.m. UTC | #9
Thanks Dave for getting back to me so quickly.

> Due to use of "drop_caches" this is almost certainly the known issue #1
> I mentioned in the opening post of this series:
> https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/

Apologies, I completely missed the known issues in the original
opening message of the series. Just to clarify, I was only ever
dropping the caches on the "NFS Client" in the below relationship:

Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.

I never dropped the caches on the Re-Export Server (the server running
FS-Cache) at any point.

However my rsize was lower than my readahead value. I've since corrected that:

benmaynard@demo-cluster-1-26hm:~$ cat /proc/mounts | grep nfs
10.0.0.49:/files /srv/nfs/files nfs
rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
0 0

benmaynard@demo-cluster-1-26hm:~$ findmnt -rnu -t nfs,nfs4 -o MAJ:MIN,TARGET
0:52 /srv/nfs/files
benmaynard@demo-cluster-1-26hm:~$ cat /sys/class/bdi/0\:52/read_ahead_kb
512

With this configuration I see the same issue, FS-Cache never reads
from /var/cache/fscache, and copying the same file always leads to
heavy writes to /var/cache/fscache (the cache is overwriting itself).

I have also tried this copy without clearing the caches on any server
in the chain, and the same happens.

Would you expect this behaviour even though rsize > read ahead? Would
you expect the referenced patch to fix this?

I tried to apply the patch you suggested
(https://www.mail-archive.com/linux-cachefs@redhat.com/msg03043.html)
but it did not apply cleanly, and I ran out of time to troubleshoot. I
should get some more time on Wednesday and I can re-try.


Kind Regards
Benjamin Maynard


Kind Regards

Benjamin Maynard

Customer Engineer

benmaynard@google.com

Google, Inc.




On Mon, 14 Nov 2022 at 10:41, David Wysochanski <dwysocha@redhat.com> wrote:
>
> Hi Ben,
>
> Thanks for testing these patches.  More below.
>
> On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> >
> > Hi all,
> >
> > I've been doing some more testing with these patches, I applied all of
> > the patches (v10 from
> > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> >
> > I have the following setup:
> >
> > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> >
> > I have a 500Gb file on the Source NFS Server, which I am then copying
> > to the NFS Client via the Re-Export Server.
> >
> > On the first copy, I see heavy writes to /var/cache/fscache on the
> > re-export server, and once the file copy completes I see that
> > /var/cache/fscache is approximately 500Gb in size. All good so far.
> >
> > I then deleted that file from the NFS Client, and dropped the caches
> > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> >
> > I then performed another copy of the 500Gb file on the NFS Client,
> > again via the Re-Export Server. What I expected would happen is that I
> > would see heavy reads from the /var/cache/fscache volume as the file
> > should be served from FS-Cache.
> >
> > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > be ignored and the file is pulled from the Source NFS Filer again. I
> > also see heavy writes to /var/cache/fscache, so it appears that
> > FS-Cache is overwriting its existing cache, and never using it.
> >
> Due to use of "drop_caches" this is almost certainly the known issue #1
> I mentioned in the opening post of this series:
> https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/
>
> The above issue will be fixed with the following patch which has not
> been merged yet:
> https://www.mail-archive.com/linux-cachefs@redhat.com/msg03043.html
>
> Do you have time to do another test to verify that is the case?
> If so, I can re-post that patch on top of the first 5 patches in this series,
> as well as a second patch that allows NFS to use it.
>
>
> > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > it is not possible that the file is being served from the page cache.
> >
> > We saw this behaviour before on an older set of the patches when our
> > mount between the Re-Export Server and the Source NFS Filer was using
> > the "sync" option, but we are now using the "async" option and the
> > same is happening.
> >
> > Mount options:
> >
> > Source NFS Server <-- Re-Export Server (with FS-Cache):
> >
> > 10.0.0.49:/files /srv/nfs/files nfs
> > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> >
> > Re-Export Server (with FS-Cache) <-- NFS Client:
> >
> > 10.0.0.3:/files /mnt/nfs nfs
> > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> >
> > It is also worth noting this behaviour is not unique to the re-export
> > use case. I see FS-Cache not being used with the following setup:
> >
> > Source NFS Server <-- Client (with FS-Cache).
> >
> > Thanks,
> > Ben
> >
> >
> > Kind Regards
> >
> > Benjamin Maynard
> >
> > Customer Engineer
> >
> > benmaynard@google.com
> >
> > Google, Inc.
> >
> >
> >
> >
> > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > >
> > >
> > >
> > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > >
> > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > >>
> > > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > >>>
> > > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > >>>> wrote:
> > > >>>>>
> > > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > > >>>>>> APIs,
> > > >>>>>> but only when fscache is configured and enabled.
> > > >>>>>>
> > > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > > >>>>>> filled
> > > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > > >>>>>> of
> > > >>>>>> the functions, the main one being the issue_read() function.
> > > >>>>>> The issue_read() function is called by the netfs layer when a
> > > >>>>>> read
> > > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > > >>>>>> (either
> > > >>>>>> the cache is not active, or it is active but the data is not
> > > >>>>>> available).
> > > >>>>>> Once the read from the server is complete, netfs requires a call
> > > >>>>>> to
> > > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > > >>>>>> were
> > > >>>>>> read
> > > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > > >>>>>> a
> > > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > > >>>>>> and
> > > >>>>>> contains a start and a length (both in bytes), and assumes the
> > > >>>>>> underlying
> > > > >>>>>> netfs will return either an error on the whole region, or the
> > > >>>>>> number
> > > >>>>>> of bytes successfully read.
> > > >>>>>>
> > > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > > >>>>>> defined
> > > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > >>>>>> to
> > > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > > >>>>>> up
> > > >>>>>> into underlying RPCs, each of which will have their own
> > > >>>>>> completion
> > > >>>>>> and
> > > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > > >>>>>> initiated with issue_read() and terminated with
> > > >>>>>> netfs_subreq_terminated().
> > > > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > >>>>>> the netfs API requirement on the single response to the whole
> > > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > > >>>>>> pgio layer.
> > > >>>>>>
> > > >>>>>> The approach taken with this patch is to allocate a small
> > > >>>>>> structure
> > > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > > >>>>>> number
> > > >>>>>> of bytes successfully transferred in the structure, and update
> > > >>>>>> these
> > > >>>>>> values
> > > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > > >>>>>> a
> > > >>>>>> marker
> > > >>>>>> for the last RPC completion, is incremented in
> > > >>>>>> nfs_netfs_read_initiate(),
> > > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > > >>>>>> nfs_pgio_header
> > > >>>>>> contains a valid pointer to the data.  On the final put (which
> > > >>>>>> signals
> > > >>>>>> the final outstanding RPC is complete) in
> > > >>>>>> nfs_netfs_read_completion(),
> > > >>>>>> call netfs_subreq_terminated() with either the final error value
> > > >>>>>> (if
> > > >>>>>> one or more READs complete with an error) or the number of bytes
> > > >>>>>> successfully transferred (if all RPCs complete successfully).
> > > >>>>>> Note
> > > >>>>>> that when all RPCs complete successfully, the number of bytes
> > > >>>>>> transferred
> > > >>>>>> is capped to the length of the subrequest.  Capping the
> > > >>>>>> transferred
> > > >>>>>> length
> > > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > > >>>>>> netfs.
> > > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > >>>>>> the
> > > >>>>>> corner case where NFS requests a full page at the end of the
> > > >>>>>> file,
> > > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > > >>>>>>
> > > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > >>>>>
> > > >>>>>
> > > >>>>> This is not doing what I asked for, which was to separate out the
> > > >>>>> fscache functionality, so that we can call that if and when it is
> > > >>>>> available.
> > > >>>>>
> > > >>>> I must have misunderstood then.
> > > >>>>
> > > >>>> The last feedback I have from you was that you wanted it to be
> > > >>>> an opt-in feature, and it was a comment on a previous patch
> > > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > > >>>> let me try to get back on track.
> > > >>>>
> > > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > > >>>>> requests. As
> > > >>>>> it stands, that means it is just duplicating information, and
> > > >>>>> adding
> > > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > > >>>>> extra
> > > >>>>> indirect calls, and extra bloat to the inode).
> > > >>>>>
> > > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > > >>>> ask some clarifying questions.
> > > >>>>
> > > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > > >>>> Am I right?
> > > >>>>
> > > > >>>> Also, are you objecting to the design that to use fscache we now
> > > >>>> have to use netfs, specifically:
> > > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > > >>>> from the cache, then NFS is called back via netfs_issue_read
> > > >>>> and we use the normal NFS read pageio interface.  This requires
> > > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > > >>>> which is the reason for the small changes to pagelist.c
> > > >>>
> > > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > > >>> NFS I/O paths.
> > > >>>
> > > >> Got it.
> > > >>
> > > >>> I'm willing to consider solutions that are specific only to the fscache
> > > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > > >>> extra memory allocations, extra indirect calls and larger inode
> > > >>> footprints.
> > > >>>
> > > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > > >>> the case of 'NFS with cachefs additions'.
> > > >>>
> > > >> I agree completely.  Are you seeing extra memory allocations
> > > > >> happen on mounts without 'fsc' or is it more a concern of how
> > > >> some of the patches look?  We should not be calling any netfs or
> > > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > > >> testing. So either there's a misunderstanding here, or there's a
> > > >> bug I'm missing.
> > > >>
> > > > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > > > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > >> If it's configured but not enabled, then the checks for
> > > >> netfs_inode(inode)->cache should skip over any netfs code.
> > > >> But maybe there's a non-obvious bug you're seeing and
> > > >> somehow netfs is still getting called?  Because I cannot
> > > >> see netfs getting called if 'fsc' is not on the mount in my
> > > >> tests.
> > > >>
> > > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > >> {
> > > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > > >>               return -ENOBUFS;
> > > >>
> > > >>       return netfs_read_folio(file, folio);
> > > >> }
> > > >>
> > > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > > >> {
> > > >>       struct inode *inode = ractl->mapping->host;
> > > >>
> > > >>       if (!netfs_inode(inode)->cache)
> > > >>               return -ENOBUFS;
> > > >>
> > > >>       netfs_readahead(ractl);
> > > >>       return 0;
> > > >> }
> > > >>
> > > >>
> > > >>>>
> > > >>>> Can you be more specific as to the portions of the patch you don't
> > > >>>> like
> > > >>>> so I can move it in the right direction?
> > > >>>>
> > > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > > >>>> you're
> > > >>>> ok with it though, since you mention "extra bloat to the inode".
> > > >>>> Do you object to this even though it's wrapped in an
> > > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > >>>> extra size be added to nfs_inode?
> > > >>>>
> > > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > >>>>       __u64 write_io;
> > > >>>>       __u64 read_io;
> > > >>>> #ifdef CONFIG_NFS_FSCACHE
> > > >>>> -       struct fscache_cookie   *fscache;
> > > >>>> -#endif
> > > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > >>>> */
> > > >>>> +#else
> > > >>>>       struct inode            vfs_inode;
> > > >>>> +#endif
> > > >>>> +
> > > >>>
> > > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > >>> point, however for now NFS is not unconditionally opting into the netfs
> > > >>> project. If we're to ever do that, then I want to see streamlined code
> > > >>> for the standard I/O case.
> > > >>>
> > > >> Ok and understood about standard I/O case.
> > > >>
> > > >> I was thinking how we might not increase the size, but I don't think
> > > >> I can make it work.
> > > >>
> > > >> I thought we could change to something like the below, without an
> > > >> embedded struct inode:
> > > >>
> > > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > >>       __u64 write_io;
> > > >>       __u64 read_io;
> > > >> #ifdef CONFIG_NFS_FSCACHE
> > > >> -       struct fscache_cookie   *fscache;
> > > >> -#endif
> > > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > >> +#else
> > > >>       struct inode            vfs_inode;
> > > >> +#endif
> > > >> +
> > > >>
> > > >> Then I would need to alloc/free a netfs_inode at the time of
> > > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > >> macro cannot work, because it requires an embedded "struct inode"
> > > >> due to "container_of" use:
> > > >>
> > > >> +#ifdef CONFIG_NFS_FSCACHE
> > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > >> +{
> > > >> +       return &nfsi->netfs.inode;
> > > >> +}
> > > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > >> +{
> > > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > >> +}
> > > >> +#else
> > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > >> +{
> > > >> +       return &nfsi->vfs_inode;
> > > >> +}
> > > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > >> {
> > > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > > >> }
> > > >> +#endif
> > > >>
> > > >>
> > > >
> > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > patch below.  What do you think?
> > >
> > > That works for me.
> > >
> > > >
> > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > think it's an ok idea I can try to work out what is needed across
> > > > the tree.  I thought about it more and I kinda agree that in the
> > > > case for NFS where fscache is "configured but not enabled",
> > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > each time, it will add up so it is worth at least a discussion.
> > > >
> > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > index f2402ddeafbf..195714f1c355 100644
> > > > --- a/include/linux/netfs.h
> > > > +++ b/include/linux/netfs.h
> > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > >                                     bool was_async);
> > > >
> > > > -/*
> > > > - * Per-inode context.  This wraps the VFS inode.
> > > > - */
> > > > -struct netfs_inode {
> > > > -       struct inode            inode;          /* The VFS inode */
> > > > +struct netfs_info {
> > > >       const struct netfs_request_ops *ops;
> > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > >       struct fscache_cookie   *cache;
> > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > };
> > > >
> > > > +/*
> > > > + * Per-inode context.  This wraps the VFS inode.
> > > > + */
> > > > +struct netfs_inode {
> > > > +       struct inode            inode;          /* The VFS inode */
> > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > +};
> > > > +
> > > > /*
> > > > * Resources required to do operations on a cache.
> > > > */
> > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > *netfs_inode(struct inode *inode)
> > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > >                                   const struct netfs_request_ops *ops)
> > > > {
> > > > -       ctx->ops = ops;
> > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > +       /* FIXME: Check for NULL */
> > > > +       ctx->netfs->ops = ops;
> > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > -       ctx->cache = NULL;
> > > > +       ctx->netfs->cache = NULL;
> > > > #endif
> > > > }
> > > >
> > > >
> > > >
> > > >>
> > > >>>>
> > > >>>>
> > > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > > >>>>
> > > >>>> #else /* CONFIG_NFS_FSCACHE */
> > > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > >>>> *hdr) {}
> > > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > >>>> *hdr) {}
> > > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > >>>> +{
> > > >>>> +       unlock_page(req->wb_page);
> > > >>>> +}
> > > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > > >>>> super_block *sb) {}
> > > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > >>>>
> > > >>>>
> > > >>>> Do you object to the below?  If so, then do you want
> > > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > > >>>>
> > > >>>> -- a/fs/nfs/inode.c
> > > >>>> +++ b/fs/nfs/inode.c
> > > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > >>>> super_block *sb)
> > > >>>> #ifdef CONFIG_NFS_V4_2
> > > >>>>       nfsi->xattr_cache = NULL;
> > > >>>> #endif
> > > >>>> +       nfs_netfs_inode_init(nfsi);
> > > >>>> +
> > > >>>>       return VFS_I(nfsi);
> > > >>>> }
> > > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_i
> > > >>>> node);
> > > >>>>
> > > >>>>
> > > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > >>>> how about the below calls to netfs from nfs_read_folio and
> > > >>>> nfs_readahead into equivalent netfs calls?  So when
> > > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > > >>>> check to see if fscache is enabled on the mount, and skip
> > > >>>> over to satisfy what you want.  Am I understanding what you
> > > >>>> want?
> > > >>>
> > > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > > >>> is needed from the netfs code so that it can be optimised. However I'm
> > > >>> not interested enough in the cachefs functionality to work on that
> > > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > > >>> netfs project, once the overhead can be made to disappear.
> > > >>>
> > > >> Understood.
> > > >>
> > > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > > >> be done in a future patchset?
> > > >>
> > > >> For now I was equating netfs and fscache together so we can
> > > >> move on from the much older and single-page limiting fscache
> > > >> interface that is likely to go away soon.
> > > >>
> > > >>>>
> > > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > >>>> folio *folio)
> > > >>>>       if (NFS_STALE(inode))
> > > >>>>               goto out_unlock;
> > > >>>>
> > > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > > >>>> +       if (!ret)
> > > >>>> +               goto out;
> > > >>>> +
> > > >>>>
> > > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > >>>> *ractl)
> > > >>>>       if (NFS_STALE(inode))
> > > >>>>               goto out;
> > > >>>>
> > > >>>> +       ret = nfs_netfs_readahead(ractl);
> > > >>>> +       if (!ret)
> > > >>>> +               goto out;
> > > >>>> +
> > > >>>>
> > > >> The above wrappers should prevent any additional overhead when fscache
> > > >> is not enabled.  As far as I know these work to avoid calling netfs
> > > >> when 'fsc' is not on the mount.
> > > >>
> > > >>>>
> > > >>>> And how about these calls from different points in the read
> > > >>>> path to the earlier mentioned stub functions?
> > > >>>>
> > > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > >>>>
> > > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > >>>> {
> > > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > >>>>       struct page *page = req->wb_page;
> > > >>>>
> > > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > >>>>> s_id,
> > > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > >>>> -               (long long)req_offset(req));
> > > >>>> -
> > > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > > >>>> ETIMEDOUT)
> > > >>>>               SetPageError(page);
> > > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > >>>> -               if (PageUptodate(page))
> > > >>>> -                       nfs_fscache_write_page(inode, page);
> > > >>>> -               unlock_page(page);
> > > >>>> -       }
> > > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > >>>> +               nfs_netfs_readpage_release(req);
> > > >>>> +
> > > >>>
> > > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > >>> going to need to change when we move it to use folios natively anyway.
> > > >>>
> > > >> Ok, how about I make it conditional on whether fscache is configured
> > > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > > >> nfs_netfs_readahead()?  Below is what that would look like.
> > > >> I could inline the code in nfs_netfs_readpage_release() if you
> > > >> think it would be clearer.
> > > >>
> > > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > >> {
> > > >>       struct page *page = req->wb_page;
> > > >>
> > > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > >>               SetPageError(page);
> > > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > >> #ifndef CONFIG_NFS_FSCACHE
> > > >>               unlock_page(req->wb_page);
> > > >> #else
> > > >>               nfs_netfs_readpage_release(req);
> > > >> #endif
> > > >>       nfs_release_request(req);
> > > >> }
> > > >>
> > > >>
> > > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > > >> {
> > > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > >>
> > > >>   /*
> > > >>    * If fscache is enabled, netfs will unlock pages.
> > > >>    */
> > > >>   if (netfs_inode(inode)->cache)
> > > >>       return;
> > > >>
> > > >>   unlock_page(req->wb_page);
> > > >> }
> > > >>
> > > >>
> > > >>>>       nfs_release_request(req);
> > > >>>> }
> > > >>>>
> > > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > >>>> nfs_pgio_header *hdr)
> > > >>>>               nfs_list_remove_request(req);
> > > >>>>               nfs_readpage_release(req, error);
> > > >>>>       }
> > > >>>> +       nfs_netfs_read_completion(hdr);
> > > >>>> +
> > > >>>> out:
> > > >>>>       hdr->release(hdr);
> > > >>>> }
> > > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > >>>> nfs_pgio_header *hdr,
> > > >>>>                             struct rpc_task_setup *task_setup_data,
> > > >>>> int how)
> > > >>>> {
> > > >>>>       rpc_ops->read_setup(hdr, msg);
> > > >>>> +       nfs_netfs_initiate_read(hdr);
> > > >>>>       trace_nfs_initiate_read(hdr);
> > > >>>> }
> > > >>>>
> > > >>>>
> > > >>>> Are you ok with these additions?  Something like this would
> > > >>>> be required in the case of fscache configured and enabled,
> > > >>>> because we could have some of the data in a read in
> > > >>>> fscache, and some not.  That is the reason for the netfs
> > > >>>> design, and why we need to be able to call the normal
> > > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > > >>>> back via netfs_subreq_terminated)?
> > > >>>>
> > > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > >>>>       struct pnfs_layout_segment *pg_lseg;
> > > >>>>       struct nfs_io_completion *pg_io_completion;
> > > >>>>       struct nfs_direct_req   *pg_dreq;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       void                    *pg_netfs;
> > > >>>> +#endif
> > > >>>>
> > > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > >>>>       const struct nfs_rw_ops *rw_ops;
> > > >>>>       struct nfs_io_completion *io_completion;
> > > >>>>       struct nfs_direct_req   *dreq;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       void                    *netfs;
> > > >>>> +#endif
> > > >>>>
> > > >>>>
> > > >>>> And these additions to pagelist.c?
> > > >>>>
> > > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > >>>> nfs_pageio_descriptor *desc,
> > > >>>>       hdr->good_bytes = mirror->pg_count;
> > > >>>>       hdr->io_completion = desc->pg_io_completion;
> > > >>>>       hdr->dreq = desc->pg_dreq;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       if (desc->pg_netfs)
> > > >>>> +               hdr->netfs = desc->pg_netfs;
> > > >>>> +#endif
> > > >>>
> > > >>> Why the conditional?
> > > >>>
> > > >> Not really needed and I was thinking of removing it, so I'll do that.
> > > >>
> > > >>>>
> > > >>>>
> > > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > >>>> *desc,
> > > >>>>       desc->pg_lseg = NULL;
> > > >>>>       desc->pg_io_completion = NULL;
> > > >>>>       desc->pg_dreq = NULL;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       desc->pg_netfs = NULL;
> > > >>>> +#endif
> > > >>>>
> > > >>>>
> > > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > >>>> nfs_pageio_descriptor *desc,
> > > >>>>
> > > >>>>       desc->pg_io_completion = hdr->io_completion;
> > > >>>>       desc->pg_dreq = hdr->dreq;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       desc->pg_netfs = hdr->netfs;
> > > >>>> +#endif
> > > >>>
> > > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > > >>>
> > > >> Ok.
> > > >>
> > > >>
> > > >>
> > > >>>>
> > > >>>>
> > > >>>>> My expectation is that the standard I/O path should have minimal
> > > >>>>> overhead, and should certainly not increase the overhead that we
> > > >>>>> already have. Will this be addressed in future iterations of these
> > > >>>>> patches?
> > > >>>>>
> > > >>>>
> > > >>>> I will do what I can to satisfy what you want, either by fixing up
> > > >>>> this patch or follow-on patches.  Hopefully the above questions
> > > >>>> will clarify the next steps.
> > > >>>>
> > > >>>
> > > >>> --
> > > >>> Trond Myklebust
> > > >>> Linux NFS client maintainer, Hammerspace
> > > >>> trond.myklebust@hammerspace.com
> > >
> > >
> > >
> > > Trond Myklebust
> > > CTO, Hammerspace Inc
> > > 1900 S Norfolk St, Suite 350 - #45
> > > San Mateo, CA 94403
> > >
> > > www.hammer.space
> > >
> > >
> >
>
Jeff Layton Nov. 14, 2022, 1:07 p.m. UTC | #10
On Mon, 2022-11-14 at 12:42 +0000, Benjamin Maynard wrote:
> Thanks Dave for getting back to me so quickly.
> 
> > Due to use of "drop_caches" this is almost certainly the known issue #1
> > I mentioned in the opening post of this series:
> > https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/
> 
> Apologies, I completely missed the known issues in the original
> opening message of the series. Just to clarify, I was only ever
> dropping the caches on the "NFS Client" in the below relationship:
> 
> Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> 

What sort of server is the Source NFS server here? If it's also Linux,
then what sort of filesystem is being exported?

> I never dropped the caches on the Re-Export Server (the server running
> FS-Cache) at any point.
> 
> However my rsize was lower than my readahead value. I've since corrected that:
> 
> benmaynard@demo-cluster-1-26hm:~$ cat /proc/mounts | grep nfs
> 10.0.0.49:/files /srv/nfs/files nfs
> rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> 0 0
> 
> benmaynard@demo-cluster-1-26hm:~$ findmnt -rnu -t nfs,nfs4 -o MAJ:MIN,TARGET
> 0:52 /srv/nfs/files
> benmaynard@demo-cluster-1-26hm:~$ cat /sys/class/bdi/0\:52/read_ahead_kb
> 512
> 
> With this configuration I see the same issue, FS-Cache never reads
> from /var/cache/fscache, and copying the same file always leads to
> heavy writes to /var/cache/fscache (the cache is overwriting itself).
> 
> I have also tried this copy without clearing the caches on any server
> in the chain, and the same happens.
> 
> Would you expect this behaviour even though rsize > read ahead? Would
> you expect the referenced patch to fix this?
> 
> I tried to apply the patch you suggested
> (https://www.mail-archive.com/linux-cachefs@redhat.com/msg03043.html)
> but it did not apply cleanly, and I ran out of time to troubleshoot. I
> should get some more time on Wednesday and I can re-try.
> 
> 
> Kind Regards
> Benjamin Maynard
> 
> 
> Kind Regards
> 
> Benjamin Maynard
> 
> Customer Engineer
> 
> benmaynard@google.com
> 
> Google, Inc.
> 
> 
> 
> 
> On Mon, 14 Nov 2022 at 10:41, David Wysochanski <dwysocha@redhat.com> wrote:
> > 
> > Hi Ben,
> > 
> > Thanks for testing these patches.  More below.
> > 
> > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > 
> > > Hi all,
> > > 
> > > I've been doing some more testing with these patches, I applied all of
> > > the patches (v10 from
> > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > > 
> > > I have the following setup:
> > > 
> > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > 
> > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > to the NFS Client via the Re-Export Server.
> > > 
> > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > re-export server, and once the file copy completes I see that
> > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > > 
> > > I then deleted that file from the NFS Client, and dropped the caches
> > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > > 
> > > I then performed another copy of the 500Gb file on the NFS Client,
> > > again via the Re-Export Server. What I expected would happen is that I
> > > would see heavy reads from the /var/cache/fscache volume as the file
> > > should be served from FS-Cache.
> > > 
> > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > also see heavy writes to /var/cache/fscache, so it appears that
> > > FS-Cache is overwriting its existing cache, and never using it.
> > > 
> > Due to use of "drop_caches" this is almost certainly the known issue #1
> > I mentioned in the opening post of this series:
> > https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/
> > 
> > The above issue will be fixed with the following patch which has not
> > been merged yet:
> > https://www.mail-archive.com/linux-cachefs@redhat.com/msg03043.html
> > 
> > Do you have time to do another test to verify that is the case?
> > If so, I can re-post that patch on top of the first 5 patches in this series,
> > as well as a second patch that allows NFS to use it.
> > 
> > 
> > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > it is not possible that the file is being served from the page cache.
> > > 
> > > We saw this behaviour before on an older set of the patches when our
> > > mount between the Re-Export Server and the Source NFS Filer was using
> > > the "sync" option, but we are now using the "async" option and the
> > > same is happening.
> > > 
> > > Mount options:
> > > 
> > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > > 
> > > 10.0.0.49:/files /srv/nfs/files nfs
> > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > 
> > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > > 
> > > 10.0.0.3:/files /mnt/nfs nfs
> > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > > 
> > > It is also worth noting this behaviour is not unique to the re-export
> > > use case. I see FS-Cache not being used with the following setup:
> > > 
> > > Source NFS Server <-- Client (with FS-Cache).
> > > 
> > > Thanks,
> > > Ben
> > > 
> > > 
> > > Kind Regards
> > > 
> > > Benjamin Maynard
> > > 
> > > Customer Engineer
> > > 
> > > benmaynard@google.com
> > > 
> > > Google, Inc.
> > > 
> > > 
> > > 
> > > 
> > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > 
> > > > 
> > > > 
> > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > 
> > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > 
> > > > > > On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > > > > 
> > > > > > > On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > > > > > On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > > > > > wrote:
> > > > > > > > > 
> > > > > > > > > On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > > > > > > Convert the NFS buffered read code paths to corresponding netfs
> > > > > > > > > > APIs,
> > > > > > > > > > but only when fscache is configured and enabled.
> > > > > > > > > > 
> > > > > > > > > > The netfs API defines struct netfs_request_ops which must be
> > > > > > > > > > filled
> > > > > > > > > > in by the network filesystem.  For NFS, we only need to define 5
> > > > > > > > > > of
> > > > > > > > > > the functions, the main one being the issue_read() function.
> > > > > > > > > > The issue_read() function is called by the netfs layer when a
> > > > > > > > > > read
> > > > > > > > > > cannot be fulfilled locally, and must be sent to the server
> > > > > > > > > > (either
> > > > > > > > > > the cache is not active, or it is active but the data is not
> > > > > > > > > > available).
> > > > > > > > > > Once the read from the server is complete, netfs requires a call
> > > > > > > > > > to
> > > > > > > > > > netfs_subreq_terminated() which conveys either how many bytes
> > > > > > > > > > were
> > > > > > > > > > read
> > > > > > > > > > successfully, or an error.  Note that issue_read() is called with
> > > > > > > > > > a
> > > > > > > > > > structure, netfs_io_subrequest, which defines the IO requested,
> > > > > > > > > > and
> > > > > > > > > > contains a start and a length (both in bytes), and assumes the
> > > > > > > > > > underlying
> > > > > > > > > > > netfs will return either an error on the whole region, or the
> > > > > > > > > > number
> > > > > > > > > > of bytes successfully read.
> > > > > > > > > > 
> > > > > > > > > > The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > > > > > > defined
> > > > > > > > > > in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > > > > > > to
> > > > > > > > > > know how many RPCs will be sent and how the pages will be broken
> > > > > > > > > > up
> > > > > > > > > > into underlying RPCs, each of which will have their own
> > > > > > > > > > completion
> > > > > > > > > > and
> > > > > > > > > > return code.  In contrast, netfs is subrequest based, a single
> > > > > > > > > > subrequest may contain multiple pages, and a single subrequest is
> > > > > > > > > > initiated with issue_read() and terminated with
> > > > > > > > > > netfs_subreq_terminated().
> > > > > > > > > > > Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > > > > > > the netfs API requirement on the single response to the whole
> > > > > > > > > > subrequest, while also minimizing disruptive changes to the NFS
> > > > > > > > > > pgio layer.
> > > > > > > > > > 
> > > > > > > > > > The approach taken with this patch is to allocate a small
> > > > > > > > > > structure
> > > > > > > > > > for each nfs_netfs_issue_read() call, store the final error and
> > > > > > > > > > number
> > > > > > > > > > of bytes successfully transferred in the structure, and update
> > > > > > > > > > these
> > > > > > > > > > values
> > > > > > > > > > as each RPC completes.  The refcount on the structure is used as
> > > > > > > > > > a
> > > > > > > > > > marker
> > > > > > > > > > for the last RPC completion, is incremented in
> > > > > > > > > > nfs_netfs_read_initiate(),
> > > > > > > > > > and decremented inside nfs_netfs_read_completion(), when a
> > > > > > > > > > nfs_pgio_header
> > > > > > > > > > contains a valid pointer to the data.  On the final put (which
> > > > > > > > > > signals
> > > > > > > > > > the final outstanding RPC is complete) in
> > > > > > > > > > nfs_netfs_read_completion(),
> > > > > > > > > > call netfs_subreq_terminated() with either the final error value
> > > > > > > > > > (if
> > > > > > > > > > one or more READs complete with an error) or the number of bytes
> > > > > > > > > > successfully transferred (if all RPCs complete successfully).
> > > > > > > > > > Note
> > > > > > > > > > that when all RPCs complete successfully, the number of bytes
> > > > > > > > > > transferred
> > > > > > > > > > is capped to the length of the subrequest.  Capping the
> > > > > > > > > > transferred
> > > > > > > > > > length
> > > > > > > > > > to the subrequest length prevents "Subreq overread" warnings from
> > > > > > > > > > netfs.
> > > > > > > > > > This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > > > > > > the
> > > > > > > > > > corner case where NFS requests a full page at the end of the
> > > > > > > > > > file,
> > > > > > > > > > even when i_size reflects only a partial page (NFS overread).
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > > > > > > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > This is not doing what I asked for, which was to separate out the
> > > > > > > > > fscache functionality, so that we can call that if and when it is
> > > > > > > > > available.
> > > > > > > > > 
> > > > > > > > I must have misunderstood then.
> > > > > > > > 
> > > > > > > > The last feedback I have from you was that you wanted it to be
> > > > > > > > an opt-in feature, and it was a comment on a previous patch
> > > > > > > > to Kconfig.  I was proceeding the best I knew how, but
> > > > > > > > let me try to get back on track.
> > > > > > > > 
> > > > > > > > > Instead, it is just wrapping the NFS requests inside netfs
> > > > > > > > > requests. As
> > > > > > > > > it stands, that means it is just duplicating information, and
> > > > > > > > > adding
> > > > > > > > > unnecessary overhead to the standard I/O path (extra allocations,
> > > > > > > > > extra
> > > > > > > > > indirect calls, and extra bloat to the inode).
> > > > > > > > > 
> > > > > > > > I think I understand what you're saying but I'm not sure.  Let me
> > > > > > > > ask some clarifying questions.
> > > > > > > > 
> > > > > > > > Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > > > > > configured?  Or when it is not?  Or both?  I think you're objecting
> > > > > > > > when it's configured, but not enabled (we mount without 'fsc').
> > > > > > > > Am I right?
> > > > > > > > 
> > > > > > > > > Also, are you objecting to the design that to use fscache we now
> > > > > > > > have to use netfs, specifically:
> > > > > > > > - call into netfs via either netfs_read_folio or netfs_readahead
> > > > > > > > - if fscache is enabled, then the IO can be satisfied from fscache
> > > > > > > > - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > > > > > from the cache, then NFS is called back via netfs_issue_read
> > > > > > > > and we use the normal NFS read pageio interface.  This requires
> > > > > > > > we call netfs_subreq_terminated() when all the RPCs complete,
> > > > > > > > which is the reason for the small changes to pagelist.c
> > > > > > > 
> > > > > > > I'm objecting to any middle layer "solution" that adds overhead to the
> > > > > > > NFS I/O paths.
> > > > > > > 
> > > > > > Got it.
> > > > > > 
> > > > > > > I'm willing to consider solutions that are specific only to the fscache
> > > > > > > use case (i.e. when the 'fsc' mount option is specified). However when
> > > > > > > I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > > > > extra memory allocations, extra indirect calls and larger inode
> > > > > > > footprints.
> > > > > > > 
> > > > > > > IOW: I want the code to optimise for the case of standard NFS, not for
> > > > > > > the case of 'NFS with cachefs additions'.
> > > > > > > 
> > > > > > I agree completely.  Are you seeing extra memory allocations
> > > > > > > happen on mounts without 'fsc' or is it more a concern of how
> > > > > > some of the patches look?  We should not be calling any netfs or
> > > > > > fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > > > testing. So either there's a misunderstanding here, or there's a
> > > > > > bug I'm missing.
> > > > > > 
> > > > > > If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > > > nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > > > If it's configured but not enabled, then the checks for
> > > > > > netfs_inode(inode)->cache should skip over any netfs code.
> > > > > > But maybe there's a non-obvious bug you're seeing and
> > > > > > somehow netfs is still getting called?  Because I cannot
> > > > > > see netfs getting called if 'fsc' is not on the mount in my
> > > > > > tests.
> > > > > > 
> > > > > > int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > > > {
> > > > > >       if (!netfs_inode(folio_inode(folio))->cache)
> > > > > >               return -ENOBUFS;
> > > > > > 
> > > > > >       return netfs_read_folio(file, folio);
> > > > > > }
> > > > > > 
> > > > > > int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > > > {
> > > > > >       struct inode *inode = ractl->mapping->host;
> > > > > > 
> > > > > >       if (!netfs_inode(inode)->cache)
> > > > > >               return -ENOBUFS;
> > > > > > 
> > > > > >       netfs_readahead(ractl);
> > > > > >       return 0;
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > > > 
> > > > > > > > Can you be more specific as to the portions of the patch you don't
> > > > > > > > like
> > > > > > > > so I can move it in the right direction?
> > > > > > > > 
> > > > > > > > This is from patch #2 which you didn't comment on.  I'm not sure
> > > > > > > > you're
> > > > > > > > ok with it though, since you mention "extra bloat to the inode".
> > > > > > > > Do you object to this even though it's wrapped in an
> > > > > > > > #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > > > > > extra size be added to nfs_inode?
> > > > > > > > 
> > > > > > > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > >       __u64 write_io;
> > > > > > > >       __u64 read_io;
> > > > > > > > #ifdef CONFIG_NFS_FSCACHE
> > > > > > > > -       struct fscache_cookie   *fscache;
> > > > > > > > -#endif
> > > > > > > > +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > > > > > */
> > > > > > > > +#else
> > > > > > > >       struct inode            vfs_inode;
> > > > > > > > +#endif
> > > > > > > > +
> > > > > > > 
> > > > > > > Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > > > > point, however for now NFS is not unconditionally opting into the netfs
> > > > > > > project. If we're to ever do that, then I want to see streamlined code
> > > > > > > for the standard I/O case.
> > > > > > > 
> > > > > > Ok and understood about standard I/O case.
> > > > > > 
> > > > > > I was thinking how we might not increase the size, but I don't think
> > > > > > I can make it work.
> > > > > > 
> > > > > > I thought we could change to something like the below, without an
> > > > > > embedded struct inode:
> > > > > > 
> > > > > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > >       __u64 write_io;
> > > > > >       __u64 read_io;
> > > > > > #ifdef CONFIG_NFS_FSCACHE
> > > > > > -       struct fscache_cookie   *fscache;
> > > > > > -#endif
> > > > > > +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > > > +#else
> > > > > >       struct inode            vfs_inode;
> > > > > > +#endif
> > > > > > +
> > > > > > 
> > > > > > Then I would need to alloc/free a netfs_inode at the time of
> > > > > > nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > > > macro cannot work, because it requires an embedded "struct inode"
> > > > > > due to "container_of" use:
> > > > > > 
> > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > +{
> > > > > > +       return &nfsi->netfs.inode;
> > > > > > +}
> > > > > > +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > +{
> > > > > > +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > > > +}
> > > > > > +#else
> > > > > > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > +{
> > > > > > +       return &nfsi->vfs_inode;
> > > > > > +}
> > > > > > static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > {
> > > > > >       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > > > }
> > > > > > +#endif
> > > > > > 
> > > > > > 
> > > > > 
> > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > patch below.  What do you think?
> > > > 
> > > > That works for me.
> > > > 
> > > > > 
> > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > think it's an ok idea I can try to work out what is needed across
> > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > case for NFS where fscache is "configured but not enabled",
> > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > each time, it will add up so it is worth at least a discussion.
> > > > > 
> > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > --- a/include/linux/netfs.h
> > > > > +++ b/include/linux/netfs.h
> > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > >                                     bool was_async);
> > > > > 
> > > > > -/*
> > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > - */
> > > > > -struct netfs_inode {
> > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > +struct netfs_info {
> > > > >       const struct netfs_request_ops *ops;
> > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > >       struct fscache_cookie   *cache;
> > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > };
> > > > > 
> > > > > +/*
> > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > + */
> > > > > +struct netfs_inode {
> > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > +};
> > > > > +
> > > > > /*
> > > > > * Resources required to do operations on a cache.
> > > > > */
> > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > *netfs_inode(struct inode *inode)
> > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > >                                   const struct netfs_request_ops *ops)
> > > > > {
> > > > > -       ctx->ops = ops;
> > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > +       /* FIXME: Check for NULL */
> > > > > +       ctx->netfs->ops = ops;
> > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > -       ctx->cache = NULL;
> > > > > +       ctx->netfs->cache = NULL;
> > > > > #endif
> > > > > }
> > > > > 
> > > > > 
> > > > > 
> > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Are you ok with the stub functions which are placed in fscache.h, and
> > > > > > > > when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > > > > > or a 1-liner (nfs_netfs_readpage_release)?
> > > > > > > > 
> > > > > > > > #else /* CONFIG_NFS_FSCACHE */
> > > > > > > > +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > > > > > +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > > > > > *hdr) {}
> > > > > > > > +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > > > > > *hdr) {}
> > > > > > > > +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > > +{
> > > > > > > > +       unlock_page(req->wb_page);
> > > > > > > > +}
> > > > > > > > static inline void nfs_fscache_release_super_cookie(struct
> > > > > > > > super_block *sb) {}
> > > > > > > > static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Do you object to the below?  If so, then do you want
> > > > > > > > #ifdef CONFIG_NFS_FSCACHE here?
> > > > > > > > 
> > > > > > > > -- a/fs/nfs/inode.c
> > > > > > > > +++ b/fs/nfs/inode.c
> > > > > > > > @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > > > > > super_block *sb)
> > > > > > > > #ifdef CONFIG_NFS_V4_2
> > > > > > > >       nfsi->xattr_cache = NULL;
> > > > > > > > #endif
> > > > > > > > +       nfs_netfs_inode_init(nfsi);
> > > > > > > > +
> > > > > > > >       return VFS_I(nfsi);
> > > > > > > > }
> > > > > > > > > EXPORT_SYMBOL_GPL(nfs_alloc_inode);
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > > > > > how about the below calls to netfs from nfs_read_folio and
> > > > > > > > nfs_readahead into equivalent netfs calls?  So when
> > > > > > > > > CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > > > > > ('fsc' not on mount), these netfs functions do immediately call
> > > > > > > > netfs_alloc_request().  But I wonder if we could simply add a
> > > > > > > > check to see if fscache is enabled on the mount, and skip
> > > > > > > > over to satisfy what you want.  Am I understanding what you
> > > > > > > > want?
> > > > > > > 
> > > > > > > Quite frankly, I'd prefer that we just split out the functionality that
> > > > > > > is needed from the netfs code so that it can be optimised. However I'm
> > > > > > > not interested enough in the cachefs functionality to work on that
> > > > > > > myself. ...and as I indicated above, I might be OK with opting into the
> > > > > > > netfs project, once the overhead can be made to disappear.
> > > > > > > 
> > > > > > Understood.
> > > > > > 
> > > > > > If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > > > functions into a netfs.c file as a starting point.  Or that can maybe
> > > > > > be done in a future patchset?
> > > > > > 
> > > > > > For now I was equating netfs and fscache together so we can
> > > > > > move on from the much older and single-page limiting fscache
> > > > > > interface that is likely to go away soon.
> > > > > > 
> > > > > > > > 
> > > > > > > > @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > > > > > folio *folio)
> > > > > > > >       if (NFS_STALE(inode))
> > > > > > > >               goto out_unlock;
> > > > > > > > 
> > > > > > > > +       ret = nfs_netfs_read_folio(file, folio);
> > > > > > > > +       if (!ret)
> > > > > > > > +               goto out;
> > > > > > > > +
> > > > > > > > 
> > > > > > > > @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > > > > > *ractl)
> > > > > > > >       if (NFS_STALE(inode))
> > > > > > > >               goto out;
> > > > > > > > 
> > > > > > > > +       ret = nfs_netfs_readahead(ractl);
> > > > > > > > +       if (!ret)
> > > > > > > > +               goto out;
> > > > > > > > +
> > > > > > > > 
> > > > > > The above wrappers should prevent any additional overhead when fscache
> > > > > > is not enabled.  As far as I know these work to avoid calling netfs
> > > > > > when 'fsc' is not on the mount.
> > > > > > 
> > > > > > > > 
> > > > > > > > And how about these calls from different points in the read
> > > > > > > > path to the earlier mentioned stub functions?
> > > > > > > > 
> > > > > > > > @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > > > > > 
> > > > > > > > static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > > {
> > > > > > > > -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > >       struct page *page = req->wb_page;
> > > > > > > > 
> > > > > > > > -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > > > > > > s_id,
> > > > > > > > -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > > > > > -               (long long)req_offset(req));
> > > > > > > > -
> > > > > > > >       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > > > > > ETIMEDOUT)
> > > > > > > >               SetPageError(page);
> > > > > > > > -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > > > > > -               if (PageUptodate(page))
> > > > > > > > -                       nfs_fscache_write_page(inode, page);
> > > > > > > > -               unlock_page(page);
> > > > > > > > -       }
> > > > > > > > +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > > +               nfs_netfs_readpage_release(req);
> > > > > > > > +
> > > > > > > 
> > > > > > > I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > > > > going to need to change when we move it to use folios natively anyway.
> > > > > > > 
> > > > > > Ok, how about I make it conditional on whether fscache is configured
> > > > > > and enabled then, similar to the nfs_netfs_read_folio() and
> > > > > > nfs_netfs_readahead()?  Below is what that would look like.
> > > > > > I could inline the code in nfs_netfs_readpage_release() if you
> > > > > > think it would be clearer.
> > > > > > 
> > > > > > static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > {
> > > > > >       struct page *page = req->wb_page;
> > > > > > 
> > > > > >       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > > >               SetPageError(page);
> > > > > >       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > #ifndef CONFIG_NFS_FSCACHE
> > > > > >               unlock_page(req->wb_page);
> > > > > > #else
> > > > > >               nfs_netfs_readpage_release(req);
> > > > > > #endif
> > > > > >       nfs_release_request(req);
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > {
> > > > > >   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > 
> > > > > >   /*
> > > > > >    * If fscache is enabled, netfs will unlock pages.
> > > > > >    */
> > > > > >   if (netfs_inode(inode)->cache)
> > > > > >       return;
> > > > > > 
> > > > > >   unlock_page(req->wb_page);
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > > >       nfs_release_request(req);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > > > > > nfs_pgio_header *hdr)
> > > > > > > >               nfs_list_remove_request(req);
> > > > > > > >               nfs_readpage_release(req, error);
> > > > > > > >       }
> > > > > > > > +       nfs_netfs_read_completion(hdr);
> > > > > > > > +
> > > > > > > > out:
> > > > > > > >       hdr->release(hdr);
> > > > > > > > }
> > > > > > > > @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > > > > > nfs_pgio_header *hdr,
> > > > > > > >                             struct rpc_task_setup *task_setup_data,
> > > > > > > > int how)
> > > > > > > > {
> > > > > > > >       rpc_ops->read_setup(hdr, msg);
> > > > > > > > +       nfs_netfs_initiate_read(hdr);
> > > > > > > >       trace_nfs_initiate_read(hdr);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Are you ok with these additions?  Something like this would
> > > > > > > > be required in the case of fscache configured and enabled,
> > > > > > > > because we could have some of the data in a read in
> > > > > > > > fscache, and some not.  That is the reason for the netfs
> > > > > > > > design, and why we need to be able to call the normal
> > > > > > > > NFS read IO path (netfs calls into issue_read, and we call
> > > > > > > > back via netfs_subreq_terminated)?
> > > > > > > > 
> > > > > > > > @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > > > > >       struct pnfs_layout_segment *pg_lseg;
> > > > > > > >       struct nfs_io_completion *pg_io_completion;
> > > > > > > >       struct nfs_direct_req   *pg_dreq;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       void                    *pg_netfs;
> > > > > > > > +#endif
> > > > > > > > 
> > > > > > > > @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > > > > >       const struct nfs_rw_ops *rw_ops;
> > > > > > > >       struct nfs_io_completion *io_completion;
> > > > > > > >       struct nfs_direct_req   *dreq;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       void                    *netfs;
> > > > > > > > +#endif
> > > > > > > > 
> > > > > > > > 
> > > > > > > > And these additions to pagelist.c?
> > > > > > > > 
> > > > > > > > @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > > > > > nfs_pageio_descriptor *desc,
> > > > > > > >       hdr->good_bytes = mirror->pg_count;
> > > > > > > >       hdr->io_completion = desc->pg_io_completion;
> > > > > > > >       hdr->dreq = desc->pg_dreq;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       if (desc->pg_netfs)
> > > > > > > > +               hdr->netfs = desc->pg_netfs;
> > > > > > > > +#endif
> > > > > > > 
> > > > > > > Why the conditional?
> > > > > > > 
> > > > > > Not really needed and I was thinking of removing it, so I'll do that.
> > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > > > > > *desc,
> > > > > > > >       desc->pg_lseg = NULL;
> > > > > > > >       desc->pg_io_completion = NULL;
> > > > > > > >       desc->pg_dreq = NULL;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       desc->pg_netfs = NULL;
> > > > > > > > +#endif
> > > > > > > > 
> > > > > > > > 
> > > > > > > > @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > > > > > nfs_pageio_descriptor *desc,
> > > > > > > > 
> > > > > > > >       desc->pg_io_completion = hdr->io_completion;
> > > > > > > >       desc->pg_dreq = hdr->dreq;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       desc->pg_netfs = hdr->netfs;
> > > > > > > > +#endif
> > > > > > > 
> > > > > > > Those all need wrapper functions instead of embedding #ifdefs.
> > > > > > > 
> > > > > > Ok.
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > My expectation is that the standard I/O path should have minimal
> > > > > > > > > overhead, and should certainly not increase the overhead that we
> > > > > > > > > already have. Will this be addressed in future iterations of these
> > > > > > > > > patches?
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > I will do what I can to satisfy what you want, either by fixing up
> > > > > > > > this patch or follow-on patches.  Hopefully the above questions
> > > > > > > > will clarify the next steps.
> > > > > > > > 
> > > > > > > 
> > > > > > > --
> > > > > > > Trond Myklebust
> > > > > > > Linux NFS client maintainer, Hammerspace
> > > > > > > trond.myklebust@hammerspace.com
> > > > 
> > > > 
> > > > 
> > > > Trond Myklebust
> > > > CTO, Hammerspace Inc
> > > > 1900 S Norfolk St, Suite 350 - #45
> > > > San Mateo, CA 94403
> > > > 
> > > > www.hammer.space
> > > > 
> > > > 
> > > 
> >
Benjamin Maynard Nov. 14, 2022, 1:14 p.m. UTC | #11
The source server is Linux, exporting an ext4 filesystem.

benmaynard@bjmtesting-source:~$ cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"

benmaynard@bjmtesting-source:~$ uname -r
5.15.0-1021-gcp

benmaynard@bjmtesting-source:~$ df -Th
Filesystem     Type      Size  Used Avail Use% Mounted on
/dev/root      ext4      194G  2.5G  192G   2% /
/dev/sdb1      ext4      2.0T  501G  1.4T  27% /files

benmaynard@bjmtesting-source:~$ cat /etc/exports
/files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)


Kind Regards
Benjamin Maynard


Kind Regards

Benjamin Maynard

Customer Engineer

benmaynard@google.com

Google, Inc.




On Mon, 14 Nov 2022 at 13:07, Jeff Layton <jlayton@poochiereds.net> wrote:
>
> On Mon, 2022-11-14 at 12:42 +0000, Benjamin Maynard wrote:
> > Thanks Dave for getting back to me so quickly.
> >
> > > Due to use of "drop_caches" this is almost certainly the known issue #1
> > > I mentioned in the opening post of this series:
> > > https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/
> >
> > Apologies, I completely missed the known issues in the original
> > opening message of the series. Just to clarify, I was only ever
> > dropping the caches on the "NFS Client" in the below relationship:
> >
> > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> >
>
> What sort of server is the Source NFS server here? If it's also Linux,
> then what sort of filesystem is being exported?
>
> > I never dropped the caches on the Re-Export Server (the server running
> > FS-Cache) at any point.
> >
> > However my rsize was lower than my readahead value. I've since corrected that:
> >
> > benmaynard@demo-cluster-1-26hm:~$ cat /proc/mounts | grep nfs
> > 10.0.0.49:/files /srv/nfs/files nfs
> > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > 0 0
> >
> > benmaynard@demo-cluster-1-26hm:~$ findmnt -rnu -t nfs,nfs4 -o MAJ:MIN,TARGET
> > 0:52 /srv/nfs/files
> > benmaynard@demo-cluster-1-26hm:~$ cat /sys/class/bdi/0\:52/read_ahead_kb
> > 512
> >
> > With this configuration I see the same issue, FS-Cache never reads
> > from /var/cache/fscache, and copying the same file always leads to
> > heavy writes to /var/cache/fscache (the cache is overwriting itself).
> >
> > I have also tried this copy without clearing the caches on any server
> > in the chain, and the same happens.
> >
> > Would you expect this behaviour even though rsize > read ahead? Would
> > you expect the referenced patch to fix this?
> >
> > I tried to apply the patch you suggested
> > (https://www.mail-archive.com/linux-cachefs@redhat.com/msg03043.html)
> > but it did not apply cleanly, and I ran out of time to troubleshoot. I
> > should get some more time on Wednesday and I can re-try.
> >
> >
> > Kind Regards
> > Benjamin Maynard
> >
> >
> > Kind Regards
> >
> > Benjamin Maynard
> >
> > Customer Engineer
> >
> > benmaynard@google.com
> >
> > Google, Inc.
> >
> >
> >
> >
> > On Mon, 14 Nov 2022 at 10:41, David Wysochanski <dwysocha@redhat.com> wrote:
> > >
> > > Hi Ben,
> > >
> > > Thanks for testing these patches.  More below.
> > >
> > > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > >
> > > > Hi all,
> > > >
> > > > I've been doing some more testing with these patches, I applied all of
> > > > the patches (v10 from
> > > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > > >
> > > > I have the following setup:
> > > >
> > > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > >
> > > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > > to the NFS Client via the Re-Export Server.
> > > >
> > > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > > re-export server, and once the file copy completes I see that
> > > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > > >
> > > > I then deleted that file from the NFS Client, and dropped the caches
> > > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > > >
> > > > I then performed another copy of the 500Gb file on the NFS Client,
> > > > again via the Re-Export Server. What I expected would happen is that I
> > > > would see heavy reads from the /var/cache/fscache volume as the file
> > > > should be served from FS-Cache.
> > > >
> > > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > > also see heavy writes to /var/cache/fscache, so it appears that
> > > > FS-Cache is overwriting its existing cache, and never using it.
> > > >
> > > Due to use of "drop_caches" this is almost certainly the known issue #1
> > > I mentioned in the opening post of this series:
> > > https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/
> > >
> > > The above issue will be fixed with the following patch which has not
> > > been merged yet:
> > > https://www.mail-archive.com/linux-cachefs@redhat.com/msg03043.html
> > >
> > > Do you have time to do another test to verify that is the case?
> > > If so, I can re-post that patch on top of the first 5 patches in this series,
> > > as well as a second patch that allows NFS to use it.
> > >
> > >
> > > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > > it is not possible that the file is being served from the page cache.
> > > >
> > > > We saw this behaviour before on an older set of the patches when our
> > > > mount between the Re-Export Server and the Source NFS Filer was using
> > > > the "sync" option, but we are now using the "async" option and the
> > > > same is happening.
> > > >
> > > > Mount options:
> > > >
> > > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > > >
> > > > 10.0.0.49:/files /srv/nfs/files nfs
> > > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > >
> > > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > > >
> > > > 10.0.0.3:/files /mnt/nfs nfs
> > > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > > >
> > > > It is also worth noting this behaviour is not unique to the re-export
> > > > use case. I see FS-Cache not being used with the following setup:
> > > >
> > > > Source NFS Server <-- Client (with FS-Cache).
> > > >
> > > > Thanks,
> > > > Ben
> > > >
> > > >
> > > > Kind Regards
> > > >
> > > > Benjamin Maynard
> > > >
> > > > Customer Engineer
> > > >
> > > > benmaynard@google.com
> > > >
> > > > Google, Inc.
> > > >
> > > >
> > > >
> > > >
> > > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > >
> > > > >
> > > > >
> > > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > >
> > > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > > > > >
> > > > > > > > On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > > > > > > On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > > > > > > wrote:
> > > > > > > > > >
> > > > > > > > > > On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > > > > > > > Convert the NFS buffered read code paths to corresponding netfs
> > > > > > > > > > > APIs,
> > > > > > > > > > > but only when fscache is configured and enabled.
> > > > > > > > > > >
> > > > > > > > > > > The netfs API defines struct netfs_request_ops which must be
> > > > > > > > > > > filled
> > > > > > > > > > > in by the network filesystem.  For NFS, we only need to define 5
> > > > > > > > > > > of
> > > > > > > > > > > the functions, the main one being the issue_read() function.
> > > > > > > > > > > The issue_read() function is called by the netfs layer when a
> > > > > > > > > > > read
> > > > > > > > > > > cannot be fulfilled locally, and must be sent to the server
> > > > > > > > > > > (either
> > > > > > > > > > > the cache is not active, or it is active but the data is not
> > > > > > > > > > > available).
> > > > > > > > > > > Once the read from the server is complete, netfs requires a call
> > > > > > > > > > > to
> > > > > > > > > > > netfs_subreq_terminated() which conveys either how many bytes
> > > > > > > > > > > were
> > > > > > > > > > > read
> > > > > > > > > > > successfully, or an error.  Note that issue_read() is called with
> > > > > > > > > > > a
> > > > > > > > > > > structure, netfs_io_subrequest, which defines the IO requested,
> > > > > > > > > > > and
> > > > > > > > > > > contains a start and a length (both in bytes), and assumes the
> > > > > > > > > > > underlying
> > > > > > > > > > > > netfs will return either an error on the whole region, or the
> > > > > > > > > > > number
> > > > > > > > > > > of bytes successfully read.
> > > > > > > > > > >
> > > > > > > > > > > The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > > > > > > > defined
> > > > > > > > > > > in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > > > > > > > to
> > > > > > > > > > > know how many RPCs will be sent and how the pages will be broken
> > > > > > > > > > > up
> > > > > > > > > > > into underlying RPCs, each of which will have their own
> > > > > > > > > > > completion
> > > > > > > > > > > and
> > > > > > > > > > > return code.  In contrast, netfs is subrequest based, a single
> > > > > > > > > > > subrequest may contain multiple pages, and a single subrequest is
> > > > > > > > > > > initiated with issue_read() and terminated with
> > > > > > > > > > > netfs_subreq_terminated().
> > > > > > > > > > > > Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > > > > > > > the netfs API requirement on the single response to the whole
> > > > > > > > > > > subrequest, while also minimizing disruptive changes to the NFS
> > > > > > > > > > > pgio layer.
> > > > > > > > > > >
> > > > > > > > > > > The approach taken with this patch is to allocate a small
> > > > > > > > > > > structure
> > > > > > > > > > > for each nfs_netfs_issue_read() call, store the final error and
> > > > > > > > > > > number
> > > > > > > > > > > of bytes successfully transferred in the structure, and update
> > > > > > > > > > > these
> > > > > > > > > > > values
> > > > > > > > > > > as each RPC completes.  The refcount on the structure is used as
> > > > > > > > > > > a
> > > > > > > > > > > marker
> > > > > > > > > > > for the last RPC completion, is incremented in
> > > > > > > > > > > > nfs_netfs_initiate_read(),
> > > > > > > > > > > and decremented inside nfs_netfs_read_completion(), when a
> > > > > > > > > > > nfs_pgio_header
> > > > > > > > > > > contains a valid pointer to the data.  On the final put (which
> > > > > > > > > > > signals
> > > > > > > > > > > the final outstanding RPC is complete) in
> > > > > > > > > > > nfs_netfs_read_completion(),
> > > > > > > > > > > call netfs_subreq_terminated() with either the final error value
> > > > > > > > > > > (if
> > > > > > > > > > > one or more READs complete with an error) or the number of bytes
> > > > > > > > > > > successfully transferred (if all RPCs complete successfully).
> > > > > > > > > > > Note
> > > > > > > > > > > that when all RPCs complete successfully, the number of bytes
> > > > > > > > > > > transferred
> > > > > > > > > > > is capped to the length of the subrequest.  Capping the
> > > > > > > > > > > transferred
> > > > > > > > > > > length
> > > > > > > > > > > to the subrequest length prevents "Subreq overread" warnings from
> > > > > > > > > > > netfs.
> > > > > > > > > > > This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > > > > > > > the
> > > > > > > > > > > corner case where NFS requests a full page at the end of the
> > > > > > > > > > > file,
> > > > > > > > > > > even when i_size reflects only a partial page (NFS overread).
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > > > > > > > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > This is not doing what I asked for, which was to separate out the
> > > > > > > > > > fscache functionality, so that we can call that if and when it is
> > > > > > > > > > available.
> > > > > > > > > >
> > > > > > > > > I must have misunderstood then.
> > > > > > > > >
> > > > > > > > > The last feedback I have from you was that you wanted it to be
> > > > > > > > > an opt-in feature, and it was a comment on a previous patch
> > > > > > > > > to Kconfig.  I was proceeding the best I knew how, but
> > > > > > > > > let me try to get back on track.
> > > > > > > > >
> > > > > > > > > > Instead, it is just wrapping the NFS requests inside netfs
> > > > > > > > > > requests. As
> > > > > > > > > > it stands, that means it is just duplicating information, and
> > > > > > > > > > adding
> > > > > > > > > > unnecessary overhead to the standard I/O path (extra allocations,
> > > > > > > > > > extra
> > > > > > > > > > indirect calls, and extra bloat to the inode).
> > > > > > > > > >
> > > > > > > > > I think I understand what you're saying but I'm not sure.  Let me
> > > > > > > > > ask some clarifying questions.
> > > > > > > > >
> > > > > > > > > Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > > > > > > configured?  Or when it is not?  Or both?  I think you're objecting
> > > > > > > > > when it's configured, but not enabled (we mount without 'fsc').
> > > > > > > > > Am I right?
> > > > > > > > >
> > > > > > > > > > Also, are you objecting to the design that to use fscache we now
> > > > > > > > > have to use netfs, specifically:
> > > > > > > > > - call into netfs via either netfs_read_folio or netfs_readahead
> > > > > > > > > - if fscache is enabled, then the IO can be satisfied from fscache
> > > > > > > > > - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > > > > > > from the cache, then NFS is called back via netfs_issue_read
> > > > > > > > > and we use the normal NFS read pageio interface.  This requires
> > > > > > > > > we call netfs_subreq_terminated() when all the RPCs complete,
> > > > > > > > > which is the reason for the small changes to pagelist.c
> > > > > > > >
> > > > > > > > I'm objecting to any middle layer "solution" that adds overhead to the
> > > > > > > > NFS I/O paths.
> > > > > > > >
> > > > > > > Got it.
> > > > > > >
> > > > > > > > I'm willing to consider solutions that are specific only to the fscache
> > > > > > > > use case (i.e. when the 'fsc' mount option is specified). However when
> > > > > > > > I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > > > > > extra memory allocations, extra indirect calls and larger inode
> > > > > > > > footprints.
> > > > > > > >
> > > > > > > > IOW: I want the code to optimise for the case of standard NFS, not for
> > > > > > > > the case of 'NFS with cachefs additions'.
> > > > > > > >
> > > > > > > I agree completely.  Are you seeing extra memory allocations
> > > > > > > > happen on mounts without 'fsc' or is it more a concern of how
> > > > > > > some of the patches look?  We should not be calling any netfs or
> > > > > > > fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > > > > testing. So either there's a misunderstanding here, or there's a
> > > > > > > bug I'm missing.
> > > > > > >
> > > > > > > If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > > > > nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > > > > If it's configured but not enabled, then the checks for
> > > > > > > netfs_inode(inode)->cache should skip over any netfs code.
> > > > > > > But maybe there's a non-obvious bug you're seeing and
> > > > > > > somehow netfs is still getting called?  Because I cannot
> > > > > > > see netfs getting called if 'fsc' is not on the mount in my
> > > > > > > tests.
> > > > > > >
> > > > > > > int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > > > > {
> > > > > > >       if (!netfs_inode(folio_inode(folio))->cache)
> > > > > > >               return -ENOBUFS;
> > > > > > >
> > > > > > >       return netfs_read_folio(file, folio);
> > > > > > > }
> > > > > > >
> > > > > > > int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > > > > {
> > > > > > >       struct inode *inode = ractl->mapping->host;
> > > > > > >
> > > > > > >       if (!netfs_inode(inode)->cache)
> > > > > > >               return -ENOBUFS;
> > > > > > >
> > > > > > >       netfs_readahead(ractl);
> > > > > > >       return 0;
> > > > > > > }
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > Can you be more specific as to the portions of the patch you don't
> > > > > > > > > like
> > > > > > > > > so I can move it in the right direction?
> > > > > > > > >
> > > > > > > > > This is from patch #2 which you didn't comment on.  I'm not sure
> > > > > > > > > you're
> > > > > > > > > ok with it though, since you mention "extra bloat to the inode".
> > > > > > > > > Do you object to this even though it's wrapped in an
> > > > > > > > > #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > > > > > > extra size be added to nfs_inode?
> > > > > > > > >
> > > > > > > > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > > >       __u64 write_io;
> > > > > > > > >       __u64 read_io;
> > > > > > > > > #ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > -       struct fscache_cookie   *fscache;
> > > > > > > > > -#endif
> > > > > > > > > +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > > > > > > */
> > > > > > > > > +#else
> > > > > > > > >       struct inode            vfs_inode;
> > > > > > > > > +#endif
> > > > > > > > > +
> > > > > > > >
> > > > > > > > Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > > > > > point, however for now NFS is not unconditionally opting into the netfs
> > > > > > > > project. If we're to ever do that, then I want to see streamlined code
> > > > > > > > for the standard I/O case.
> > > > > > > >
> > > > > > > Ok and understood about standard I/O case.
> > > > > > >
> > > > > > > I was thinking how we might not increase the size, but I don't think
> > > > > > > I can make it work.
> > > > > > >
> > > > > > > I thought we could change to something like the below, without an
> > > > > > > embedded struct inode:
> > > > > > >
> > > > > > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > >       __u64 write_io;
> > > > > > >       __u64 read_io;
> > > > > > > #ifdef CONFIG_NFS_FSCACHE
> > > > > > > -       struct fscache_cookie   *fscache;
> > > > > > > -#endif
> > > > > > > +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > > > > +#else
> > > > > > >       struct inode            vfs_inode;
> > > > > > > +#endif
> > > > > > > +
> > > > > > >
> > > > > > > Then I would need to alloc/free a netfs_inode at the time of
> > > > > > > nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > > > > macro cannot work, because it requires an embedded "struct inode"
> > > > > > > due to "container_of" use:
> > > > > > >
> > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > +{
> > > > > > > +       return &nfsi->netfs.inode;
> > > > > > > +}
> > > > > > > +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > +{
> > > > > > > +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > > > > +}
> > > > > > > +#else
> > > > > > > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > +{
> > > > > > > +       return &nfsi->vfs_inode;
> > > > > > > +}
> > > > > > > static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > {
> > > > > > >       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > > > > }
> > > > > > > +#endif
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > > patch below.  What do you think?
> > > > >
> > > > > That works for me.
> > > > >
> > > > > >
> > > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > > think it's an ok idea I can try to work out what is needed across
> > > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > > case for NFS where fscache is "configured but not enabled",
> > > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > > each time, it will add up so it is worth at least a discussion.
> > > > > >
> > > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > > --- a/include/linux/netfs.h
> > > > > > +++ b/include/linux/netfs.h
> > > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > > >                                     bool was_async);
> > > > > >
> > > > > > -/*
> > > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > > - */
> > > > > > -struct netfs_inode {
> > > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > > +struct netfs_info {
> > > > > >       const struct netfs_request_ops *ops;
> > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > >       struct fscache_cookie   *cache;
> > > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > > };
> > > > > >
> > > > > > +/*
> > > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > > + */
> > > > > > +struct netfs_inode {
> > > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > > +};
> > > > > > +
> > > > > > /*
> > > > > > * Resources required to do operations on a cache.
> > > > > > */
> > > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > > *netfs_inode(struct inode *inode)
> > > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > > >                                   const struct netfs_request_ops *ops)
> > > > > > {
> > > > > > -       ctx->ops = ops;
> > > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > > +       /* FIXME: Check for NULL */
> > > > > > +       ctx->netfs->ops = ops;
> > > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > -       ctx->cache = NULL;
> > > > > > +       ctx->netfs->cache = NULL;
> > > > > > #endif
> > > > > > }
> > > > > >
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Are you ok with the stub functions which are placed in fscache.h, and
> > > > > > > > > when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > > > > > > or a 1-liner (nfs_netfs_readpage_release)?
> > > > > > > > >
> > > > > > > > > #else /* CONFIG_NFS_FSCACHE */
> > > > > > > > > +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > > > > > > +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > > > > > > *hdr) {}
> > > > > > > > > +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > > > > > > *hdr) {}
> > > > > > > > > +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > > > +{
> > > > > > > > > +       unlock_page(req->wb_page);
> > > > > > > > > +}
> > > > > > > > > static inline void nfs_fscache_release_super_cookie(struct
> > > > > > > > > super_block *sb) {}
> > > > > > > > > static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Do you object to the below?  If so, then do you want
> > > > > > > > > #ifdef CONFIG_NFS_FSCACHE here?
> > > > > > > > >
> > > > > > > > > -- a/fs/nfs/inode.c
> > > > > > > > > +++ b/fs/nfs/inode.c
> > > > > > > > > @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > > > > > > super_block *sb)
> > > > > > > > > #ifdef CONFIG_NFS_V4_2
> > > > > > > > >       nfsi->xattr_cache = NULL;
> > > > > > > > > #endif
> > > > > > > > > +       nfs_netfs_inode_init(nfsi);
> > > > > > > > > +
> > > > > > > > >       return VFS_I(nfsi);
> > > > > > > > > }
> > > > > > > > > > EXPORT_SYMBOL_GPL(nfs_alloc_inode);
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > > > > > > how about the below calls to netfs from nfs_read_folio and
> > > > > > > > > nfs_readahead into equivalent netfs calls?  So when
> > > > > > > > > > CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > > > > > > ('fsc' not on mount), these netfs functions do immediately call
> > > > > > > > > netfs_alloc_request().  But I wonder if we could simply add a
> > > > > > > > > check to see if fscache is enabled on the mount, and skip
> > > > > > > > > over to satisfy what you want.  Am I understanding what you
> > > > > > > > > want?
> > > > > > > >
> > > > > > > > Quite frankly, I'd prefer that we just split out the functionality that
> > > > > > > > is needed from the netfs code so that it can be optimised. However I'm
> > > > > > > > not interested enough in the cachefs functionality to work on that
> > > > > > > > myself. ...and as I indicated above, I might be OK with opting into the
> > > > > > > > netfs project, once the overhead can be made to disappear.
> > > > > > > >
> > > > > > > Understood.
> > > > > > >
> > > > > > > If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > > > > functions into a netfs.c file as a starting point.  Or that can maybe
> > > > > > > be done in a future patchset?
> > > > > > >
> > > > > > > For now I was equating netfs and fscache together so we can
> > > > > > > move on from the much older and single-page limiting fscache
> > > > > > > interface that is likely to go away soon.
> > > > > > >
> > > > > > > > >
> > > > > > > > > @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > > > > > > folio *folio)
> > > > > > > > >       if (NFS_STALE(inode))
> > > > > > > > >               goto out_unlock;
> > > > > > > > >
> > > > > > > > > +       ret = nfs_netfs_read_folio(file, folio);
> > > > > > > > > +       if (!ret)
> > > > > > > > > +               goto out;
> > > > > > > > > +
> > > > > > > > >
> > > > > > > > > @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > > > > > > *ractl)
> > > > > > > > >       if (NFS_STALE(inode))
> > > > > > > > >               goto out;
> > > > > > > > >
> > > > > > > > > +       ret = nfs_netfs_readahead(ractl);
> > > > > > > > > +       if (!ret)
> > > > > > > > > +               goto out;
> > > > > > > > > +
> > > > > > > > >
> > > > > > > The above wrappers should prevent any additional overhead when fscache
> > > > > > > is not enabled.  As far as I know these work to avoid calling netfs
> > > > > > > when 'fsc' is not on the mount.
> > > > > > >
> > > > > > > > >
> > > > > > > > > And how about these calls from different points in the read
> > > > > > > > > path to the earlier mentioned stub functions?
> > > > > > > > >
> > > > > > > > > @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > > > > > >
> > > > > > > > > static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > > > {
> > > > > > > > > -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > > >       struct page *page = req->wb_page;
> > > > > > > > >
> > > > > > > > > -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > > > > > > > s_id,
> > > > > > > > > -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > > > > > > -               (long long)req_offset(req));
> > > > > > > > > -
> > > > > > > > >       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > > > > > > ETIMEDOUT)
> > > > > > > > >               SetPageError(page);
> > > > > > > > > -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > > > > > > -               if (PageUptodate(page))
> > > > > > > > > -                       nfs_fscache_write_page(inode, page);
> > > > > > > > > -               unlock_page(page);
> > > > > > > > > -       }
> > > > > > > > > +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > > > +               nfs_netfs_readpage_release(req);
> > > > > > > > > +
> > > > > > > >
> > > > > > > > I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > > > > > going to need to change when we move it to use folios natively anyway.
> > > > > > > >
> > > > > > > Ok, how about I make it conditional on whether fscache is configured
> > > > > > > and enabled then, similar to the nfs_netfs_read_folio() and
> > > > > > > nfs_netfs_readahead()?  Below is what that would look like.
> > > > > > > I could inline the code in nfs_netfs_readpage_release() if you
> > > > > > > think it would be clearer.
> > > > > > >
> > > > > > > static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > {
> > > > > > >       struct page *page = req->wb_page;
> > > > > > >
> > > > > > >       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > > > >               SetPageError(page);
> > > > > > >       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > #ifndef CONFIG_NFS_FSCACHE
> > > > > > >               unlock_page(req->wb_page);
> > > > > > > #else
> > > > > > >               nfs_netfs_readpage_release(req);
> > > > > > > #endif
> > > > > > >       nfs_release_request(req);
> > > > > > > }
> > > > > > >
> > > > > > >
> > > > > > > void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > {
> > > > > > >   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > >
> > > > > > >   /*
> > > > > > >    * If fscache is enabled, netfs will unlock pages.
> > > > > > >    */
> > > > > > >   if (netfs_inode(inode)->cache)
> > > > > > >       return;
> > > > > > >
> > > > > > >   unlock_page(req->wb_page);
> > > > > > > }
> > > > > > >
> > > > > > >
> > > > > > > > >       nfs_release_request(req);
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > > @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > > > > > > nfs_pgio_header *hdr)
> > > > > > > > >               nfs_list_remove_request(req);
> > > > > > > > >               nfs_readpage_release(req, error);
> > > > > > > > >       }
> > > > > > > > > +       nfs_netfs_read_completion(hdr);
> > > > > > > > > +
> > > > > > > > > out:
> > > > > > > > >       hdr->release(hdr);
> > > > > > > > > }
> > > > > > > > > @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > > > > > > nfs_pgio_header *hdr,
> > > > > > > > >                             struct rpc_task_setup *task_setup_data,
> > > > > > > > > int how)
> > > > > > > > > {
> > > > > > > > >       rpc_ops->read_setup(hdr, msg);
> > > > > > > > > +       nfs_netfs_initiate_read(hdr);
> > > > > > > > >       trace_nfs_initiate_read(hdr);
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Are you ok with these additions?  Something like this would
> > > > > > > > > be required in the case of fscache configured and enabled,
> > > > > > > > > because we could have some of the data in a read in
> > > > > > > > > fscache, and some not.  That is the reason for the netfs
> > > > > > > > > design, and why we need to be able to call the normal
> > > > > > > > > NFS read IO path (netfs calls into issue_read, and we call
> > > > > > > > > back via netfs_subreq_terminated)?
> > > > > > > > >
> > > > > > > > > @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > > > > > >       struct pnfs_layout_segment *pg_lseg;
> > > > > > > > >       struct nfs_io_completion *pg_io_completion;
> > > > > > > > >       struct nfs_direct_req   *pg_dreq;
> > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > +       void                    *pg_netfs;
> > > > > > > > > +#endif
> > > > > > > > >
> > > > > > > > > @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > > > > > >       const struct nfs_rw_ops *rw_ops;
> > > > > > > > >       struct nfs_io_completion *io_completion;
> > > > > > > > >       struct nfs_direct_req   *dreq;
> > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > +       void                    *netfs;
> > > > > > > > > +#endif
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > And these additions to pagelist.c?
> > > > > > > > >
> > > > > > > > > @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > > > > > > nfs_pageio_descriptor *desc,
> > > > > > > > >       hdr->good_bytes = mirror->pg_count;
> > > > > > > > >       hdr->io_completion = desc->pg_io_completion;
> > > > > > > > >       hdr->dreq = desc->pg_dreq;
> > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > +       if (desc->pg_netfs)
> > > > > > > > > +               hdr->netfs = desc->pg_netfs;
> > > > > > > > > +#endif
> > > > > > > >
> > > > > > > > Why the conditional?
> > > > > > > >
> > > > > > > Not really needed and I was thinking of removing it, so I'll do that.
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > > > > > > *desc,
> > > > > > > > >       desc->pg_lseg = NULL;
> > > > > > > > >       desc->pg_io_completion = NULL;
> > > > > > > > >       desc->pg_dreq = NULL;
> > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > +       desc->pg_netfs = NULL;
> > > > > > > > > +#endif
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > > > > > > nfs_pageio_descriptor *desc,
> > > > > > > > >
> > > > > > > > >       desc->pg_io_completion = hdr->io_completion;
> > > > > > > > >       desc->pg_dreq = hdr->dreq;
> > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > +       desc->pg_netfs = hdr->netfs;
> > > > > > > > > +#endif
> > > > > > > >
> > > > > > > > Those all need wrapper functions instead of embedding #ifdefs.
> > > > > > > >
> > > > > > > Ok.
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > My expectation is that the standard I/O path should have minimal
> > > > > > > > > > overhead, and should certainly not increase the overhead that we
> > > > > > > > > > already have. Will this be addressed in future iterations of these
> > > > > > > > > > patches?
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > I will do what I can to satisfy what you want, either by fixing up
> > > > > > > > > this patch or follow-on patches.  Hopefully the above questions
> > > > > > > > > will clarify the next steps.
> > > > > > > > >
> > > > > > > >
> > > > > > > > --
> > > > > > > > Trond Myklebust
> > > > > > > > Linux NFS client maintainer, Hammerspace
> > > > > > > > trond.myklebust@hammerspace.com
> > > > >
> > > > >
> > > > >
> > > > > Trond Myklebust
> > > > > CTO, Hammerspace Inc
> > > > > 1900 S Norfolk St, Suite 350 - #45
> > > > > San Mateo, CA 94403
> > > > >
> > > > > www.hammer.space
> > > > >
> > > > >
> > > >
> > >
>
> --
> Jeff Layton <jlayton@poochiereds.net>
Daire Byrne Nov. 14, 2022, 1:33 p.m. UTC | #12
On Mon, 14 Nov 2022 at 12:44, Benjamin Maynard <benmaynard@google.com> wrote:
>
> Thanks Dave for getting back to me so quickly.
>
> > Due to use of "drop_caches" this is almost certainly the known issue #1
> > I mentioned in the opening post of this series:
> > https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/
>
> Apologies, I completely missed the known issues in the original
> opening message of the series. Just to clarify, I was only ever
> dropping the caches on the "NFS Client" in the below relationship:
>
> Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
>
> I never dropped the caches on the Re-Export Server (the server running
> FS-Cache) at any point.

So I have never actually done that particular test (I will need to
verify, but I think I would have noticed by now), but dropping caches
on the re-export server definitely caused repeat reads from "source"
NFS server and (re)writes to the fscache disk *without* David's
suggested patch. With the patch, you can drop caches on the re-export
server and get the repeat reads coming from the fscache disk as
expected.

You can certainly try that test too (just source NFS server ->
FS-cache client - read,drop cache,read).

I'm not sure about your particular test, but your re-export server
must have dropped the file from memory otherwise you would see the
repeat read just coming from page cache (with no fscache or disk cache
interaction required)? So I'll assume the file in question is too
large to fit into memory and effectively the cache has been dropped.
So I think the suggested patch on the re-export server will fix that
issue.

I should also add that I had this series working well (+suggested
patch) and the performance to/from disk cache is an order of magnitude
better than mainline (40MB/s vs 5000MB/s with NVMe), but it did expose
a race condition in the fscache use/unuse cookie code (David is
aware). In the NFS re-export case, we have lots of knfsd threads
thrashing around the netfs/fscache functions.

Daire
David Wysochanski Nov. 14, 2022, 1:46 p.m. UTC | #13
I apologize I did not read carefully enough and I missed some details
in your original post.
More below.

On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
>
> Hi all,
>
> I've been doing some more testing with these patches, I applied all of
> the patches (v10 from
> https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
>
> I have the following setup:
>
> Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
>
> I have a 500Gb file on the Source NFS Server, which I am then copying
> to the NFS Client via the Re-Export Server.
>
> On the first copy, I see heavy writes to /var/cache/fscache on the
> re-export server, and once the file copy completes I see that
> /var/cache/fscache is approximately 500Gb in size. All good so far.
>
> I then deleted that file from the NFS Client, and dropped the caches
> just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
>
If you delete the file from the NFS client, how does that not delete the
file from the original NFS server?

> I then performed another copy of the 500Gb file on the NFS Client,
> again via the Re-Export Server. What I expected would happen is that I
> would see heavy reads from the /var/cache/fscache volume as the file
> should be served from FS-Cache.
>
I don't understand this.  When you say you "performed another copy"
of what file?  Wasn't the file deleted in the above step?

> However what I actually saw was no reads whatsoever, FS-Cache seems to
> be ignored and the file is pulled from the Source NFS Filer again. I
> also see heavy writes to /var/cache/fscache, so it appears that
> FS-Cache is overwriting its existing cache, and never using it.

That would happen if the file was changed or re-created.

> I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> it is not possible that the file is being served from the page cache.
>
> We saw this behaviour before on an older set of the patches when our
> mount between the Re-Export Server and the Source NFS Filer was using
> the "sync" option, but we are now using the "async" option and the
> same is happening.
>
> Mount options:
>
> Source NFS Server <-- Re-Export Server (with FS-Cache):
>
> 10.0.0.49:/files /srv/nfs/files nfs
> rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
>
> Re-Export Server (with FS-Cache) <-- NFS Client:
>
> 10.0.0.3:/files /mnt/nfs nfs
> rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
>
> It is also worth noting this behaviour is not unique to the re-export
> use case. I see FS-Cache not being used with the following setup:
>
> Source NFS Server <-- Client (with FS-Cache).
>

This points at something more fundamental like something missed
in the test or maybe a mount option.  Can you explain what test
you're doing here when you say "this behavior is not unique"?

Can you show the mount options for both:
- fscache filesystem on the re-export server (/var/cache/fscache)
- exported filesystem on the NFS server (filesystem in /etc/exports)

Unfortunately the problem with drop_caches makes it more difficult
to know when fscache is truly working.  But some other unit test
I have shows fscache does work with this patchset so I'm puzzled why
you're not seeing it work at all.

I pinged dhowells on the drop_caches issue so maybe we can get
that one sorted out soon but I'm not sure since it's part of a series
and proposes changes in mm.

> Thanks,
> Ben
>
>
> Kind Regards
>
> Benjamin Maynard
>
> Customer Engineer
>
> benmaynard@google.com
>
> Google, Inc.
>
>
>
>
> On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> >
> >
> >
> > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > >
> > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > >>
> > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > >>>
> > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > >>>> wrote:
> > >>>>>
> > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > >>>>>> APIs,
> > >>>>>> but only when fscache is configured and enabled.
> > >>>>>>
> > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > >>>>>> filled
> > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > >>>>>> of
> > >>>>>> the functions, the main one being the issue_read() function.
> > >>>>>> The issue_read() function is called by the netfs layer when a
> > >>>>>> read
> > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > >>>>>> (either
> > >>>>>> the cache is not active, or it is active but the data is not
> > >>>>>> available).
> > >>>>>> Once the read from the server is complete, netfs requires a call
> > >>>>>> to
> > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > >>>>>> were
> > >>>>>> read
> > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > >>>>>> a
> > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > >>>>>> and
> > >>>>>> contains a start and a length (both in bytes), and assumes the
> > >>>>>> underlying
> > >>>>>> netfs will return either an error on the whole region, or the
> > >>>>>> number
> > >>>>>> of bytes successfully read.
> > >>>>>>
> > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > >>>>>> defined
> > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > >>>>>> to
> > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > >>>>>> up
> > >>>>>> into underlying RPCs, each of which will have their own
> > >>>>>> completion
> > >>>>>> and
> > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > >>>>>> initiated with issue_read() and terminated with
> > >>>>>> netfs_subreq_terminated().
> > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > >>>>>> the netfs API requirement on the single response to the whole
> > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > >>>>>> pgio layer.
> > >>>>>>
> > >>>>>> The approach taken with this patch is to allocate a small
> > >>>>>> structure
> > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > >>>>>> number
> > >>>>>> of bytes successfully transferred in the structure, and update
> > >>>>>> these
> > >>>>>> values
> > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > >>>>>> a
> > >>>>>> marker
> > >>>>>> for the last RPC completion, is incremented in
> > >>>>>> nfs_netfs_read_initiate(),
> > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > >>>>>> nfs_pgio_header
> > >>>>>> contains a valid pointer to the data.  On the final put (which
> > >>>>>> signals
> > >>>>>> the final outstanding RPC is complete) in
> > >>>>>> nfs_netfs_read_completion(),
> > >>>>>> call netfs_subreq_terminated() with either the final error value
> > >>>>>> (if
> > >>>>>> one or more READs complete with an error) or the number of bytes
> > >>>>>> successfully transferred (if all RPCs complete successfully).
> > >>>>>> Note
> > >>>>>> that when all RPCs complete successfully, the number of bytes
> > >>>>>> transferred
> > >>>>>> is capped to the length of the subrequest.  Capping the
> > >>>>>> transferred
> > >>>>>> length
> > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > >>>>>> netfs.
> > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > >>>>>> the
> > >>>>>> corner case where NFS requests a full page at the end of the
> > >>>>>> file,
> > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > >>>>>>
> > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > >>>>>
> > >>>>>
> > >>>>> This is not doing what I asked for, which was to separate out the
> > >>>>> fscache functionality, so that we can call that if and when it is
> > >>>>> available.
> > >>>>>
> > >>>> I must have misunderstood then.
> > >>>>
> > >>>> The last feedback I have from you was that you wanted it to be
> > >>>> an opt-in feature, and it was a comment on a previous patch
> > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > >>>> let me try to get back on track.
> > >>>>
> > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > >>>>> requests. As
> > >>>>> it stands, that means it is just duplicating information, and
> > >>>>> adding
> > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > >>>>> extra
> > >>>>> indirect calls, and extra bloat to the inode).
> > >>>>>
> > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > >>>> ask some clarifying questions.
> > >>>>
> > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > >>>> Am I right?
> > >>>>
> > >>>> Also, are you objecting to the design that to use fscache we now
> > >>>> have to use netfs, specifically:
> > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > >>>> from the cache, then NFS is called back via netfs_issue_read
> > >>>> and we use the normal NFS read pageio interface.  This requires
> > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > >>>> which is the reason for the small changes to pagelist.c
> > >>>
> > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > >>> NFS I/O paths.
> > >>>
> > >> Got it.
> > >>
> > >>> I'm willing to consider solutions that are specific only to the fscache
> > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > >>> extra memory allocations, extra indirect calls and larger inode
> > >>> footprints.
> > >>>
> > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > >>> the case of 'NFS with cachefs additions'.
> > >>>
> > >> I agree completely.  Are you seeing extra memory allocations
> > >> happen on mounts without 'fsc' or is it more a concern of how
> > >> some of the patches look?  We should not be calling any netfs or
> > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > >> testing. So either there's a misunderstanding here, or there's a
> > >> bug I'm missing.
> > >>
> > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > >> If it's configured but not enabled, then the checks for
> > >> netfs_inode(inode)->cache should skip over any netfs code.
> > >> But maybe there's a non-obvious bug you're seeing and
> > >> somehow netfs is still getting called?  Because I cannot
> > >> see netfs getting called if 'fsc' is not on the mount in my
> > >> tests.
> > >>
> > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > >> {
> > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > >>               return -ENOBUFS;
> > >>
> > >>       return netfs_read_folio(file, folio);
> > >> }
> > >>
> > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > >> {
> > >>       struct inode *inode = ractl->mapping->host;
> > >>
> > >>       if (!netfs_inode(inode)->cache)
> > >>               return -ENOBUFS;
> > >>
> > >>       netfs_readahead(ractl);
> > >>       return 0;
> > >> }
> > >>
> > >>
> > >>>>
> > >>>> Can you be more specific as to the portions of the patch you don't
> > >>>> like
> > >>>> so I can move it in the right direction?
> > >>>>
> > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > >>>> you're
> > >>>> ok with it though, since you mention "extra bloat to the inode".
> > >>>> Do you object to this even though it's wrapped in an
> > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > >>>> extra size be added to nfs_inode?
> > >>>>
> > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > >>>>       __u64 write_io;
> > >>>>       __u64 read_io;
> > >>>> #ifdef CONFIG_NFS_FSCACHE
> > >>>> -       struct fscache_cookie   *fscache;
> > >>>> -#endif
> > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > >>>> */
> > >>>> +#else
> > >>>>       struct inode            vfs_inode;
> > >>>> +#endif
> > >>>> +
> > >>>
> > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > >>> point, however for now NFS is not unconditionally opting into the netfs
> > >>> project. If we're to ever do that, then I want to see streamlined code
> > >>> for the standard I/O case.
> > >>>
> > >> Ok and understood about standard I/O case.
> > >>
> > >> I was thinking how we might not increase the size, but I don't think
> > >> I can make it work.
> > >>
> > >> I thought we could change to something like the below, without an
> > >> embedded struct inode:
> > >>
> > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > >>       __u64 write_io;
> > >>       __u64 read_io;
> > >> #ifdef CONFIG_NFS_FSCACHE
> > >> -       struct fscache_cookie   *fscache;
> > >> -#endif
> > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > >> +#else
> > >>       struct inode            vfs_inode;
> > >> +#endif
> > >> +
> > >>
> > >> Then I would need to alloc/free a netfs_inode at the time of
> > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > >> macro cannot work, because it requires an embedded "struct inode"
> > >> due to "container_of" use:
> > >>
> > >> +#ifdef CONFIG_NFS_FSCACHE
> > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > >> +{
> > >> +       return &nfsi->netfs.inode;
> > >> +}
> > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > >> +{
> > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > >> +}
> > >> +#else
> > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > >> +{
> > >> +       return &nfsi->vfs_inode;
> > >> +}
> > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > >> {
> > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > >> }
> > >> +#endif
> > >>
> > >>
> > >
> > > Actually Trond maybe we can achieve a "0 length increase" of
> > > nfs_inode if dhowells would take a patch to modify the definition
> > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > patch below.  What do you think?
> >
> > That works for me.
> >
> > >
> > > I think maybe this could be a follow-on patch and if you/dhowells
> > > think it's an ok idea I can try to work out what is needed across
> > > the tree.  I thought about it more and I kinda agree that in the
> > > case for NFS where fscache is "configured but not enabled",
> > > then even though we're only adding 24 bytes to the nfs_inode
> > > each time, it will add up so it is worth at least a discussion.
> > >
> > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > index f2402ddeafbf..195714f1c355 100644
> > > --- a/include/linux/netfs.h
> > > +++ b/include/linux/netfs.h
> > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > >                                     bool was_async);
> > >
> > > -/*
> > > - * Per-inode context.  This wraps the VFS inode.
> > > - */
> > > -struct netfs_inode {
> > > -       struct inode            inode;          /* The VFS inode */
> > > +struct netfs_info {
> > >       const struct netfs_request_ops *ops;
> > > #if IS_ENABLED(CONFIG_FSCACHE)
> > >       struct fscache_cookie   *cache;
> > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > };
> > >
> > > +/*
> > > + * Per-inode context.  This wraps the VFS inode.
> > > + */
> > > +struct netfs_inode {
> > > +       struct inode            inode;          /* The VFS inode */
> > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > +};
> > > +
> > > /*
> > > * Resources required to do operations on a cache.
> > > */
> > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > *netfs_inode(struct inode *inode)
> > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > >                                   const struct netfs_request_ops *ops)
> > > {
> > > -       ctx->ops = ops;
> > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > +       /* FIXME: Check for NULL */
> > > +       ctx->netfs->ops = ops;
> > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > -       ctx->cache = NULL;
> > > +       ctx->netfs->cache = NULL;
> > > #endif
> > > }
> > >
> > >
> > >
> > >>
> > >>>>
> > >>>>
> > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > >>>>
> > >>>> #else /* CONFIG_NFS_FSCACHE */
> > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > >>>> *hdr) {}
> > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > >>>> *hdr) {}
> > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > >>>> +{
> > >>>> +       unlock_page(req->wb_page);
> > >>>> +}
> > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > >>>> super_block *sb) {}
> > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > >>>>
> > >>>>
> > >>>> Do you object to the below?  If so, then do you want
> > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > >>>>
> > >>>> -- a/fs/nfs/inode.c
> > >>>> +++ b/fs/nfs/inode.c
> > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > >>>> super_block *sb)
> > >>>> #ifdef CONFIG_NFS_V4_2
> > >>>>       nfsi->xattr_cache = NULL;
> > >>>> #endif
> > >>>> +       nfs_netfs_inode_init(nfsi);
> > >>>> +
> > >>>>       return VFS_I(nfsi);
> > >>>> }
> > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_i
> > >>>> node);
> > >>>>
> > >>>>
> > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > >>>> how about the below calls to netfs from nfs_read_folio and
> > >>>> nfs_readahead into equivalent netfs calls?  So when
> > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > >>>> check to see if fscache is enabled on the mount, and skip
> > >>>> over to satisfy what you want.  Am I understanding what you
> > >>>> want?
> > >>>
> > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > >>> is needed from the netfs code so that it can be optimised. However I'm
> > >>> not interested enough in the cachefs functionality to work on that
> > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > >>> netfs project, once the overhead can be made to disappear.
> > >>>
> > >> Understood.
> > >>
> > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > >> be done in a future patchset?
> > >>
> > >> For now I was equating netfs and fscache together so we can
> > >> move on from the much older and single-page limiting fscache
> > >> interface that is likely to go away soon.
> > >>
> > >>>>
> > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > >>>> folio *folio)
> > >>>>       if (NFS_STALE(inode))
> > >>>>               goto out_unlock;
> > >>>>
> > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > >>>> +       if (!ret)
> > >>>> +               goto out;
> > >>>> +
> > >>>>
> > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > >>>> *ractl)
> > >>>>       if (NFS_STALE(inode))
> > >>>>               goto out;
> > >>>>
> > >>>> +       ret = nfs_netfs_readahead(ractl);
> > >>>> +       if (!ret)
> > >>>> +               goto out;
> > >>>> +
> > >>>>
> > >> The above wrappers should prevent any additional overhead when fscache
> > >> is not enabled.  As far as I know these work to avoid calling netfs
> > >> when 'fsc' is not on the mount.
> > >>
> > >>>>
> > >>>> And how about these calls from different points in the read
> > >>>> path to the earlier mentioned stub functions?
> > >>>>
> > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > >>>>
> > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > >>>> {
> > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > >>>>       struct page *page = req->wb_page;
> > >>>>
> > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > >>>>> s_id,
> > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > >>>> -               (long long)req_offset(req));
> > >>>> -
> > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > >>>> ETIMEDOUT)
> > >>>>               SetPageError(page);
> > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > >>>> -               if (PageUptodate(page))
> > >>>> -                       nfs_fscache_write_page(inode, page);
> > >>>> -               unlock_page(page);
> > >>>> -       }
> > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > >>>> +               nfs_netfs_readpage_release(req);
> > >>>> +
> > >>>
> > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > >>> going to need to change when we move it to use folios natively anyway.
> > >>>
> > >> Ok, how about I make it conditional on whether fscache is configured
> > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > >> nfs_netfs_readahead()?  Below is what that would look like.
> > >> I could inline the code in nfs_netfs_readpage_release() if you
> > >> think it would be clearer.
> > >>
> > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > >> {
> > >>       struct page *page = req->wb_page;
> > >>
> > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > >>               SetPageError(page);
> > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > >> #ifndef CONFIG_NFS_FSCACHE
> > >>               unlock_page(req->wb_page);
> > >> #else
> > >>               nfs_netfs_readpage_release(req);
> > >> #endif
> > >>       nfs_release_request(req);
> > >> }
> > >>
> > >>
> > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > >> {
> > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > >>
> > >>   /*
> > >>    * If fscache is enabled, netfs will unlock pages.
> > >>    */
> > >>   if (netfs_inode(inode)->cache)
> > >>       return;
> > >>
> > >>   unlock_page(req->wb_page);
> > >> }
> > >>
> > >>
> > >>>>       nfs_release_request(req);
> > >>>> }
> > >>>>
> > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > >>>> nfs_pgio_header *hdr)
> > >>>>               nfs_list_remove_request(req);
> > >>>>               nfs_readpage_release(req, error);
> > >>>>       }
> > >>>> +       nfs_netfs_read_completion(hdr);
> > >>>> +
> > >>>> out:
> > >>>>       hdr->release(hdr);
> > >>>> }
> > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > >>>> nfs_pgio_header *hdr,
> > >>>>                             struct rpc_task_setup *task_setup_data,
> > >>>> int how)
> > >>>> {
> > >>>>       rpc_ops->read_setup(hdr, msg);
> > >>>> +       nfs_netfs_initiate_read(hdr);
> > >>>>       trace_nfs_initiate_read(hdr);
> > >>>> }
> > >>>>
> > >>>>
> > >>>> Are you ok with these additions?  Something like this would
> > >>>> be required in the case of fscache configured and enabled,
> > >>>> because we could have some of the data in a read in
> > >>>> fscache, and some not.  That is the reason for the netfs
> > >>>> design, and why we need to be able to call the normal
> > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > >>>> back via netfs_subreq_terminated)?
> > >>>>
> > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > >>>>       struct pnfs_layout_segment *pg_lseg;
> > >>>>       struct nfs_io_completion *pg_io_completion;
> > >>>>       struct nfs_direct_req   *pg_dreq;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       void                    *pg_netfs;
> > >>>> +#endif
> > >>>>
> > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > >>>>       const struct nfs_rw_ops *rw_ops;
> > >>>>       struct nfs_io_completion *io_completion;
> > >>>>       struct nfs_direct_req   *dreq;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       void                    *netfs;
> > >>>> +#endif
> > >>>>
> > >>>>
> > >>>> And these additions to pagelist.c?
> > >>>>
> > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > >>>> nfs_pageio_descriptor *desc,
> > >>>>       hdr->good_bytes = mirror->pg_count;
> > >>>>       hdr->io_completion = desc->pg_io_completion;
> > >>>>       hdr->dreq = desc->pg_dreq;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       if (desc->pg_netfs)
> > >>>> +               hdr->netfs = desc->pg_netfs;
> > >>>> +#endif
> > >>>
> > >>> Why the conditional?
> > >>>
> > >> Not really needed and I was thinking of removing it, so I'll do that.
> > >>
> > >>>>
> > >>>>
> > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > >>>> *desc,
> > >>>>       desc->pg_lseg = NULL;
> > >>>>       desc->pg_io_completion = NULL;
> > >>>>       desc->pg_dreq = NULL;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       desc->pg_netfs = NULL;
> > >>>> +#endif
> > >>>>
> > >>>>
> > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > >>>> nfs_pageio_descriptor *desc,
> > >>>>
> > >>>>       desc->pg_io_completion = hdr->io_completion;
> > >>>>       desc->pg_dreq = hdr->dreq;
> > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > >>>> +       desc->pg_netfs = hdr->netfs;
> > >>>> +#endif
> > >>>
> > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > >>>
> > >> Ok.
> > >>
> > >>
> > >>
> > >>>>
> > >>>>
> > >>>>> My expectation is that the standard I/O path should have minimal
> > >>>>> overhead, and should certainly not increase the overhead that we
> > >>>>> already have. Will this be addressed in future iterations of these
> > >>>>> patches?
> > >>>>>
> > >>>>
> > >>>> I will do what I can to satisfy what you want, either by fixing up
> > >>>> this patch or follow-on patches.  Hopefully the above questions
> > >>>> will clarify the next steps.
> > >>>>
> > >>>
> > >>> --
> > >>> Trond Myklebust
> > >>> Linux NFS client maintainer, Hammerspace
> > >>> trond.myklebust@hammerspace.com
> >
> >
> >
> > Trond Myklebust
> > CTO, Hammerspace Inc
> > 1900 S Norfolk St, Suite 350 - #45
> > San Mateo, CA 94403
> >
> > www.hammer.space
> >
> >
>
Jeff Layton Nov. 14, 2022, 1:53 p.m. UTC | #14
Ok. You might be running into the problem of ext4 bumping the i_version
on atime updates. We have some patches in flight that should make it into
v6.2 to fix that.

fscache uses the NFS change attribute to tell whether the cached version
of the data is stale or not, so a false bump due to an atime update can
cause spurious cache invalidations.

You can mount the ext4 partition with "-o noatime" to work around the
issue (if it's the same problem), or try using btrfs (xfs doesn't have a
fix at the moment).

That said, you mentioned earlier:

> I then deleted that file from the NFS Client, and dropped the caches
> just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> 
> I then performed another copy of the 500Gb file on the NFS Client,
> again via the Re-Export Server. What I expected would happen is that I
> would see heavy reads from the /var/cache/fscache volume as the file
> should be served from FS-Cache.
> 
> However what I actually saw was no reads whatsoever, FS-Cache seems to
> be ignored and the file is pulled from the Source NFS Filer again. I
> also see heavy writes to /var/cache/fscache, so it appears that
> FS-Cache is overwriting its existing cache, and never using it.
> 

Why would there be any data to download from the source NFS server at
all if you deleted the file on the client? The reexporting server would
have unlinked the file too, and would (likely) have purged it from its
cache.


On Mon, 2022-11-14 at 13:14 +0000, Benjamin Maynard wrote:
> The source server is Linux, exporting an ext4 filesystem.
> 
> benmaynard@bjmtesting-source:~$ cat /etc/lsb-release
> DISTRIB_ID=Ubuntu
> DISTRIB_RELEASE=20.04
> DISTRIB_CODENAME=focal
> DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"
> 
> benmaynard@bjmtesting-source:~$ uname -r
> 5.15.0-1021-gcp
> 
> benmaynard@bjmtesting-source:~$ df -Th
> Filesystem     Type      Size  Used Avail Use% Mounted on
> /dev/root      ext4      194G  2.5G  192G   2% /
> /dev/sdb1      ext4      2.0T  501G  1.4T  27% /files
> 
> benmaynard@bjmtesting-source:~$ cat /etc/exports
> /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> 
> 
> Kind Regards
> Benjamin Maynard
> 
> 
> Kind Regards
> 
> Benjamin Maynard
> 
> Customer Engineer
> 
> benmaynard@google.com
> 
> Google, Inc.
> 
> 
> 
> 
> On Mon, 14 Nov 2022 at 13:07, Jeff Layton <jlayton@poochiereds.net> wrote:
> > 
> > On Mon, 2022-11-14 at 12:42 +0000, Benjamin Maynard wrote:
> > > Thanks Dave for getting back to me so quickly.
> > > 
> > > > Due to use of "drop_caches" this is almost certainly the known issue #1
> > > > I mentioned in the opening post of this series:
> > > > https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/
> > > 
> > > Apologies, I completely missed the known issues in the original
> > > opening message of the series. Just to clarify, I was only ever
> > > dropping the caches on the "NFS Client" in the below relationship:
> > > 
> > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > 
> > 
> > What sort of server is the Source NFS server here? If it's also Linux,
> > then what sort of filesystem is being exported?
> > 
> > > I never dropped the caches on the Re-Export Server (the server running
> > > FS-Cache) at any point.
> > > 
> > > However my rsize was lower than my readahead value. I've since corrected that:
> > > 
> > > benmaynard@demo-cluster-1-26hm:~$ cat /proc/mounts | grep nfs
> > > 10.0.0.49:/files /srv/nfs/files nfs
> > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > 0 0
> > > 
> > > benmaynard@demo-cluster-1-26hm:~$ findmnt -rnu -t nfs,nfs4 -o MAJ:MIN,TARGET
> > > 0:52 /srv/nfs/files
> > > benmaynard@demo-cluster-1-26hm:~$ cat /sys/class/bdi/0\:52/read_ahead_kb
> > > 512
> > > 
> > > With this configuration I see the same issue, FS-Cache never reads
> > > from /var/cache/fscache, and copying the same file always leads to
> > > heavy writes to /var/cache/fscache (the cache is overwriting itself).
> > > 
> > > I have also tried this copy without clearing the caches on any server
> > > in the chain, and the same happens.
> > > 
> > > Would you expect this behaviour even though rsize > read ahead? Would
> > > you expect the referenced patch to fix this?
> > > 
> > > I tried to apply the patch you suggested
> > > (https://www.mail-archive.com/linux-cachefs@redhat.com/msg03043.html)
> > > but it did not apply cleanly, and I ran out of time to troubleshoot. I
> > > should get some more time on Wednesday and I can re-try.
> > > 
> > > 
> > > Kind Regards
> > > Benjamin Maynard
> > > 
> > > 
> > > Kind Regards
> > > 
> > > Benjamin Maynard
> > > 
> > > Customer Engineer
> > > 
> > > benmaynard@google.com
> > > 
> > > Google, Inc.
> > > 
> > > 
> > > 
> > > 
> > > On Mon, 14 Nov 2022 at 10:41, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > 
> > > > Hi Ben,
> > > > 
> > > > Thanks for testing these patches.  More below.
> > > > 
> > > > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > > > 
> > > > > Hi all,
> > > > > 
> > > > > I've been doing some more testing with these patches, I applied all of
> > > > > the patches (v10 from
> > > > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > > > > 
> > > > > I have the following setup:
> > > > > 
> > > > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > > > 
> > > > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > > > to the NFS Client via the Re-Export Server.
> > > > > 
> > > > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > > > re-export server, and once the file copy completes I see that
> > > > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > > > > 
> > > > > I then deleted that file from the NFS Client, and dropped the caches
> > > > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > > > > 
> > > > > I then performed another copy of the 500Gb file on the NFS Client,
> > > > > again via the Re-Export Server. What I expected would happen is that I
> > > > > would see heavy reads from the /var/cache/fscache volume as the file
> > > > > should be served from FS-Cache.
> > > > > 
> > > > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > > > also see heavy writes to /var/cache/fscache, so it appears that
> > > > > FS-Cache is overwriting its existing cache, and never using it.
> > > > > 
> > > > Due to use of "drop_caches" this is almost certainly the known issue #1
> > > > I mentioned in the opening post of this series:
> > > > https://lore.kernel.org/all/20221103161637.1725471-1-dwysocha@redhat.com/
> > > > 
> > > > The above issue will be fixed with the following patch which has not
> > > > been merged yet:
> > > > https://www.mail-archive.com/linux-cachefs@redhat.com/msg03043.html
> > > > 
> > > > Do you have time to do another test to verify that is the case?
> > > > If so, I can re-post that patch on top of the first 5 patches in this series,
> > > > as well as a second patch that allows NFS to use it.
> > > > 
> > > > 
> > > > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > > > it is not possible that the file is being served from the page cache.
> > > > > 
> > > > > We saw this behaviour before on an older set of the patches when our
> > > > > mount between the Re-Export Server and the Source NFS Filer was using
> > > > > the "sync" option, but we are now using the "async" option and the
> > > > > same is happening.
> > > > > 
> > > > > Mount options:
> > > > > 
> > > > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > > > > 
> > > > > 10.0.0.49:/files /srv/nfs/files nfs
> > > > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > > > 
> > > > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > > > > 
> > > > > 10.0.0.3:/files /mnt/nfs nfs
> > > > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > > > > 
> > > > > It is also worth noting this behaviour is not unique to the re-export
> > > > > use case. I see FS-Cache not being used with the following setup:
> > > > > 
> > > > > Source NFS Server <-- Client (with FS-Cache).
> > > > > 
> > > > > Thanks,
> > > > > Ben
> > > > > 
> > > > > 
> > > > > Kind Regards
> > > > > 
> > > > > Benjamin Maynard
> > > > > 
> > > > > Customer Engineer
> > > > > 
> > > > > benmaynard@google.com
> > > > > 
> > > > > Google, Inc.
> > > > > 
> > > > > 
> > > > > 
> > > > > 
> > > > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > > 
> > > > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > > > 
> > > > > > > > On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > > > > > > 
> > > > > > > > > On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > > > > > > > On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > > > > > > > wrote:
> > > > > > > > > > > 
> > > > > > > > > > > On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > > > > > > > > Convert the NFS buffered read code paths to corresponding netfs
> > > > > > > > > > > > APIs,
> > > > > > > > > > > > but only when fscache is configured and enabled.
> > > > > > > > > > > > 
> > > > > > > > > > > > The netfs API defines struct netfs_request_ops which must be
> > > > > > > > > > > > filled
> > > > > > > > > > > > in by the network filesystem.  For NFS, we only need to define 5
> > > > > > > > > > > > of
> > > > > > > > > > > > the functions, the main one being the issue_read() function.
> > > > > > > > > > > > The issue_read() function is called by the netfs layer when a
> > > > > > > > > > > > read
> > > > > > > > > > > > cannot be fulfilled locally, and must be sent to the server
> > > > > > > > > > > > (either
> > > > > > > > > > > > the cache is not active, or it is active but the data is not
> > > > > > > > > > > > available).
> > > > > > > > > > > > Once the read from the server is complete, netfs requires a call
> > > > > > > > > > > > to
> > > > > > > > > > > > netfs_subreq_terminated() which conveys either how many bytes
> > > > > > > > > > > > were
> > > > > > > > > > > > read
> > > > > > > > > > > > successfully, or an error.  Note that issue_read() is called with
> > > > > > > > > > > > a
> > > > > > > > > > > > structure, netfs_io_subrequest, which defines the IO requested,
> > > > > > > > > > > > and
> > > > > > > > > > > > contains a start and a length (both in bytes), and assumes the
> > > > > > > > > > > > underlying
> > > > > > > > > > > > > netfs will return either an error on the whole region, or the
> > > > > > > > > > > > number
> > > > > > > > > > > > of bytes successfully read.
> > > > > > > > > > > > 
> > > > > > > > > > > > The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > > > > > > > > defined
> > > > > > > > > > > > in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > > > > > > > > to
> > > > > > > > > > > > know how many RPCs will be sent and how the pages will be broken
> > > > > > > > > > > > up
> > > > > > > > > > > > into underlying RPCs, each of which will have their own
> > > > > > > > > > > > completion
> > > > > > > > > > > > and
> > > > > > > > > > > > return code.  In contrast, netfs is subrequest based, a single
> > > > > > > > > > > > subrequest may contain multiple pages, and a single subrequest is
> > > > > > > > > > > > initiated with issue_read() and terminated with
> > > > > > > > > > > > netfs_subreq_terminated().
> > > > > > > > > > > > > Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > > > > > > > > the netfs API requirement on the single response to the whole
> > > > > > > > > > > > subrequest, while also minimizing disruptive changes to the NFS
> > > > > > > > > > > > pgio layer.
> > > > > > > > > > > > 
> > > > > > > > > > > > The approach taken with this patch is to allocate a small
> > > > > > > > > > > > structure
> > > > > > > > > > > > for each nfs_netfs_issue_read() call, store the final error and
> > > > > > > > > > > > number
> > > > > > > > > > > > of bytes successfully transferred in the structure, and update
> > > > > > > > > > > > these
> > > > > > > > > > > > values
> > > > > > > > > > > > as each RPC completes.  The refcount on the structure is used as
> > > > > > > > > > > > a
> > > > > > > > > > > > marker
> > > > > > > > > > > > for the last RPC completion, is incremented in
> > > > > > > > > > > > nfs_netfs_read_initiate(),
> > > > > > > > > > > > and decremented inside nfs_netfs_read_completion(), when a
> > > > > > > > > > > > nfs_pgio_header
> > > > > > > > > > > > contains a valid pointer to the data.  On the final put (which
> > > > > > > > > > > > signals
> > > > > > > > > > > > the final outstanding RPC is complete) in
> > > > > > > > > > > > nfs_netfs_read_completion(),
> > > > > > > > > > > > call netfs_subreq_terminated() with either the final error value
> > > > > > > > > > > > (if
> > > > > > > > > > > > one or more READs complete with an error) or the number of bytes
> > > > > > > > > > > > successfully transferred (if all RPCs complete successfully).
> > > > > > > > > > > > Note
> > > > > > > > > > > > that when all RPCs complete successfully, the number of bytes
> > > > > > > > > > > > transferred
> > > > > > > > > > > > is capped to the length of the subrequest.  Capping the
> > > > > > > > > > > > transferred
> > > > > > > > > > > > length
> > > > > > > > > > > > to the subrequest length prevents "Subreq overread" warnings from
> > > > > > > > > > > > netfs.
> > > > > > > > > > > > This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > > > > > > > > the
> > > > > > > > > > > > corner case where NFS requests a full page at the end of the
> > > > > > > > > > > > file,
> > > > > > > > > > > > even when i_size reflects only a partial page (NFS overread).
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > > > > > > > > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > This is not doing what I asked for, which was to separate out the
> > > > > > > > > > > fscache functionality, so that we can call that if and when it is
> > > > > > > > > > > available.
> > > > > > > > > > > 
> > > > > > > > > > I must have misunderstood then.
> > > > > > > > > > 
> > > > > > > > > > The last feedback I have from you was that you wanted it to be
> > > > > > > > > > an opt-in feature, and it was a comment on a previous patch
> > > > > > > > > > to Kconfig.  I was proceeding the best I knew how, but
> > > > > > > > > > let me try to get back on track.
> > > > > > > > > > 
> > > > > > > > > > > Instead, it is just wrapping the NFS requests inside netfs
> > > > > > > > > > > requests. As
> > > > > > > > > > > it stands, that means it is just duplicating information, and
> > > > > > > > > > > adding
> > > > > > > > > > > unnecessary overhead to the standard I/O path (extra allocations,
> > > > > > > > > > > extra
> > > > > > > > > > > indirect calls, and extra bloat to the inode).
> > > > > > > > > > > 
> > > > > > > > > > I think I understand what you're saying but I'm not sure.  Let me
> > > > > > > > > > ask some clarifying questions.
> > > > > > > > > > 
> > > > > > > > > > Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > > > > > > > configured?  Or when it is not?  Or both?  I think you're objecting
> > > > > > > > > > when it's configured, but not enabled (we mount without 'fsc').
> > > > > > > > > > Am I right?
> > > > > > > > > > 
> > > > > > > > > > Also, are you objecting to the design that to use fscache we now
> > > > > > > > > > have to use netfs, specifically:
> > > > > > > > > > - call into netfs via either netfs_read_folio or netfs_readahead
> > > > > > > > > > - if fscache is enabled, then the IO can be satisfied from fscache
> > > > > > > > > > - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > > > > > > > from the cache, then NFS is called back via netfs_issue_read
> > > > > > > > > > and we use the normal NFS read pageio interface.  This requires
> > > > > > > > > > we call netfs_subreq_terminated() when all the RPCs complete,
> > > > > > > > > > which is the reason for the small changes to pagelist.c
> > > > > > > > > 
> > > > > > > > > I'm objecting to any middle layer "solution" that adds overhead to the
> > > > > > > > > NFS I/O paths.
> > > > > > > > > 
> > > > > > > > Got it.
> > > > > > > > 
> > > > > > > > > I'm willing to consider solutions that are specific only to the fscache
> > > > > > > > > use case (i.e. when the 'fsc' mount option is specified). However when
> > > > > > > > > I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > > > > > > extra memory allocations, extra indirect calls and larger inode
> > > > > > > > > footprints.
> > > > > > > > > 
> > > > > > > > > IOW: I want the code to optimise for the case of standard NFS, not for
> > > > > > > > > the case of 'NFS with cachefs additions'.
> > > > > > > > > 
> > > > > > > > I agree completely.  Are you seeing extra memory allocations
> > > > > > > > happen on mounts without 'fsc' or is it more a concern about how
> > > > > > > > some of the patches look?  We should not be calling any netfs or
> > > > > > > > fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > > > > > testing. So either there's a misunderstanding here, or there's a
> > > > > > > > bug I'm missing.
> > > > > > > > 
> > > > > > > > If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > > > > nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > > > > > If it's configured but not enabled, then the checks for
> > > > > > > > netfs_inode(inode)->cache should skip over any netfs code.
> > > > > > > > But maybe there's a non-obvious bug you're seeing and
> > > > > > > > somehow netfs is still getting called?  Because I cannot
> > > > > > > > see netfs getting called if 'fsc' is not on the mount in my
> > > > > > > > tests.
> > > > > > > > 
> > > > > > > > int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > > > > > {
> > > > > > > >       if (!netfs_inode(folio_inode(folio))->cache)
> > > > > > > >               return -ENOBUFS;
> > > > > > > > 
> > > > > > > >       return netfs_read_folio(file, folio);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > > > > > {
> > > > > > > >       struct inode *inode = ractl->mapping->host;
> > > > > > > > 
> > > > > > > >       if (!netfs_inode(inode)->cache)
> > > > > > > >               return -ENOBUFS;
> > > > > > > > 
> > > > > > > >       netfs_readahead(ractl);
> > > > > > > >       return 0;
> > > > > > > > }
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Can you be more specific as to the portions of the patch you don't
> > > > > > > > > > like
> > > > > > > > > > so I can move it in the right direction?
> > > > > > > > > > 
> > > > > > > > > > This is from patch #2 which you didn't comment on.  I'm not sure
> > > > > > > > > > you're
> > > > > > > > > > ok with it though, since you mention "extra bloat to the inode".
> > > > > > > > > > Do you object to this even though it's wrapped in an
> > > > > > > > > > #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > > > > > > > extra size be added to nfs_inode?
> > > > > > > > > > 
> > > > > > > > > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > > > >       __u64 write_io;
> > > > > > > > > >       __u64 read_io;
> > > > > > > > > > #ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > > -       struct fscache_cookie   *fscache;
> > > > > > > > > > -#endif
> > > > > > > > > > +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > > > > > > > */
> > > > > > > > > > +#else
> > > > > > > > > >       struct inode            vfs_inode;
> > > > > > > > > > +#endif
> > > > > > > > > > +
> > > > > > > > > 
> > > > > > > > > Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > > > > > > point, however for now NFS is not unconditionally opting into the netfs
> > > > > > > > > project. If we're to ever do that, then I want to see streamlined code
> > > > > > > > > for the standard I/O case.
> > > > > > > > > 
> > > > > > > > Ok and understood about standard I/O case.
> > > > > > > > 
> > > > > > > > I was thinking how we might not increase the size, but I don't think
> > > > > > > > I can make it work.
> > > > > > > > 
> > > > > > > > I thought we could change to something like the below, without an
> > > > > > > > embedded struct inode:
> > > > > > > > 
> > > > > > > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > >       __u64 write_io;
> > > > > > > >       __u64 read_io;
> > > > > > > > #ifdef CONFIG_NFS_FSCACHE
> > > > > > > > -       struct fscache_cookie   *fscache;
> > > > > > > > -#endif
> > > > > > > > +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > > > > > +#else
> > > > > > > >       struct inode            vfs_inode;
> > > > > > > > +#endif
> > > > > > > > +
> > > > > > > > 
> > > > > > > > Then I would need to alloc/free a netfs_inode at the time of
> > > > > > > > nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > > > > > macro cannot work, because it requires an embedded "struct inode"
> > > > > > > > due to "container_of" use:
> > > > > > > > 
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > > +{
> > > > > > > > +       return &nfsi->netfs.inode;
> > > > > > > > +}
> > > > > > > > +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > > +{
> > > > > > > > +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > > > > > +}
> > > > > > > > +#else
> > > > > > > > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > > +{
> > > > > > > > +       return &nfsi->vfs_inode;
> > > > > > > > +}
> > > > > > > > static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > > {
> > > > > > > >       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > > > > > }
> > > > > > > > +#endif
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > > > patch below.  What do you think?
> > > > > > 
> > > > > > That works for me.
> > > > > > 
> > > > > > > 
> > > > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > > > think it's an ok idea I can try to work out what is needed across
> > > > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > > > case for NFS where fscache is "configured but not enabled",
> > > > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > > > each time, it will add up so it is worth at least a discussion.
> > > > > > > 
> > > > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > > > --- a/include/linux/netfs.h
> > > > > > > +++ b/include/linux/netfs.h
> > > > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > > > >                                     bool was_async);
> > > > > > > 
> > > > > > > -/*
> > > > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > > > - */
> > > > > > > -struct netfs_inode {
> > > > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > > > +struct netfs_info {
> > > > > > >       const struct netfs_request_ops *ops;
> > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > >       struct fscache_cookie   *cache;
> > > > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > > > };
> > > > > > > 
> > > > > > > +/*
> > > > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > > > + */
> > > > > > > +struct netfs_inode {
> > > > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > > > +};
> > > > > > > +
> > > > > > > /*
> > > > > > > * Resources required to do operations on a cache.
> > > > > > > */
> > > > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > > > *netfs_inode(struct inode *inode)
> > > > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > > > >                                   const struct netfs_request_ops *ops)
> > > > > > > {
> > > > > > > -       ctx->ops = ops;
> > > > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > > > +       /* FIXME: Check for NULL */
> > > > > > > +       ctx->netfs->ops = ops;
> > > > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > > -       ctx->cache = NULL;
> > > > > > > +       ctx->netfs->cache = NULL;
> > > > > > > #endif
> > > > > > > }
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Are you ok with the stub functions which are placed in fscache.h, and
> > > > > > > > > > when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > > > > > > > or a 1-liner (nfs_netfs_readpage_release)?
> > > > > > > > > > 
> > > > > > > > > > #else /* CONFIG_NFS_FSCACHE */
> > > > > > > > > > +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > > > > > > > +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > > > > > > > *hdr) {}
> > > > > > > > > > +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > > > > > > > *hdr) {}
> > > > > > > > > > +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > > > > +{
> > > > > > > > > > +       unlock_page(req->wb_page);
> > > > > > > > > > +}
> > > > > > > > > > static inline void nfs_fscache_release_super_cookie(struct
> > > > > > > > > > super_block *sb) {}
> > > > > > > > > > static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Do you object to the below?  If so, then do you want
> > > > > > > > > > #ifdef CONFIG_NFS_FSCACHE here?
> > > > > > > > > > 
> > > > > > > > > > -- a/fs/nfs/inode.c
> > > > > > > > > > +++ b/fs/nfs/inode.c
> > > > > > > > > > @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > > > > > > > super_block *sb)
> > > > > > > > > > #ifdef CONFIG_NFS_V4_2
> > > > > > > > > >       nfsi->xattr_cache = NULL;
> > > > > > > > > > #endif
> > > > > > > > > > +       nfs_netfs_inode_init(nfsi);
> > > > > > > > > > +
> > > > > > > > > >       return VFS_I(nfsi);
> > > > > > > > > > }
> > > > > > > > > > EXPORT_SYMBOL_GPL(nfs_alloc_inode);
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > > > > > > > how about the below calls to netfs from nfs_read_folio and
> > > > > > > > > > nfs_readahead into equivalent netfs calls?  So when
> > > > > > > > > > CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > > > > > > > ('fsc' not on mount), these netfs functions do immediately call
> > > > > > > > > > netfs_alloc_request().  But I wonder if we could simply add a
> > > > > > > > > > check to see if fscache is enabled on the mount, and skip
> > > > > > > > > > over to satisfy what you want.  Am I understanding what you
> > > > > > > > > > want?
> > > > > > > > > 
> > > > > > > > > Quite frankly, I'd prefer that we just split out the functionality that
> > > > > > > > > is needed from the netfs code so that it can be optimised. However I'm
> > > > > > > > > not interested enough in the cachefs functionality to work on that
> > > > > > > > > myself. ...and as I indicated above, I might be OK with opting into the
> > > > > > > > > netfs project, once the overhead can be made to disappear.
> > > > > > > > > 
> > > > > > > > Understood.
> > > > > > > > 
> > > > > > > > If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > > > > > functions into a netfs.c file as a starting point.  Or that can maybe
> > > > > > > > be done in a future patchset?
> > > > > > > > 
> > > > > > > > For now I was equating netfs and fscache together so we can
> > > > > > > > move on from the much older and single-page limiting fscache
> > > > > > > > interface that is likely to go away soon.
> > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > > > > > > > folio *folio)
> > > > > > > > > >       if (NFS_STALE(inode))
> > > > > > > > > >               goto out_unlock;
> > > > > > > > > > 
> > > > > > > > > > +       ret = nfs_netfs_read_folio(file, folio);
> > > > > > > > > > +       if (!ret)
> > > > > > > > > > +               goto out;
> > > > > > > > > > +
> > > > > > > > > > 
> > > > > > > > > > @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > > > > > > > *ractl)
> > > > > > > > > >       if (NFS_STALE(inode))
> > > > > > > > > >               goto out;
> > > > > > > > > > 
> > > > > > > > > > +       ret = nfs_netfs_readahead(ractl);
> > > > > > > > > > +       if (!ret)
> > > > > > > > > > +               goto out;
> > > > > > > > > > +
> > > > > > > > > > 
> > > > > > > > The above wrappers should prevent any additional overhead when fscache
> > > > > > > > is not enabled.  As far as I know these work to avoid calling netfs
> > > > > > > > when 'fsc' is not on the mount.
> > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > And how about these calls from different points in the read
> > > > > > > > > > path to the earlier mentioned stub functions?
> > > > > > > > > > 
> > > > > > > > > > @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > > > > > > > 
> > > > > > > > > > static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > > > > {
> > > > > > > > > > -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > > > >       struct page *page = req->wb_page;
> > > > > > > > > > 
> > > > > > > > > > -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > > > > > > > > s_id,
> > > > > > > > > > -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > > > > > > > -               (long long)req_offset(req));
> > > > > > > > > > -
> > > > > > > > > >       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > > > > > > > ETIMEDOUT)
> > > > > > > > > >               SetPageError(page);
> > > > > > > > > > -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > > > > > > > -               if (PageUptodate(page))
> > > > > > > > > > -                       nfs_fscache_write_page(inode, page);
> > > > > > > > > > -               unlock_page(page);
> > > > > > > > > > -       }
> > > > > > > > > > +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > > > > +               nfs_netfs_readpage_release(req);
> > > > > > > > > > +
> > > > > > > > > 
> > > > > > > > > I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > > > > > > going to need to change when we move it to use folios natively anyway.
> > > > > > > > > 
> > > > > > > > Ok, how about I make it conditional on whether fscache is configured
> > > > > > > > and enabled then, similar to the nfs_netfs_read_folio() and
> > > > > > > > nfs_netfs_readahead()?  Below is what that would look like.
> > > > > > > > I could inline the code in nfs_netfs_readpage_release() if you
> > > > > > > > think it would be clearer.
> > > > > > > > 
> > > > > > > > static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > > {
> > > > > > > >       struct page *page = req->wb_page;
> > > > > > > > 
> > > > > > > >       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > > > > >               SetPageError(page);
> > > > > > > >       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > > #ifndef CONFIG_NFS_FSCACHE
> > > > > > > >               unlock_page(req->wb_page);
> > > > > > > > #else
> > > > > > > >               nfs_netfs_readpage_release(req);
> > > > > > > > #endif
> > > > > > > >       nfs_release_request(req);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > 
> > > > > > > > void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > > {
> > > > > > > >   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > > 
> > > > > > > >   /*
> > > > > > > >    * If fscache is enabled, netfs will unlock pages.
> > > > > > > >    */
> > > > > > > >   if (netfs_inode(inode)->cache)
> > > > > > > >       return;
> > > > > > > > 
> > > > > > > >   unlock_page(req->wb_page);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > >       nfs_release_request(req);
> > > > > > > > > > }
> > > > > > > > > > 
> > > > > > > > > > @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > > > > > > > nfs_pgio_header *hdr)
> > > > > > > > > >               nfs_list_remove_request(req);
> > > > > > > > > >               nfs_readpage_release(req, error);
> > > > > > > > > >       }
> > > > > > > > > > +       nfs_netfs_read_completion(hdr);
> > > > > > > > > > +
> > > > > > > > > > out:
> > > > > > > > > >       hdr->release(hdr);
> > > > > > > > > > }
> > > > > > > > > > @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > > > > > > > nfs_pgio_header *hdr,
> > > > > > > > > >                             struct rpc_task_setup *task_setup_data,
> > > > > > > > > > int how)
> > > > > > > > > > {
> > > > > > > > > >       rpc_ops->read_setup(hdr, msg);
> > > > > > > > > > +       nfs_netfs_initiate_read(hdr);
> > > > > > > > > >       trace_nfs_initiate_read(hdr);
> > > > > > > > > > }
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Are you ok with these additions?  Something like this would
> > > > > > > > > > be required in the case of fscache configured and enabled,
> > > > > > > > > > because we could have some of the data in a read in
> > > > > > > > > > fscache, and some not.  That is the reason for the netfs
> > > > > > > > > > design, and why we need to be able to call the normal
> > > > > > > > > > NFS read IO path (netfs calls into issue_read, and we call
> > > > > > > > > > back via netfs_subreq_terminated)?
> > > > > > > > > > 
> > > > > > > > > > @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > > > > > > >       struct pnfs_layout_segment *pg_lseg;
> > > > > > > > > >       struct nfs_io_completion *pg_io_completion;
> > > > > > > > > >       struct nfs_direct_req   *pg_dreq;
> > > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > > +       void                    *pg_netfs;
> > > > > > > > > > +#endif
> > > > > > > > > > 
> > > > > > > > > > @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > > > > > > >       const struct nfs_rw_ops *rw_ops;
> > > > > > > > > >       struct nfs_io_completion *io_completion;
> > > > > > > > > >       struct nfs_direct_req   *dreq;
> > > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > > +       void                    *netfs;
> > > > > > > > > > +#endif
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > And these additions to pagelist.c?
> > > > > > > > > > 
> > > > > > > > > > @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > > > > > > > nfs_pageio_descriptor *desc,
> > > > > > > > > >       hdr->good_bytes = mirror->pg_count;
> > > > > > > > > >       hdr->io_completion = desc->pg_io_completion;
> > > > > > > > > >       hdr->dreq = desc->pg_dreq;
> > > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > > +       if (desc->pg_netfs)
> > > > > > > > > > +               hdr->netfs = desc->pg_netfs;
> > > > > > > > > > +#endif
> > > > > > > > > 
> > > > > > > > > Why the conditional?
> > > > > > > > > 
> > > > > > > > Not really needed and I was thinking of removing it, so I'll do that.
> > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > > > > > > > *desc,
> > > > > > > > > >       desc->pg_lseg = NULL;
> > > > > > > > > >       desc->pg_io_completion = NULL;
> > > > > > > > > >       desc->pg_dreq = NULL;
> > > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > > +       desc->pg_netfs = NULL;
> > > > > > > > > > +#endif
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > > > > > > > nfs_pageio_descriptor *desc,
> > > > > > > > > > 
> > > > > > > > > >       desc->pg_io_completion = hdr->io_completion;
> > > > > > > > > >       desc->pg_dreq = hdr->dreq;
> > > > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > > +       desc->pg_netfs = hdr->netfs;
> > > > > > > > > > +#endif
> > > > > > > > > 
> > > > > > > > > Those all need wrapper functions instead of embedding #ifdefs.
> > > > > > > > > 
> > > > > > > > Ok.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > > My expectation is that the standard I/O path should have minimal
> > > > > > > > > > > overhead, and should certainly not increase the overhead that we
> > > > > > > > > > > already have. Will this be addressed in future iterations of these
> > > > > > > > > > > patches?
> > > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > I will do what I can to satisfy what you want, either by fixing up
> > > > > > > > > > this patch or follow-on patches.  Hopefully the above questions
> > > > > > > > > > will clarify the next steps.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > --
> > > > > > > > > Trond Myklebust
> > > > > > > > > Linux NFS client maintainer, Hammerspace
> > > > > > > > > trond.myklebust@hammerspace.com
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > Trond Myklebust
> > > > > > CTO, Hammerspace Inc
> > > > > > 1900 S Norfolk St, Suite 350 - #45
> > > > > > San Mateo, CA 94403
> > > > > > 
> > > > > > www.hammer.space
> > > > > > 
> > > > > > 
> > > > > 
> > > > 
> > 
> > --
> > Jeff Layton <jlayton@poochiereds.net>
Benjamin Maynard Nov. 14, 2022, 4:03 p.m. UTC | #15
Hi Dave,

I've added responses to your questions inline below.

I also tried adding the noatime option to the mount on the source
filer as Jeff suggested, but this has not made any difference and the
issue is still persisting for me.

I created the following diagram that explains my setup, and the exact
tests I am performing:
https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.

Hopefully this is clearer than my explanations below (let me know if
you'd prefer me to share an alternative way).

In order to remove the re-exporting layer of complexity, I also
performed the tests without the re-export server (architecture:
https://drive.google.com/file/d/1DQKhqo_UnQ8ul-z5Iram5LpisDmkKziQ/view?usp=share_link):

Source NFS Server <-- Client (with FS-Cache)

The same is happening, I cannot get FS-Cache to serve from cache.
Heavy writes, but no reads, even when the same file is copied many
times.

Hopefully it is something I am doing wrong on my end, but I can't figure out what.

Kind Regards
Benjamin Maynard


On Mon, 14 Nov 2022 at 13:47, David Wysochanski <dwysocha@redhat.com> wrote:
>
> I apologize I did not read carefully enough and I missed some details
> in your original post.
> More below.
>
> On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> >
> > Hi all,
> >
> > I've been doing some more testing with these patches, I applied all of
> > the patches (v10 from
> > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> >
> > I have the following setup:
> >
> > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> >
> > I have a 500Gb file on the Source NFS Server, which I am then copying
> > to the NFS Client via the Re-Export Server.
> >
> > On the first copy, I see heavy writes to /var/cache/fscache on the
> > re-export server, and once the file copy completes I see that
> > /var/cache/fscache is approximately 500Gb in size. All good so far.
> >
> > I then deleted that file from the NFS Client, and dropped the caches
> > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> >
> If you delete the file from the NFS client, how does that not delete the
> file from the original NFS server?

Sorry - to be clear, I never deleted the file from the NFS mount
(which I know would in turn delete it from the re-export server and
the source filer).

In order to perform the performance test, I copied the file from the
NFS mount on the NFS Client, to a local directory (cp
/mnt/nfs/500gb.img /tmp).

When I said "I then deleted that file from the NFS Client", I meant I
deleted the local copy of that file. Not the file on the mount (rm
/tmp/500gb.img).

Just to also stress, I have never dropped the caches on the Re-Export
Server (the one with FS-Cache) at any point in any of these tests, so
I don't think this is the problem. I have only ever dropped the caches
on the NFS client that is mounting the Re-Export Server.

> > I then performed another copy of the 500Gb file on the NFS Client,
> > again via the Re-Export Server. What I expected would happen is that I
> > would see heavy reads from the /var/cache/fscache volume as the file
> > should be served from FS-Cache.
> >
> I don't understand this.  When you say you "performed another copy"
> of what file?  Wasn't the file deleted in the above step?

As above, only the local copy was deleted.

> > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > be ignored and the file is pulled from the Source NFS Filer again. I
> > also see heavy writes to /var/cache/fscache, so it appears that
> > FS-Cache is overwriting its existing cache, and never using it.
>
> That would happen if the file was changed or re-created.
>
> > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > it is not possible that the file is being served from the page cache.
> >
> > We saw this behaviour before on an older set of the patches when our
> > mount between the Re-Export Server and the Source NFS Filer was using
> > the "sync" option, but we are now using the "async" option and the
> > same is happening.
> >
> > Mount options:
> >
> > Source NFS Server <-- Re-Export Server (with FS-Cache):
> >
> > 10.0.0.49:/files /srv/nfs/files nfs
> > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> >
> > Re-Export Server (with FS-Cache) <-- NFS Client:
> >
> > 10.0.0.3:/files /mnt/nfs nfs
> > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> >
> > It is also worth noting this behaviour is not unique to the re-export
> > use case. I see FS-Cache not being used with the following setup:
> >
> > Source NFS Server <-- Client (with FS-Cache).
> >
>
> This points at something more fundamental like something missed
> in the test or maybe a mount option.  Can you explain what test
> you're doing here when you say "this behavior is not unique"?

I've created the following diagram which explains the test I am
performing. I think it is a little easier to follow than explaining in
text. This should be viewable without any authentication:
https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.

By "this behaviour is not unique to the re-export use case" I mean
that the same happens if I remove the re-export server completely, and
just have the following setup:

Source NFS Server <-- Client (with FS-Cache).

> Can you show the mount options for both:
> - fscache filesystem on the re-export server (/var/cache/fscache)

root@reexport:~$ mount | grep /var/cache/fscache
/dev/md127 on /var/cache/fscache type ext4
(rw,relatime,discard,nobarrier,stripe=1024)

> - exported filesystem on the NFS server (filesystem in /etc/exports)

I have tried both:

root@source:~$ mount | grep files
/dev/sdb1 on /files type ext4 (rw)

root@source:~$ cat /etc/exports
/files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)

and (at Jeff's suggestion):

root@source:~$ mount | grep files
/dev/sdb1 on /files type ext4 (rw,noatime)

root@source:~$ cat /etc/exports
/files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)


> Unfortunately the problem with drop_caches makes it more difficult
> to know when fscache is truly working.  But some other unit test
> I have shows fscache does work with this patchset so I'm puzzled why
> you're not seeing it work at all.
>
> I pinged dhowells on the drop_caches issue so maybe we can get
> that one sorted out soon but I'm not sure since it's part of a series
> and proposes changes in mm.

Just to be clear, I have never used drop_caches on the re-export
server in any of these tests. I have only ever done this on the NFS
Client.

>
> > Thanks,
> > Ben
> >
> >
> > Kind Regards
> >
> > Benjamin Maynard
> >
> > Customer Engineer
> >
> > benmaynard@google.com
> >
> > Google, Inc.
> >
> >
> >
> >
> > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > >
> > >
> > >
> > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > >
> > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > >>
> > > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > >>>
> > > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > >>>> wrote:
> > > >>>>>
> > > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > > >>>>>> APIs,
> > > >>>>>> but only when fscache is configured and enabled.
> > > >>>>>>
> > > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > > >>>>>> filled
> > > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > > >>>>>> of
> > > >>>>>> the functions, the main one being the issue_read() function.
> > > >>>>>> The issue_read() function is called by the netfs layer when a
> > > >>>>>> read
> > > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > > >>>>>> (either
> > > >>>>>> the cache is not active, or it is active but the data is not
> > > >>>>>> available).
> > > >>>>>> Once the read from the server is complete, netfs requires a call
> > > >>>>>> to
> > > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > > >>>>>> were
> > > >>>>>> read
> > > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > > >>>>>> a
> > > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > > >>>>>> and
> > > >>>>>> contains a start and a length (both in bytes), and assumes the
> > > >>>>>> underlying
> > > > >>>>>> netfs will return either an error on the whole region, or the
> > > >>>>>> number
> > > >>>>>> of bytes successfully read.
> > > >>>>>>
> > > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > > >>>>>> defined
> > > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > >>>>>> to
> > > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > > >>>>>> up
> > > >>>>>> into underlying RPCs, each of which will have their own
> > > >>>>>> completion
> > > >>>>>> and
> > > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > > >>>>>> initiated with issue_read() and terminated with
> > > >>>>>> netfs_subreq_terminated().
> > > > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > >>>>>> the netfs API requirement on the single response to the whole
> > > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > > >>>>>> pgio layer.
> > > >>>>>>
> > > >>>>>> The approach taken with this patch is to allocate a small
> > > >>>>>> structure
> > > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > > >>>>>> number
> > > >>>>>> of bytes successfully transferred in the structure, and update
> > > >>>>>> these
> > > >>>>>> values
> > > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > > >>>>>> a
> > > >>>>>> marker
> > > >>>>>> for the last RPC completion, is incremented in
> > > >>>>>> nfs_netfs_read_initiate(),
> > > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > > >>>>>> nfs_pgio_header
> > > >>>>>> contains a valid pointer to the data.  On the final put (which
> > > >>>>>> signals
> > > >>>>>> the final outstanding RPC is complete) in
> > > >>>>>> nfs_netfs_read_completion(),
> > > >>>>>> call netfs_subreq_terminated() with either the final error value
> > > >>>>>> (if
> > > >>>>>> one or more READs complete with an error) or the number of bytes
> > > >>>>>> successfully transferred (if all RPCs complete successfully).
> > > >>>>>> Note
> > > >>>>>> that when all RPCs complete successfully, the number of bytes
> > > >>>>>> transferred
> > > >>>>>> is capped to the length of the subrequest.  Capping the
> > > >>>>>> transferred
> > > >>>>>> length
> > > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > > >>>>>> netfs.
> > > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > >>>>>> the
> > > >>>>>> corner case where NFS requests a full page at the end of the
> > > >>>>>> file,
> > > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > > >>>>>>
> > > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > >>>>>
> > > >>>>>
> > > >>>>> This is not doing what I asked for, which was to separate out the
> > > >>>>> fscache functionality, so that we can call that if and when it is
> > > >>>>> available.
> > > >>>>>
> > > >>>> I must have misunderstood then.
> > > >>>>
> > > >>>> The last feedback I have from you was that you wanted it to be
> > > >>>> an opt-in feature, and it was a comment on a previous patch
> > > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > > >>>> let me try to get back on track.
> > > >>>>
> > > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > > >>>>> requests. As
> > > >>>>> it stands, that means it is just duplicating information, and
> > > >>>>> adding
> > > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > > >>>>> extra
> > > >>>>> indirect calls, and extra bloat to the inode).
> > > >>>>>
> > > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > > >>>> ask some clarifying questions.
> > > >>>>
> > > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > > >>>> Am I right?
> > > >>>>
> > > > >>>> Also, are you objecting to the design that to use fscache we now
> > > >>>> have to use netfs, specifically:
> > > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > > >>>> from the cache, then NFS is called back via netfs_issue_read
> > > >>>> and we use the normal NFS read pageio interface.  This requires
> > > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > > >>>> which is the reason for the small changes to pagelist.c
> > > >>>
> > > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > > >>> NFS I/O paths.
> > > >>>
> > > >> Got it.
> > > >>
> > > >>> I'm willing to consider solutions that are specific only to the fscache
> > > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > > >>> extra memory allocations, extra indirect calls and larger inode
> > > >>> footprints.
> > > >>>
> > > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > > >>> the case of 'NFS with cachefs additions'.
> > > >>>
> > > >> I agree completely.  Are you seeing extra memory allocations
> > > > >> happen on mounts without 'fsc' or is it more a concern about how
> > > >> some of the patches look?  We should not be calling any netfs or
> > > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > > >> testing. So either there's a misunderstanding here, or there's a
> > > >> bug I'm missing.
> > > >>
> > > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > > > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > >> If it's configured but not enabled, then the checks for
> > > >> netfs_inode(inode)->cache should skip over any netfs code.
> > > >> But maybe there's a non-obvious bug you're seeing and
> > > >> somehow netfs is still getting called?  Because I cannot
> > > >> see netfs getting called if 'fsc' is not on the mount in my
> > > >> tests.
> > > >>
> > > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > >> {
> > > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > > >>               return -ENOBUFS;
> > > >>
> > > >>       return netfs_read_folio(file, folio);
> > > >> }
> > > >>
> > > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > > >> {
> > > >>       struct inode *inode = ractl->mapping->host;
> > > >>
> > > >>       if (!netfs_inode(inode)->cache)
> > > >>               return -ENOBUFS;
> > > >>
> > > >>       netfs_readahead(ractl);
> > > >>       return 0;
> > > >> }
> > > >>
> > > >>
> > > >>>>
> > > >>>> Can you be more specific as to the portions of the patch you don't
> > > >>>> like
> > > >>>> so I can move it in the right direction?
> > > >>>>
> > > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > > >>>> you're
> > > >>>> ok with it though, since you mention "extra bloat to the inode".
> > > >>>> Do you object to this even though it's wrapped in an
> > > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > >>>> extra size be added to nfs_inode?
> > > >>>>
> > > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > >>>>       __u64 write_io;
> > > >>>>       __u64 read_io;
> > > >>>> #ifdef CONFIG_NFS_FSCACHE
> > > >>>> -       struct fscache_cookie   *fscache;
> > > >>>> -#endif
> > > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > >>>> */
> > > >>>> +#else
> > > >>>>       struct inode            vfs_inode;
> > > >>>> +#endif
> > > >>>> +
> > > >>>
> > > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > >>> point, however for now NFS is not unconditionally opting into the netfs
> > > >>> project. If we're to ever do that, then I want to see streamlined code
> > > >>> for the standard I/O case.
> > > >>>
> > > >> Ok and understood about standard I/O case.
> > > >>
> > > >> I was thinking how we might not increase the size, but I don't think
> > > >> I can make it work.
> > > >>
> > > >> I thought we could change to something like the below, without an
> > > >> embedded struct inode:
> > > >>
> > > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > >>       __u64 write_io;
> > > >>       __u64 read_io;
> > > >> #ifdef CONFIG_NFS_FSCACHE
> > > >> -       struct fscache_cookie   *fscache;
> > > >> -#endif
> > > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > >> +#else
> > > >>       struct inode            vfs_inode;
> > > >> +#endif
> > > >> +
> > > >>
> > > >> Then I would need to alloc/free a netfs_inode at the time of
> > > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > >> macro cannot work, because it requires an embedded "struct inode"
> > > >> due to "container_of" use:
> > > >>
> > > >> +#ifdef CONFIG_NFS_FSCACHE
> > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > >> +{
> > > >> +       return &nfsi->netfs.inode;
> > > >> +}
> > > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > >> +{
> > > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > >> +}
> > > >> +#else
> > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > >> +{
> > > >> +       return &nfsi->vfs_inode;
> > > >> +}
> > > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > >> {
> > > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > > >> }
> > > >> +#endif
> > > >>
> > > >>
> > > >
> > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > patch below.  What do you think?
> > >
> > > That works for me.
> > >
> > > >
> > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > think it's an ok idea I can try to work out what is needed across
> > > > the tree.  I thought about it more and I kinda agree that in the
> > > > case for NFS where fscache is "configured but not enabled",
> > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > each time, it will add up so it is worth at least a discussion.
> > > >
> > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > index f2402ddeafbf..195714f1c355 100644
> > > > --- a/include/linux/netfs.h
> > > > +++ b/include/linux/netfs.h
> > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > >                                     bool was_async);
> > > >
> > > > -/*
> > > > - * Per-inode context.  This wraps the VFS inode.
> > > > - */
> > > > -struct netfs_inode {
> > > > -       struct inode            inode;          /* The VFS inode */
> > > > +struct netfs_info {
> > > >       const struct netfs_request_ops *ops;
> > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > >       struct fscache_cookie   *cache;
> > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > };
> > > >
> > > > +/*
> > > > + * Per-inode context.  This wraps the VFS inode.
> > > > + */
> > > > +struct netfs_inode {
> > > > +       struct inode            inode;          /* The VFS inode */
> > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > +};
> > > > +
> > > > /*
> > > > * Resources required to do operations on a cache.
> > > > */
> > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > *netfs_inode(struct inode *inode)
> > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > >                                   const struct netfs_request_ops *ops)
> > > > {
> > > > -       ctx->ops = ops;
> > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > +       /* FIXME: Check for NULL */
> > > > +       ctx->netfs->ops = ops;
> > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > -       ctx->cache = NULL;
> > > > +       ctx->netfs->cache = NULL;
> > > > #endif
> > > > }
> > > >
> > > >
> > > >
> > > >>
> > > >>>>
> > > >>>>
> > > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > > >>>>
> > > >>>> #else /* CONFIG_NFS_FSCACHE */
> > > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > >>>> *hdr) {}
> > > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > >>>> *hdr) {}
> > > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > >>>> +{
> > > >>>> +       unlock_page(req->wb_page);
> > > >>>> +}
> > > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > > >>>> super_block *sb) {}
> > > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > >>>>
> > > >>>>
> > > >>>> Do you object to the below?  If so, then do you want
> > > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > > >>>>
> > > >>>> -- a/fs/nfs/inode.c
> > > >>>> +++ b/fs/nfs/inode.c
> > > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > >>>> super_block *sb)
> > > >>>> #ifdef CONFIG_NFS_V4_2
> > > >>>>       nfsi->xattr_cache = NULL;
> > > >>>> #endif
> > > >>>> +       nfs_netfs_inode_init(nfsi);
> > > >>>> +
> > > >>>>       return VFS_I(nfsi);
> > > >>>> }
> > > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_i
> > > >>>> node);
> > > >>>>
> > > >>>>
> > > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > >>>> how about the below calls to netfs from nfs_read_folio and
> > > >>>> nfs_readahead into equivalent netfs calls?  So when
> > > > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > > >>>> check to see if fscache is enabled on the mount, and skip
> > > >>>> over to satisfy what you want.  Am I understanding what you
> > > >>>> want?
> > > >>>
> > > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > > >>> is needed from the netfs code so that it can be optimised. However I'm
> > > >>> not interested enough in the cachefs functionality to work on that
> > > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > > >>> netfs project, once the overhead can be made to disappear.
> > > >>>
> > > >> Understood.
> > > >>
> > > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > > >> be done in a future patchset?
> > > >>
> > > >> For now I was equating netfs and fscache together so we can
> > > >> move on from the much older and single-page limiting fscache
> > > >> interface that is likely to go away soon.
> > > >>
> > > >>>>
> > > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > >>>> folio *folio)
> > > >>>>       if (NFS_STALE(inode))
> > > >>>>               goto out_unlock;
> > > >>>>
> > > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > > >>>> +       if (!ret)
> > > >>>> +               goto out;
> > > >>>> +
> > > >>>>
> > > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > >>>> *ractl)
> > > >>>>       if (NFS_STALE(inode))
> > > >>>>               goto out;
> > > >>>>
> > > >>>> +       ret = nfs_netfs_readahead(ractl);
> > > >>>> +       if (!ret)
> > > >>>> +               goto out;
> > > >>>> +
> > > >>>>
> > > >> The above wrappers should prevent any additional overhead when fscache
> > > >> is not enabled.  As far as I know these work to avoid calling netfs
> > > >> when 'fsc' is not on the mount.
> > > >>
> > > >>>>
> > > >>>> And how about these calls from different points in the read
> > > >>>> path to the earlier mentioned stub functions?
> > > >>>>
> > > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > >>>>
> > > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > >>>> {
> > > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > >>>>       struct page *page = req->wb_page;
> > > >>>>
> > > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > >>>>> s_id,
> > > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > >>>> -               (long long)req_offset(req));
> > > >>>> -
> > > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > > >>>> ETIMEDOUT)
> > > >>>>               SetPageError(page);
> > > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > >>>> -               if (PageUptodate(page))
> > > >>>> -                       nfs_fscache_write_page(inode, page);
> > > >>>> -               unlock_page(page);
> > > >>>> -       }
> > > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > >>>> +               nfs_netfs_readpage_release(req);
> > > >>>> +
> > > >>>
> > > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > >>> going to need to change when we move it to use folios natively anyway.
> > > >>>
> > > >> Ok, how about I make it conditional on whether fscache is configured
> > > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > > >> nfs_netfs_readahead()?  Below is what that would look like.
> > > >> I could inline the code in nfs_netfs_readpage_release() if you
> > > >> think it would be clearer.
> > > >>
> > > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > >> {
> > > >>       struct page *page = req->wb_page;
> > > >>
> > > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > >>               SetPageError(page);
> > > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > >> #ifndef CONFIG_NFS_FSCACHE
> > > >>               unlock_page(req->wb_page);
> > > >> #else
> > > >>               nfs_netfs_readpage_release(req);
> > > >> #endif
> > > >>       nfs_release_request(req);
> > > >> }
> > > >>
> > > >>
> > > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > > >> {
> > > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > >>
> > > >>   /*
> > > >>    * If fscache is enabled, netfs will unlock pages.
> > > >>    */
> > > >>   if (netfs_inode(inode)->cache)
> > > >>       return;
> > > >>
> > > >>   unlock_page(req->wb_page);
> > > >> }
> > > >>
> > > >>
> > > >>>>       nfs_release_request(req);
> > > >>>> }
> > > >>>>
> > > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > >>>> nfs_pgio_header *hdr)
> > > >>>>               nfs_list_remove_request(req);
> > > >>>>               nfs_readpage_release(req, error);
> > > >>>>       }
> > > >>>> +       nfs_netfs_read_completion(hdr);
> > > >>>> +
> > > >>>> out:
> > > >>>>       hdr->release(hdr);
> > > >>>> }
> > > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > >>>> nfs_pgio_header *hdr,
> > > >>>>                             struct rpc_task_setup *task_setup_data,
> > > >>>> int how)
> > > >>>> {
> > > >>>>       rpc_ops->read_setup(hdr, msg);
> > > >>>> +       nfs_netfs_initiate_read(hdr);
> > > >>>>       trace_nfs_initiate_read(hdr);
> > > >>>> }
> > > >>>>
> > > >>>>
> > > >>>> Are you ok with these additions?  Something like this would
> > > >>>> be required in the case of fscache configured and enabled,
> > > >>>> because we could have some of the data in a read in
> > > >>>> fscache, and some not.  That is the reason for the netfs
> > > >>>> design, and why we need to be able to call the normal
> > > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > > >>>> back via netfs_subreq_terminated)?
> > > >>>>
> > > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > >>>>       struct pnfs_layout_segment *pg_lseg;
> > > >>>>       struct nfs_io_completion *pg_io_completion;
> > > >>>>       struct nfs_direct_req   *pg_dreq;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       void                    *pg_netfs;
> > > >>>> +#endif
> > > >>>>
> > > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > >>>>       const struct nfs_rw_ops *rw_ops;
> > > >>>>       struct nfs_io_completion *io_completion;
> > > >>>>       struct nfs_direct_req   *dreq;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       void                    *netfs;
> > > >>>> +#endif
> > > >>>>
> > > >>>>
> > > >>>> And these additions to pagelist.c?
> > > >>>>
> > > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > >>>> nfs_pageio_descriptor *desc,
> > > >>>>       hdr->good_bytes = mirror->pg_count;
> > > >>>>       hdr->io_completion = desc->pg_io_completion;
> > > >>>>       hdr->dreq = desc->pg_dreq;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       if (desc->pg_netfs)
> > > >>>> +               hdr->netfs = desc->pg_netfs;
> > > >>>> +#endif
> > > >>>
> > > >>> Why the conditional?
> > > >>>
> > > >> Not really needed and I was thinking of removing it, so I'll do that.
> > > >>
> > > >>>>
> > > >>>>
> > > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > >>>> *desc,
> > > >>>>       desc->pg_lseg = NULL;
> > > >>>>       desc->pg_io_completion = NULL;
> > > >>>>       desc->pg_dreq = NULL;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       desc->pg_netfs = NULL;
> > > >>>> +#endif
> > > >>>>
> > > >>>>
> > > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > >>>> nfs_pageio_descriptor *desc,
> > > >>>>
> > > >>>>       desc->pg_io_completion = hdr->io_completion;
> > > >>>>       desc->pg_dreq = hdr->dreq;
> > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > >>>> +       desc->pg_netfs = hdr->netfs;
> > > >>>> +#endif
> > > >>>
> > > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > > >>>
> > > >> Ok.
> > > >>
> > > >>
> > > >>
> > > >>>>
> > > >>>>
> > > >>>>> My expectation is that the standard I/O path should have minimal
> > > >>>>> overhead, and should certainly not increase the overhead that we
> > > >>>>> already have. Will this be addressed in future iterations of these
> > > >>>>> patches?
> > > >>>>>
> > > >>>>
> > > >>>> I will do what I can to satisfy what you want, either by fixing up
> > > >>>> this patch or follow-on patches.  Hopefully the above questions
> > > >>>> will clarify the next steps.
> > > >>>>
> > > >>>
> > > >>> --
> > > >>> Trond Myklebust
> > > >>> Linux NFS client maintainer, Hammerspace
> > > >>> trond.myklebust@hammerspace.com
> > >
> > >
> > >
> > > Trond Myklebust
> > > CTO, Hammerspace Inc
> > > 1900 S Norfolk St, Suite 350 - #45
> > > San Mateo, CA 94403
> > >
> > > www.hammer.space
> > >
> > >
> >
>
Jeff Layton Nov. 14, 2022, 5:11 p.m. UTC | #16
On Mon, 2022-11-14 at 16:03 +0000, Benjamin Maynard wrote:
> Hi Dave,
> 
> I've added responses to your questions inline below.
> 
> I also tried adding the noatime option to the mount on the source
> filer as Jeff suggested, but this has not made any difference and the
> issue is still persisting for me.
> 

My mistake. I didn't realize you were using v3 exclusively. The change
attr doesn't exist there, so this shouldn't be a factor.

> I created the following diagram that explains my setup, and the exact
> tests I am performing:
> https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> 
> Hopefully this is clearer than my explanations below (let me know if
> you'd prefer me to share an alternative way).
> 
> In order to remove the re-exporting layer of complexity, I also
> performed the tests without the re-export server (architecture:
> https://drive.google.com/file/d/1DQKhqo_UnQ8ul-z5Iram5LpisDmkKziQ/view?usp=share_link):
> 
> Source NFS Server <-- Client (with FS-Cache)
> 
> The same is happening, I cannot get FS-Cache to serve from cache.
> Heavy writes, but no reads, even when the same file is copied many
> times.
> 
> Hopefully something I am doing wrong on my end, but I can't figure out what.
> 
> 

I don't think you're doing anything wrong. We'll probably need to dig
into why netfs/fscache decided to go to the server instead of using the
cache.

It might be interesting to turn up the cachefiles_prep_read tracepoint
during this and see why it's not opting to read from cache. David and
David may have other tracepoints they recommend turning on too.

> 
> On Mon, 14 Nov 2022 at 13:47, David Wysochanski <dwysocha@redhat.com> wrote:
> > 
> > I apologize I did not read carefully enough and I missed some details
> > in your original post.
> > More below.
> > 
> > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > 
> > > Hi all,
> > > 
> > > I've been doing some more testing with these patches, I applied all of
> > > the patches (v10 from
> > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > > 
> > > I have the following setup:
> > > 
> > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > 
> > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > to the NFS Client via the Re-Export Server.
> > > 
> > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > re-export server, and once the file copy completes I see that
> > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > > 
> > > I then deleted that file from the NFS Client, and dropped the caches
> > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > > 
> > If you delete the file from the NFS client, how does that not delete the
> > file from the original NFS server?
> 
> Sorry - to be clear, I never deleted the file from the NFS mount
> (which I know would in turn delete it from the re-export server and
> the source filer).
> 
> In order to perform the performance test, I copied the file from the
> NFS mount on the NFS Client, to a local directory (cp
> /mnt/nfs/500gb.img /tmp).
> 
> When I said "I then deleted that file from the NFS Client", I meant I
> deleted the local copy of that file. Not the file on the mount (rm
> /tmp/500gb.img).
> 
> Just to also stress, I have never dropped the caches on the Re-Export
> Server (the one with FS-Cache) at any point in any of these tests, so
> I don't think this is the problem. I have only ever dropped the caches
> on the NFS client that is mounting the Re-Export Server.
> 
> > > I then performed another copy of the 500Gb file on the NFS Client,
> > > again via the Re-Export Server. What I expected would happen is that I
> > > would see heavy reads from the /var/cache/fscache volume as the file
> > > should be served from FS-Cache.
> > > 
> > I don't understand this.  When you say you "performed another copy"
> > of what file?  Wasn't the file deleted in the above step?
> 
> As above, only the local copy was deleted.
> 
> > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > also see heavy writes to /var/cache/fscache, so it appears that
> > > FS-Cache is overwriting its existing cache, and never using it.
> > 
> > That would happen if the file was changed or re-created.
> > 
> > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > it is not possible that the file is being served from the page cache.
> > > 
> > > We saw this behaviour before on an older set of the patches when our
> > > mount between the Re-Export Server and the Source NFS Filer was using
> > > the "sync" option, but we are now using the "async" option and the
> > > same is happening.
> > > 
> > > Mount options:
> > > 
> > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > > 
> > > 10.0.0.49:/files /srv/nfs/files nfs
> > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > 
> > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > > 
> > > 10.0.0.3:/files /mnt/nfs nfs
> > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > > 
> > > It is also worth noting this behaviour is not unique to the re-export
> > > use case. I see FS-Cache not being used with the following setup:
> > > 
> > > Source NFS Server <-- Client (with FS-Cache).
> > > 
> > 
> > This points at something more fundamental like something missed
> > in the test or maybe a mount option.  Can you explain what test
> > you're doing here when you say "this behavior is not unique"?
> 
> I've created the following diagram which explains the test I am
> performing. I think it is a little easier to follow than explaining in
> text. This should be viewable without any authentication:
> https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> 
> By "this behaviour is not unique to the re-export use case" I mean
> that the same happens if I remove the re-export server completely, and
> just have the following setup:
> 
> Source NFS Server <-- Client (with FS-Cache).
> 
> > Can you show the mount options for both:
> > - fscache filesystem on the re-export server (/var/cache/fscache)
> 
> root@reexport:~$ mount | grep /var/cache/fscache
> /dev/md127 on /var/cache/fscache type ext4
> (rw,relatime,discard,nobarrier,stripe=1024)
> 
> > - exported filesystem on the NFS server (filesystem in /etc/exports)
> 
> I have tried both:
> 
> root@source:~$ mount | grep files
> /dev/sdb1 on /files type ext4 (rw)
> 
> root@source:~$ cat /etc/exports
> /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> 
> and (at Jeff's suggestion):
> 
> root@source:~$ mount | grep files
> /dev/sdb1 on /files type ext4 (rw,noatime)
> 
> root@source:~$ cat /etc/exports
> /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> 
> 
> > Unfortunately the problem with drop_caches makes it more difficult
> > to know when fscache is truly working.  But some other unit test
> > I have shows fscache does work with this patchset so I'm puzzled why
> > you're not seeing it work at all.
> > 
> > I pinged dhowells on the drop_caches issue so maybe we can get
> > that one sorted out soon but I'm not sure since it's part of a series
> > and proposes changes in mm.
> 
> Just to be clear, I have never used drop_caches on the re-export
> server in any of these tests. I have only ever done this on the NFS
> Client.
> 
> > 
> > > Thanks,
> > > Ben
> > > 
> > > 
> > > Kind Regards
> > > 
> > > Benjamin Maynard
> > > 
> > > Customer Engineer
> > > 
> > > benmaynard@google.com
> > > 
> > > Google, Inc.
> > > 
> > > 
> > > 
> > > 
> > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > 
> > > > 
> > > > 
> > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > 
> > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > 
> > > > > > On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > > > > 
> > > > > > > On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > > > > > On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > > > > > wrote:
> > > > > > > > > 
> > > > > > > > > On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > > > > > > Convert the NFS buffered read code paths to corresponding netfs
> > > > > > > > > > APIs,
> > > > > > > > > > but only when fscache is configured and enabled.
> > > > > > > > > > 
> > > > > > > > > > The netfs API defines struct netfs_request_ops which must be
> > > > > > > > > > filled
> > > > > > > > > > in by the network filesystem.  For NFS, we only need to define 5
> > > > > > > > > > of
> > > > > > > > > > the functions, the main one being the issue_read() function.
> > > > > > > > > > The issue_read() function is called by the netfs layer when a
> > > > > > > > > > read
> > > > > > > > > > cannot be fulfilled locally, and must be sent to the server
> > > > > > > > > > (either
> > > > > > > > > > the cache is not active, or it is active but the data is not
> > > > > > > > > > available).
> > > > > > > > > > Once the read from the server is complete, netfs requires a call
> > > > > > > > > > to
> > > > > > > > > > netfs_subreq_terminated() which conveys either how many bytes
> > > > > > > > > > were
> > > > > > > > > > read
> > > > > > > > > > successfully, or an error.  Note that issue_read() is called with
> > > > > > > > > > a
> > > > > > > > > > structure, netfs_io_subrequest, which defines the IO requested,
> > > > > > > > > > and
> > > > > > > > > > contains a start and a length (both in bytes), and assumes the
> > > > > > > > > > underlying
> > > > > > > > > > > netfs will return either an error on the whole region, or the
> > > > > > > > > > number
> > > > > > > > > > of bytes successfully read.
> > > > > > > > > > 
> > > > > > > > > > The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > > > > > > defined
> > > > > > > > > > in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > > > > > > to
> > > > > > > > > > know how many RPCs will be sent and how the pages will be broken
> > > > > > > > > > up
> > > > > > > > > > into underlying RPCs, each of which will have their own
> > > > > > > > > > completion
> > > > > > > > > > and
> > > > > > > > > > return code.  In contrast, netfs is subrequest based, a single
> > > > > > > > > > subrequest may contain multiple pages, and a single subrequest is
> > > > > > > > > > initiated with issue_read() and terminated with
> > > > > > > > > > netfs_subreq_terminated().
> > > > > > > > > > > Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > > > > > > the netfs API requirement on the single response to the whole
> > > > > > > > > > subrequest, while also minimizing disruptive changes to the NFS
> > > > > > > > > > pgio layer.
> > > > > > > > > > 
> > > > > > > > > > The approach taken with this patch is to allocate a small
> > > > > > > > > > structure
> > > > > > > > > > for each nfs_netfs_issue_read() call, store the final error and
> > > > > > > > > > number
> > > > > > > > > > of bytes successfully transferred in the structure, and update
> > > > > > > > > > these
> > > > > > > > > > values
> > > > > > > > > > as each RPC completes.  The refcount on the structure is used as
> > > > > > > > > > a
> > > > > > > > > > marker
> > > > > > > > > > for the last RPC completion, is incremented in
> > > > > > > > > > nfs_netfs_read_initiate(),
> > > > > > > > > > and decremented inside nfs_netfs_read_completion(), when a
> > > > > > > > > > nfs_pgio_header
> > > > > > > > > > contains a valid pointer to the data.  On the final put (which
> > > > > > > > > > signals
> > > > > > > > > > the final outstanding RPC is complete) in
> > > > > > > > > > nfs_netfs_read_completion(),
> > > > > > > > > > call netfs_subreq_terminated() with either the final error value
> > > > > > > > > > (if
> > > > > > > > > > one or more READs complete with an error) or the number of bytes
> > > > > > > > > > successfully transferred (if all RPCs complete successfully).
> > > > > > > > > > Note
> > > > > > > > > > that when all RPCs complete successfully, the number of bytes
> > > > > > > > > > transferred
> > > > > > > > > > is capped to the length of the subrequest.  Capping the
> > > > > > > > > > transferred
> > > > > > > > > > length
> > > > > > > > > > to the subrequest length prevents "Subreq overread" warnings from
> > > > > > > > > > netfs.
> > > > > > > > > > This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > > > > > > the
> > > > > > > > > > corner case where NFS requests a full page at the end of the
> > > > > > > > > > file,
> > > > > > > > > > even when i_size reflects only a partial page (NFS overread).
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > > > > > > Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > This is not doing what I asked for, which was to separate out the
> > > > > > > > > fscache functionality, so that we can call that if and when it is
> > > > > > > > > available.
> > > > > > > > > 
> > > > > > > > I must have misunderstood then.
> > > > > > > > 
> > > > > > > > The last feedback I have from you was that you wanted it to be
> > > > > > > > an opt-in feature, and it was a comment on a previous patch
> > > > > > > > to Kconfig.  I was proceeding the best I knew how, but
> > > > > > > > let me try to get back on track.
> > > > > > > > 
> > > > > > > > > Instead, it is just wrapping the NFS requests inside netfs
> > > > > > > > > requests. As
> > > > > > > > > it stands, that means it is just duplicating information, and
> > > > > > > > > adding
> > > > > > > > > unnecessary overhead to the standard I/O path (extra allocations,
> > > > > > > > > extra
> > > > > > > > > indirect calls, and extra bloat to the inode).
> > > > > > > > > 
> > > > > > > > I think I understand what you're saying but I'm not sure.  Let me
> > > > > > > > ask some clarifying questions.
> > > > > > > > 
> > > > > > > > Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > > > > > configured?  Or when it is not?  Or both?  I think you're objecting
> > > > > > > > when it's configured, but not enabled (we mount without 'fsc').
> > > > > > > > Am I right?
> > > > > > > > 
> > > > > > > > > Also, are you objecting to the design that to use fscache we now
> > > > > > > > have to use netfs, specifically:
> > > > > > > > - call into netfs via either netfs_read_folio or netfs_readahead
> > > > > > > > - if fscache is enabled, then the IO can be satisfied from fscache
> > > > > > > > - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > > > > > from the cache, then NFS is called back via netfs_issue_read
> > > > > > > > and we use the normal NFS read pageio interface.  This requires
> > > > > > > > we call netfs_subreq_terminated() when all the RPCs complete,
> > > > > > > > which is the reason for the small changes to pagelist.c
> > > > > > > 
> > > > > > > I'm objecting to any middle layer "solution" that adds overhead to the
> > > > > > > NFS I/O paths.
> > > > > > > 
> > > > > > Got it.
> > > > > > 
> > > > > > > I'm willing to consider solutions that are specific only to the fscache
> > > > > > > use case (i.e. when the 'fsc' mount option is specified). However when
> > > > > > > I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > > > > extra memory allocations, extra indirect calls and larger inode
> > > > > > > footprints.
> > > > > > > 
> > > > > > > IOW: I want the code to optimise for the case of standard NFS, not for
> > > > > > > the case of 'NFS with cachefs additions'.
> > > > > > > 
> > > > > > I agree completely.  Are you seeing extra memory allocations
> > > > > > > happen on mounts without 'fsc' or is it more a concern about how
> > > > > > some of the patches look?  We should not be calling any netfs or
> > > > > > fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > > > testing. So either there's a misunderstanding here, or there's a
> > > > > > bug I'm missing.
> > > > > > 
> > > > > > If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > > > nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > > > If it's configured but not enabled, then the checks for
> > > > > > netfs_inode(inode)->cache should skip over any netfs code.
> > > > > > But maybe there's a non-obvious bug you're seeing and
> > > > > > somehow netfs is still getting called?  Because I cannot
> > > > > > see netfs getting called if 'fsc' is not on the mount in my
> > > > > > tests.
> > > > > > 
> > > > > > int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > > > {
> > > > > >       if (!netfs_inode(folio_inode(folio))->cache)
> > > > > >               return -ENOBUFS;
> > > > > > 
> > > > > >       return netfs_read_folio(file, folio);
> > > > > > }
> > > > > > 
> > > > > > int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > > > {
> > > > > >       struct inode *inode = ractl->mapping->host;
> > > > > > 
> > > > > >       if (!netfs_inode(inode)->cache)
> > > > > >               return -ENOBUFS;
> > > > > > 
> > > > > >       netfs_readahead(ractl);
> > > > > >       return 0;
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > > > 
> > > > > > > > Can you be more specific as to the portions of the patch you don't
> > > > > > > > like
> > > > > > > > so I can move it in the right direction?
> > > > > > > > 
> > > > > > > > This is from patch #2 which you didn't comment on.  I'm not sure
> > > > > > > > you're
> > > > > > > > ok with it though, since you mention "extra bloat to the inode".
> > > > > > > > Do you object to this even though it's wrapped in an
> > > > > > > > #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > > > > > extra size be added to nfs_inode?
> > > > > > > > 
> > > > > > > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > >       __u64 write_io;
> > > > > > > >       __u64 read_io;
> > > > > > > > #ifdef CONFIG_NFS_FSCACHE
> > > > > > > > -       struct fscache_cookie   *fscache;
> > > > > > > > -#endif
> > > > > > > > +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > > > > > */
> > > > > > > > +#else
> > > > > > > >       struct inode            vfs_inode;
> > > > > > > > +#endif
> > > > > > > > +
> > > > > > > 
> > > > > > > Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > > > > point, however for now NFS is not unconditionally opting into the netfs
> > > > > > > project. If we're to ever do that, then I want to see streamlined code
> > > > > > > for the standard I/O case.
> > > > > > > 
> > > > > > Ok and understood about standard I/O case.
> > > > > > 
> > > > > > I was thinking how we might not increase the size, but I don't think
> > > > > > I can make it work.
> > > > > > 
> > > > > > I thought we could change to something like the below, without an
> > > > > > embedded struct inode:
> > > > > > 
> > > > > > @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > >       __u64 write_io;
> > > > > >       __u64 read_io;
> > > > > > #ifdef CONFIG_NFS_FSCACHE
> > > > > > -       struct fscache_cookie   *fscache;
> > > > > > -#endif
> > > > > > +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > > > +#else
> > > > > >       struct inode            vfs_inode;
> > > > > > +#endif
> > > > > > +
> > > > > > 
> > > > > > Then I would need to alloc/free a netfs_inode at the time of
> > > > > > nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > > > macro cannot work, because it requires an embedded "struct inode"
> > > > > > due to "container_of" use:
> > > > > > 
> > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > +{
> > > > > > +       return &nfsi->netfs.inode;
> > > > > > +}
> > > > > > +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > +{
> > > > > > +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > > > +}
> > > > > > +#else
> > > > > > +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > +{
> > > > > > +       return &nfsi->vfs_inode;
> > > > > > +}
> > > > > > static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > {
> > > > > >       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > > > }
> > > > > > +#endif
> > > > > > 
> > > > > > 
> > > > > 
> > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > patch below.  What do you think?
> > > > 
> > > > That works for me.
> > > > 
> > > > > 
> > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > think it's an ok idea I can try to work out what is needed across
> > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > case for NFS where fscache is "configured but not enabled",
> > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > each time, it will add up so it is worth at least a discussion.
> > > > > 
> > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > --- a/include/linux/netfs.h
> > > > > +++ b/include/linux/netfs.h
> > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > >                                     bool was_async);
> > > > > 
> > > > > -/*
> > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > - */
> > > > > -struct netfs_inode {
> > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > +struct netfs_info {
> > > > >       const struct netfs_request_ops *ops;
> > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > >       struct fscache_cookie   *cache;
> > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > };
> > > > > 
> > > > > +/*
> > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > + */
> > > > > +struct netfs_inode {
> > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > +};
> > > > > +
> > > > > /*
> > > > > * Resources required to do operations on a cache.
> > > > > */
> > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > *netfs_inode(struct inode *inode)
> > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > >                                   const struct netfs_request_ops *ops)
> > > > > {
> > > > > -       ctx->ops = ops;
> > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > +       /* FIXME: Check for NULL */
> > > > > +       ctx->netfs->ops = ops;
> > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > -       ctx->cache = NULL;
> > > > > +       ctx->netfs->cache = NULL;
> > > > > #endif
> > > > > }
> > > > > 
> > > > > 
> > > > > 
> > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Are you ok with the stub functions which are placed in fscache.h, and
> > > > > > > > when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > > > > > or a 1-liner (nfs_netfs_readpage_release)?
> > > > > > > > 
> > > > > > > > #else /* CONFIG_NFS_FSCACHE */
> > > > > > > > +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > > > > > +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > > > > > *hdr) {}
> > > > > > > > +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > > > > > *hdr) {}
> > > > > > > > +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > > +{
> > > > > > > > +       unlock_page(req->wb_page);
> > > > > > > > +}
> > > > > > > > static inline void nfs_fscache_release_super_cookie(struct
> > > > > > > > super_block *sb) {}
> > > > > > > > static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Do you object to the below?  If so, then do you want
> > > > > > > > #ifdef CONFIG_NFS_FSCACHE here?
> > > > > > > > 
> > > > > > > > -- a/fs/nfs/inode.c
> > > > > > > > +++ b/fs/nfs/inode.c
> > > > > > > > @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > > > > > super_block *sb)
> > > > > > > > #ifdef CONFIG_NFS_V4_2
> > > > > > > >       nfsi->xattr_cache = NULL;
> > > > > > > > #endif
> > > > > > > > +       nfs_netfs_inode_init(nfsi);
> > > > > > > > +
> > > > > > > >       return VFS_I(nfsi);
> > > > > > > > }
> > > > > > > > EXPORT_SYMBOL_GPL(nfs_alloc_i
> > > > > > > > node);
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > > > > > how about the below calls to netfs from nfs_read_folio and
> > > > > > > > nfs_readahead into equivalent netfs calls?  So when
> > > > > > > > > CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > > > > > ('fsc' not on mount), these netfs functions do immediately call
> > > > > > > > netfs_alloc_request().  But I wonder if we could simply add a
> > > > > > > > check to see if fscache is enabled on the mount, and skip
> > > > > > > > over to satisfy what you want.  Am I understanding what you
> > > > > > > > want?
> > > > > > > 
> > > > > > > Quite frankly, I'd prefer that we just split out the functionality that
> > > > > > > is needed from the netfs code so that it can be optimised. However I'm
> > > > > > > not interested enough in the cachefs functionality to work on that
> > > > > > > myself. ...and as I indicated above, I might be OK with opting into the
> > > > > > > netfs project, once the overhead can be made to disappear.
> > > > > > > 
> > > > > > Understood.
> > > > > > 
> > > > > > If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > > > functions into a netfs.c file as a starting point.  Or that can maybe
> > > > > > be done in a future patchset?
> > > > > > 
> > > > > > For now I was equating netfs and fscache together so we can
> > > > > > move on from the much older and single-page limiting fscache
> > > > > > interface that is likely to go away soon.
> > > > > > 
> > > > > > > > 
> > > > > > > > @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > > > > > folio *folio)
> > > > > > > >       if (NFS_STALE(inode))
> > > > > > > >               goto out_unlock;
> > > > > > > > 
> > > > > > > > +       ret = nfs_netfs_read_folio(file, folio);
> > > > > > > > +       if (!ret)
> > > > > > > > +               goto out;
> > > > > > > > +
> > > > > > > > 
> > > > > > > > @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > > > > > *ractl)
> > > > > > > >       if (NFS_STALE(inode))
> > > > > > > >               goto out;
> > > > > > > > 
> > > > > > > > +       ret = nfs_netfs_readahead(ractl);
> > > > > > > > +       if (!ret)
> > > > > > > > +               goto out;
> > > > > > > > +
> > > > > > > > 
> > > > > > The above wrappers should prevent any additional overhead when fscache
> > > > > > is not enabled.  As far as I know these work to avoid calling netfs
> > > > > > when 'fsc' is not on the mount.
> > > > > > 
> > > > > > > > 
> > > > > > > > And how about these calls from different points in the read
> > > > > > > > path to the earlier mentioned stub functions?
> > > > > > > > 
> > > > > > > > @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > > > > > 
> > > > > > > > static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > > {
> > > > > > > > -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > >       struct page *page = req->wb_page;
> > > > > > > > 
> > > > > > > > -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > > > > > > s_id,
> > > > > > > > -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > > > > > -               (long long)req_offset(req));
> > > > > > > > -
> > > > > > > >       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > > > > > ETIMEDOUT)
> > > > > > > >               SetPageError(page);
> > > > > > > > -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > > > > > -               if (PageUptodate(page))
> > > > > > > > -                       nfs_fscache_write_page(inode, page);
> > > > > > > > -               unlock_page(page);
> > > > > > > > -       }
> > > > > > > > +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > > +               nfs_netfs_readpage_release(req);
> > > > > > > > +
> > > > > > > 
> > > > > > > I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > > > > going to need to change when we move it to use folios natively anyway.
> > > > > > > 
> > > > > > Ok, how about I make it conditional on whether fscache is configured
> > > > > > and enabled then, similar to the nfs_netfs_read_folio() and
> > > > > > nfs_netfs_readahead()?  Below is what that would look like.
> > > > > > I could inline the code in nfs_netfs_readpage_release() if you
> > > > > > think it would be clearer.
> > > > > > 
> > > > > > static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > {
> > > > > >       struct page *page = req->wb_page;
> > > > > > 
> > > > > >       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > > >               SetPageError(page);
> > > > > >       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > #ifndef CONFIG_NFS_FSCACHE
> > > > > >               unlock_page(req->wb_page);
> > > > > > #else
> > > > > >               nfs_netfs_readpage_release(req);
> > > > > > #endif
> > > > > >       nfs_release_request(req);
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > {
> > > > > >   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > 
> > > > > >   /*
> > > > > >    * If fscache is enabled, netfs will unlock pages.
> > > > > >    */
> > > > > >   if (netfs_inode(inode)->cache)
> > > > > >       return;
> > > > > > 
> > > > > >   unlock_page(req->wb_page);
> > > > > > }
> > > > > > 
> > > > > > 
> > > > > > > >       nfs_release_request(req);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > > > > > nfs_pgio_header *hdr)
> > > > > > > >               nfs_list_remove_request(req);
> > > > > > > >               nfs_readpage_release(req, error);
> > > > > > > >       }
> > > > > > > > +       nfs_netfs_read_completion(hdr);
> > > > > > > > +
> > > > > > > > out:
> > > > > > > >       hdr->release(hdr);
> > > > > > > > }
> > > > > > > > @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > > > > > nfs_pgio_header *hdr,
> > > > > > > >                             struct rpc_task_setup *task_setup_data,
> > > > > > > > int how)
> > > > > > > > {
> > > > > > > >       rpc_ops->read_setup(hdr, msg);
> > > > > > > > +       nfs_netfs_initiate_read(hdr);
> > > > > > > >       trace_nfs_initiate_read(hdr);
> > > > > > > > }
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Are you ok with these additions?  Something like this would
> > > > > > > > be required in the case of fscache configured and enabled,
> > > > > > > > because we could have some of the data in a read in
> > > > > > > > fscache, and some not.  That is the reason for the netfs
> > > > > > > > design, and why we need to be able to call the normal
> > > > > > > > NFS read IO path (netfs calls into issue_read, and we call
> > > > > > > > back via netfs_subreq_terminated)?
> > > > > > > > 
> > > > > > > > @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > > > > >       struct pnfs_layout_segment *pg_lseg;
> > > > > > > >       struct nfs_io_completion *pg_io_completion;
> > > > > > > >       struct nfs_direct_req   *pg_dreq;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       void                    *pg_netfs;
> > > > > > > > +#endif
> > > > > > > > 
> > > > > > > > @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > > > > >       const struct nfs_rw_ops *rw_ops;
> > > > > > > >       struct nfs_io_completion *io_completion;
> > > > > > > >       struct nfs_direct_req   *dreq;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       void                    *netfs;
> > > > > > > > +#endif
> > > > > > > > 
> > > > > > > > 
> > > > > > > > And these additions to pagelist.c?
> > > > > > > > 
> > > > > > > > @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > > > > > nfs_pageio_descriptor *desc,
> > > > > > > >       hdr->good_bytes = mirror->pg_count;
> > > > > > > >       hdr->io_completion = desc->pg_io_completion;
> > > > > > > >       hdr->dreq = desc->pg_dreq;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       if (desc->pg_netfs)
> > > > > > > > +               hdr->netfs = desc->pg_netfs;
> > > > > > > > +#endif
> > > > > > > 
> > > > > > > Why the conditional?
> > > > > > > 
> > > > > > Not really needed and I was thinking of removing it, so I'll do that.
> > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > > > > > *desc,
> > > > > > > >       desc->pg_lseg = NULL;
> > > > > > > >       desc->pg_io_completion = NULL;
> > > > > > > >       desc->pg_dreq = NULL;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       desc->pg_netfs = NULL;
> > > > > > > > +#endif
> > > > > > > > 
> > > > > > > > 
> > > > > > > > @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > > > > > nfs_pageio_descriptor *desc,
> > > > > > > > 
> > > > > > > >       desc->pg_io_completion = hdr->io_completion;
> > > > > > > >       desc->pg_dreq = hdr->dreq;
> > > > > > > > +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > +       desc->pg_netfs = hdr->netfs;
> > > > > > > > +#endif
> > > > > > > 
> > > > > > > Those all need wrapper functions instead of embedding #ifdefs.
> > > > > > > 
> > > > > > Ok.
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > My expectation is that the standard I/O path should have minimal
> > > > > > > > > overhead, and should certainly not increase the overhead that we
> > > > > > > > > already have. Will this be addressed in future iterations of these
> > > > > > > > > patches?
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > I will do what I can to satisfy what you want, either by fixing up
> > > > > > > > this patch or follow-on patches.  Hopefully the above questions
> > > > > > > > will clarify the next steps.
> > > > > > > > 
> > > > > > > 
> > > > > > > --
> > > > > > > Trond Myklebust
> > > > > > > Linux NFS client maintainer, Hammerspace
> > > > > > > trond.myklebust@hammerspace.com
> > > > 
> > > > 
> > > > 
> > > > Trond Myklebust
> > > > CTO, Hammerspace Inc
> > > > 1900 S Norfolk St, Suite 350 - #45
> > > > San Mateo, CA 94403
> > > > 
> > > > www.hammer.space
> > > > 
> > > > 
> > > 
> >
David Wysochanski Nov. 14, 2022, 5:34 p.m. UTC | #17
On Mon, Nov 14, 2022 at 11:04 AM Benjamin Maynard <benmaynard@google.com> wrote:
>
> Hi Dave,
>
> I've added responses to your questions inline below.
>
> I also tried adding the noatime option to the mount on the source
> filer as Jeff suggested, but this has not made any difference and the
> issue is still persisting for me.
>
> I created the following diagram that explains my setup, and the exact
> tests I am performing:
> https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
>
> Hopefully this is clearer than my explanations below (let me know if
> you'd prefer me to share an alternative way).
>
Yes, that's very helpful.  Let me think about this one as I'm not sure.
As Jeff says we may need tracepoints to track it down if I cannot repro
it and/or nothing comes to mind.

> In order to remove the re-exporting layer of complexity, I also
> performed the tests without the re-export server (architecture:
> https://drive.google.com/file/d/1DQKhqo_UnQ8ul-z5Iram5LpisDmkKziQ/view?usp=share_link):
>
> Source NFS Server <-- Client (with FS-Cache)
>
> The same is happening, I cannot get FS-Cache to serve from cache.
> Heavy writes, but no reads, even when the same file is copied many
> times.
>
I'm pretty sure that with the above you're hitting the drop_caches /
"fscache read optimisation" issue #1 I mentioned.

I see dhowells just posted a v2 version of his previous patch:
https://lore.kernel.org/linux-mm/166844174069.1124521.10890506360974169994.stgit@warthog.procyon.org.uk/

I started with 6.1-rc5, added the above dhowells latest patch for that issue,
and then my 5 patches on top.  Then I added a small patch to utilize
dhowells patch to ensure the read optimisation is removed.  I ran my
unit test that has been failing all along and as expected it passes with
these patches.  I pushed the series to github:
https://github.com/DaveWysochanskiRH/kernel/commits/nfs-fscache-netfs
https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f

I will also email you the series of patches on top of 6.1-rc5 so you
can just apply from your mailbox if you want.



> Hopefully something I am doing wrong on my end, but I can't figure out what.
>
> Kind Regards
> Benjamin Maynard
>
>
> On Mon, 14 Nov 2022 at 13:47, David Wysochanski <dwysocha@redhat.com> wrote:
> >
> > I apologize I did not read carefully enough and I missed some details
> > in your original post.
> > More below.
> >
> > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > >
> > > Hi all,
> > >
> > > I've been doing some more testing with these patches, I applied all of
> > > the patches (v10 from
> > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > >
> > > I have the following setup:
> > >
> > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > >
> > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > to the NFS Client via the Re-Export Server.
> > >
> > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > re-export server, and once the file copy completes I see that
> > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > >
> > > I then deleted that file from the NFS Client, and dropped the caches
> > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > >
> > If you delete the file from the NFS client, how does that not delete the
> > file from the original NFS server?
>
> Sorry - to be clear, I never deleted the file from the NFS mount
> (which I know would in turn delete it from the re-export server and
> the source filer).
>
> In order to perform the performance test, I copied the file from the
> NFS mount on the NFS Client, to a local directory (cp
> /mnt/nfs/500gb.img /tmp).
>
> When I said "I then deleted that file from the NFS Client", I meant I
> deleted the local copy of that file. Not the file on the mount (rm
> /tmp/500gb.img).
>
> Just to also stress, I have never dropped the caches on the Re-Export
> Server (the one with FS-Cache) at any point in any of these tests, so
> I don't think this is the problem. I have only ever dropped the caches
> on the NFS client that is mounting the Re-Export Server.
>
> > > I then performed another copy of the 500Gb file on the NFS Client,
> > > again via the Re-Export Server. What I expected would happen is that I
> > > would see heavy reads from the /var/cache/fscache volume as the file
> > > should be served from FS-Cache.
> > >
> > I don't understand this.  When you say you "performed another copy"
> > of what file?  Wasn't the file deleted in the above step?
>
> As above, only the local copy was deleted.
>
> > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > also see heavy writes to /var/cache/fscache, so it appears that
> > > FS-Cache is overwriting its existing cache, and never using it.
> >
> > That would happen if the file was changed or re-created.
> >
> > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > it is not possible that the file is being served from the page cache.
> > >
> > > We saw this behaviour before on an older set of the patches when our
> > > mount between the Re-Export Server and the Source NFS Filer was using
> > > the "sync" option, but we are now using the "async" option and the
> > > same is happening.
> > >
> > > Mount options:
> > >
> > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > >
> > > 10.0.0.49:/files /srv/nfs/files nfs
> > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > >
> > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > >
> > > 10.0.0.3:/files /mnt/nfs nfs
> > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > >
> > > It is also worth noting this behaviour is not unique to the re-export
> > > use case. I see FS-Cache not being used with the following setup:
> > >
> > > Source NFS Server <-- Client (with FS-Cache).
> > >
> >
> > This points at something more fundamental like something missed
> > in the test or maybe a mount option.  Can you explain what test
> > you're doing here when you say "this behavior is not unique"?
>
> I've created the following diagram which explains the test I am
> performing. I think it is a little easier to follow than explaining in
> text. This should be viewable without any authentication:
> https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
>
> By "this behaviour is not unique to the re-export use case" I mean
> that the same happens if I remove the re-export server completely, and
> just have the following setup:
>
> Source NFS Server <-- Client (with FS-Cache).
>
> > Can you show the mount options for both:
> > - fscache filesystem on the re-export server (/var/cache/fscache)
>
> root@reexport:~$ mount | grep /var/cache/fscache
> /dev/md127 on /var/cache/fscache type ext4
> (rw,relatime,discard,nobarrier,stripe=1024)
>
> > - exported filesystem on the NFS server (filesystem in /etc/exports)
>
> I have tried both:
>
> root@source:~$ mount | grep files
> /dev/sdb1 on /files type ext4 (rw)
>
> root@source:~$ cat /etc/exports
> /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
>
> and (at Jeff's suggestion):
>
> root@source:~$ mount | grep files
> /dev/sdb1 on /files type ext4 (rw,noatime)
>
> root@source:~$ cat /etc/exports
> /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
>
>
> > Unfortunately the problem with drop_caches makes it more difficult
> > to know when fscache is truly working.  But some other unit test
> > I have shows fscache does work with this patchset so I'm puzzled why
> > you're not seeing it work at all.
> >
> > I pinged dhowells on the drop_caches issue so maybe we can get
> > that one sorted out soon but I'm not sure since it's part of a series
> > and proposes changes in mm.
>
> Just to be clear, I have never used drop_caches on the re-export
> server in any of these tests. I have only ever done this on the NFS
> Client.
>
> >
> > > Thanks,
> > > Ben
> > >
> > >
> > > Kind Regards
> > >
> > > Benjamin Maynard
> > >
> > > Customer Engineer
> > >
> > > benmaynard@google.com
> > >
> > > Google, Inc.
> > >
> > >
> > >
> > >
> > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > >
> > > >
> > > >
> > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > >
> > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > >>
> > > > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > >>>
> > > > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > >>>> wrote:
> > > > >>>>>
> > > > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > > > >>>>>> APIs,
> > > > >>>>>> but only when fscache is configured and enabled.
> > > > >>>>>>
> > > > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > > > >>>>>> filled
> > > > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > > > >>>>>> of
> > > > >>>>>> the functions, the main one being the issue_read() function.
> > > > >>>>>> The issue_read() function is called by the netfs layer when a
> > > > >>>>>> read
> > > > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > > > >>>>>> (either
> > > > >>>>>> the cache is not active, or it is active but the data is not
> > > > >>>>>> available).
> > > > >>>>>> Once the read from the server is complete, netfs requires a call
> > > > >>>>>> to
> > > > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > > > >>>>>> were
> > > > >>>>>> read
> > > > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > > > >>>>>> a
> > > > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > > > >>>>>> and
> > > > >>>>>> contains a start and a length (both in bytes), and assumes the
> > > > >>>>>> underlying
> > > > > >>>>>> netfs will return either an error on the whole region, or the
> > > > >>>>>> number
> > > > >>>>>> of bytes successfully read.
> > > > >>>>>>
> > > > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > > > >>>>>> defined
> > > > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > >>>>>> to
> > > > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > > > >>>>>> up
> > > > >>>>>> into underlying RPCs, each of which will have their own
> > > > >>>>>> completion
> > > > >>>>>> and
> > > > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > > > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > > > >>>>>> initiated with issue_read() and terminated with
> > > > >>>>>> netfs_subreq_terminated().
> > > > > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > >>>>>> the netfs API requirement on the single response to the whole
> > > > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > > > >>>>>> pgio layer.
> > > > >>>>>>
> > > > >>>>>> The approach taken with this patch is to allocate a small
> > > > >>>>>> structure
> > > > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > > > >>>>>> number
> > > > >>>>>> of bytes successfully transferred in the structure, and update
> > > > >>>>>> these
> > > > >>>>>> values
> > > > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > > > >>>>>> a
> > > > >>>>>> marker
> > > > >>>>>> for the last RPC completion, is incremented in
> > > > > >>>>>> nfs_netfs_initiate_read(),
> > > > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > > > >>>>>> nfs_pgio_header
> > > > >>>>>> contains a valid pointer to the data.  On the final put (which
> > > > >>>>>> signals
> > > > >>>>>> the final outstanding RPC is complete) in
> > > > >>>>>> nfs_netfs_read_completion(),
> > > > >>>>>> call netfs_subreq_terminated() with either the final error value
> > > > >>>>>> (if
> > > > >>>>>> one or more READs complete with an error) or the number of bytes
> > > > >>>>>> successfully transferred (if all RPCs complete successfully).
> > > > >>>>>> Note
> > > > >>>>>> that when all RPCs complete successfully, the number of bytes
> > > > >>>>>> transferred
> > > > >>>>>> is capped to the length of the subrequest.  Capping the
> > > > >>>>>> transferred
> > > > >>>>>> length
> > > > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > > > >>>>>> netfs.
> > > > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > >>>>>> the
> > > > >>>>>> corner case where NFS requests a full page at the end of the
> > > > >>>>>> file,
> > > > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > > > >>>>>>
> > > > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > >>>>>
> > > > >>>>>
> > > > >>>>> This is not doing what I asked for, which was to separate out the
> > > > >>>>> fscache functionality, so that we can call that if and when it is
> > > > >>>>> available.
> > > > >>>>>
> > > > >>>> I must have misunderstood then.
> > > > >>>>
> > > > >>>> The last feedback I have from you was that you wanted it to be
> > > > >>>> an opt-in feature, and it was a comment on a previous patch
> > > > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > > > >>>> let me try to get back on track.
> > > > >>>>
> > > > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > > > >>>>> requests. As
> > > > >>>>> it stands, that means it is just duplicating information, and
> > > > >>>>> adding
> > > > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > > > >>>>> extra
> > > > >>>>> indirect calls, and extra bloat to the inode).
> > > > >>>>>
> > > > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > > > >>>> ask some clarifying questions.
> > > > >>>>
> > > > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > > > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > > > >>>> Am I right?
> > > > >>>>
> > > > > >>>> Also, are you objecting to the design that to use fscache we now
> > > > >>>> have to use netfs, specifically:
> > > > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > > > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > > > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > >>>> from the cache, then NFS is called back via netfs_issue_read
> > > > >>>> and we use the normal NFS read pageio interface.  This requires
> > > > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > > > >>>> which is the reason for the small changes to pagelist.c
> > > > >>>
> > > > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > > > >>> NFS I/O paths.
> > > > >>>
> > > > >> Got it.
> > > > >>
> > > > >>> I'm willing to consider solutions that are specific only to the fscache
> > > > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > > > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > >>> extra memory allocations, extra indirect calls and larger inode
> > > > >>> footprints.
> > > > >>>
> > > > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > > > >>> the case of 'NFS with cachefs additions'.
> > > > >>>
> > > > >> I agree completely.  Are you seeing extra memory allocations
> > > > > >> happen on mounts without 'fsc' or is it more a concern about how
> > > > >> some of the patches look?  We should not be calling any netfs or
> > > > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > >> testing. So either there's a misunderstanding here, or there's a
> > > > >> bug I'm missing.
> > > > >>
> > > > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > >> If it's configured but not enabled, then the checks for
> > > > >> netfs_inode(inode)->cache should skip over any netfs code.
> > > > >> But maybe there's a non-obvious bug you're seeing and
> > > > >> somehow netfs is still getting called?  Because I cannot
> > > > >> see netfs getting called if 'fsc' is not on the mount in my
> > > > >> tests.
> > > > >>
> > > > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > >> {
> > > > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > > > >>               return -ENOBUFS;
> > > > >>
> > > > >>       return netfs_read_folio(file, folio);
> > > > >> }
> > > > >>
> > > > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > >> {
> > > > >>       struct inode *inode = ractl->mapping->host;
> > > > >>
> > > > >>       if (!netfs_inode(inode)->cache)
> > > > >>               return -ENOBUFS;
> > > > >>
> > > > >>       netfs_readahead(ractl);
> > > > >>       return 0;
> > > > >> }
> > > > >>
> > > > >>
> > > > >>>>
> > > > >>>> Can you be more specific as to the portions of the patch you don't
> > > > >>>> like
> > > > >>>> so I can move it in the right direction?
> > > > >>>>
> > > > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > > > >>>> you're
> > > > >>>> ok with it though, since you mention "extra bloat to the inode".
> > > > >>>> Do you object to this even though it's wrapped in an
> > > > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > >>>> extra size be added to nfs_inode?
> > > > >>>>
> > > > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > >>>>       __u64 write_io;
> > > > >>>>       __u64 read_io;
> > > > >>>> #ifdef CONFIG_NFS_FSCACHE
> > > > >>>> -       struct fscache_cookie   *fscache;
> > > > >>>> -#endif
> > > > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > >>>> */
> > > > >>>> +#else
> > > > >>>>       struct inode            vfs_inode;
> > > > >>>> +#endif
> > > > >>>> +
> > > > >>>
> > > > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > >>> point, however for now NFS is not unconditionally opting into the netfs
> > > > >>> project. If we're to ever do that, then I want to see streamlined code
> > > > >>> for the standard I/O case.
> > > > >>>
> > > > >> Ok and understood about standard I/O case.
> > > > >>
> > > > >> I was thinking how we might not increase the size, but I don't think
> > > > >> I can make it work.
> > > > >>
> > > > >> I thought we could change to something like the below, without an
> > > > >> embedded struct inode:
> > > > >>
> > > > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > >>       __u64 write_io;
> > > > >>       __u64 read_io;
> > > > >> #ifdef CONFIG_NFS_FSCACHE
> > > > >> -       struct fscache_cookie   *fscache;
> > > > >> -#endif
> > > > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > >> +#else
> > > > >>       struct inode            vfs_inode;
> > > > >> +#endif
> > > > >> +
> > > > >>
> > > > >> Then I would need to alloc/free a netfs_inode at the time of
> > > > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > >> macro cannot work, because it requires an embedded "struct inode"
> > > > >> due to "container_of" use:
> > > > >>
> > > > >> +#ifdef CONFIG_NFS_FSCACHE
> > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > >> +{
> > > > >> +       return &nfsi->netfs.inode;
> > > > >> +}
> > > > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > >> +{
> > > > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > >> +}
> > > > >> +#else
> > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > >> +{
> > > > >> +       return &nfsi->vfs_inode;
> > > > >> +}
> > > > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > >> {
> > > > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > >> }
> > > > >> +#endif
> > > > >>
> > > > >>
> > > > >
> > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > patch below.  What do you think?
> > > >
> > > > That works for me.
> > > >
> > > > >
> > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > think it's an ok idea I can try to work out what is needed across
> > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > case for NFS where fscache is "configured but not enabled",
> > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > each time, it will add up so it is worth at least a discussion.
> > > > >
> > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > --- a/include/linux/netfs.h
> > > > > +++ b/include/linux/netfs.h
> > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > >                                     bool was_async);
> > > > >
> > > > > -/*
> > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > - */
> > > > > -struct netfs_inode {
> > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > +struct netfs_info {
> > > > >       const struct netfs_request_ops *ops;
> > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > >       struct fscache_cookie   *cache;
> > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > };
> > > > >
> > > > > +/*
> > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > + */
> > > > > +struct netfs_inode {
> > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > +};
> > > > > +
> > > > > /*
> > > > > * Resources required to do operations on a cache.
> > > > > */
> > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > *netfs_inode(struct inode *inode)
> > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > >                                   const struct netfs_request_ops *ops)
> > > > > {
> > > > > -       ctx->ops = ops;
> > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > +       /* FIXME: Check for NULL */
> > > > > +       ctx->netfs->ops = ops;
> > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > -       ctx->cache = NULL;
> > > > > +       ctx->netfs->cache = NULL;
> > > > > #endif
> > > > > }
> > > > >
> > > > >
> > > > >
> > > > >>
> > > > >>>>
> > > > >>>>
> > > > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > > > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > > > >>>>
> > > > >>>> #else /* CONFIG_NFS_FSCACHE */
> > > > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > >>>> *hdr) {}
> > > > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > >>>> *hdr) {}
> > > > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > >>>> +{
> > > > >>>> +       unlock_page(req->wb_page);
> > > > >>>> +}
> > > > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > > > >>>> super_block *sb) {}
> > > > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > >>>>
> > > > >>>>
> > > > >>>> Do you object to the below?  If so, then do you want
> > > > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > > > >>>>
> > > > >>>> -- a/fs/nfs/inode.c
> > > > >>>> +++ b/fs/nfs/inode.c
> > > > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > >>>> super_block *sb)
> > > > >>>> #ifdef CONFIG_NFS_V4_2
> > > > >>>>       nfsi->xattr_cache = NULL;
> > > > >>>> #endif
> > > > >>>> +       nfs_netfs_inode_init(nfsi);
> > > > >>>> +
> > > > >>>>       return VFS_I(nfsi);
> > > > >>>> }
> > > > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_i
> > > > >>>> node);
> > > > >>>>
> > > > >>>>
> > > > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > >>>> how about the below calls to netfs from nfs_read_folio and
> > > > >>>> nfs_readahead into equivalent netfs calls?  So when
> > > > > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > > > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > > > >>>> check to see if fscache is enabled on the mount, and skip
> > > > >>>> over to satisfy what you want.  Am I understanding what you
> > > > >>>> want?
> > > > >>>
> > > > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > > > >>> is needed from the netfs code so that it can be optimised. However I'm
> > > > >>> not interested enough in the cachefs functionality to work on that
> > > > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > > > >>> netfs project, once the overhead can be made to disappear.
> > > > >>>
> > > > >> Understood.
> > > > >>
> > > > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > > > >> be done in a future patchset?
> > > > >>
> > > > >> For now I was equating netfs and fscache together so we can
> > > > >> move on from the much older and single-page limiting fscache
> > > > >> interface that is likely to go away soon.
> > > > >>
> > > > >>>>
> > > > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > >>>> folio *folio)
> > > > >>>>       if (NFS_STALE(inode))
> > > > >>>>               goto out_unlock;
> > > > >>>>
> > > > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > > > >>>> +       if (!ret)
> > > > >>>> +               goto out;
> > > > >>>> +
> > > > >>>>
> > > > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > >>>> *ractl)
> > > > >>>>       if (NFS_STALE(inode))
> > > > >>>>               goto out;
> > > > >>>>
> > > > >>>> +       ret = nfs_netfs_readahead(ractl);
> > > > >>>> +       if (!ret)
> > > > >>>> +               goto out;
> > > > >>>> +
> > > > >>>>
> > > > >> The above wrappers should prevent any additional overhead when fscache
> > > > >> is not enabled.  As far as I know these work to avoid calling netfs
> > > > >> when 'fsc' is not on the mount.
> > > > >>
> > > > >>>>
> > > > >>>> And how about these calls from different points in the read
> > > > >>>> path to the earlier mentioned stub functions?
> > > > >>>>
> > > > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > >>>>
> > > > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > >>>> {
> > > > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > >>>>       struct page *page = req->wb_page;
> > > > >>>>
> > > > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > >>>>> s_id,
> > > > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > >>>> -               (long long)req_offset(req));
> > > > >>>> -
> > > > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > >>>> ETIMEDOUT)
> > > > >>>>               SetPageError(page);
> > > > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > >>>> -               if (PageUptodate(page))
> > > > >>>> -                       nfs_fscache_write_page(inode, page);
> > > > >>>> -               unlock_page(page);
> > > > >>>> -       }
> > > > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > >>>> +               nfs_netfs_readpage_release(req);
> > > > >>>> +
> > > > >>>
> > > > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > >>> going to need to change when we move it to use folios natively anyway.
> > > > >>>
> > > > >> Ok, how about I make it conditional on whether fscache is configured
> > > > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > > > >> nfs_netfs_readahead()?  Below is what that would look like.
> > > > >> I could inline the code in nfs_netfs_readpage_release() if you
> > > > >> think it would be clearer.
> > > > >>
> > > > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > >> {
> > > > >>       struct page *page = req->wb_page;
> > > > >>
> > > > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > >>               SetPageError(page);
> > > > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > >> #ifndef CONFIG_NFS_FSCACHE
> > > > >>               unlock_page(req->wb_page);
> > > > >> #else
> > > > >>               nfs_netfs_readpage_release(req);
> > > > >> #endif
> > > > >>       nfs_release_request(req);
> > > > >> }
> > > > >>
> > > > >>
> > > > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > >> {
> > > > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > >>
> > > > >>   /*
> > > > >>    * If fscache is enabled, netfs will unlock pages.
> > > > >>    */
> > > > >>   if (netfs_inode(inode)->cache)
> > > > >>       return;
> > > > >>
> > > > >>   unlock_page(req->wb_page);
> > > > >> }
> > > > >>
> > > > >>
> > > > >>>>       nfs_release_request(req);
> > > > >>>> }
> > > > >>>>
> > > > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > >>>> nfs_pgio_header *hdr)
> > > > >>>>               nfs_list_remove_request(req);
> > > > >>>>               nfs_readpage_release(req, error);
> > > > >>>>       }
> > > > >>>> +       nfs_netfs_read_completion(hdr);
> > > > >>>> +
> > > > >>>> out:
> > > > >>>>       hdr->release(hdr);
> > > > >>>> }
> > > > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > >>>> nfs_pgio_header *hdr,
> > > > >>>>                             struct rpc_task_setup *task_setup_data,
> > > > >>>> int how)
> > > > >>>> {
> > > > >>>>       rpc_ops->read_setup(hdr, msg);
> > > > >>>> +       nfs_netfs_initiate_read(hdr);
> > > > >>>>       trace_nfs_initiate_read(hdr);
> > > > >>>> }
> > > > >>>>
> > > > >>>>
> > > > >>>> Are you ok with these additions?  Something like this would
> > > > >>>> be required in the case of fscache configured and enabled,
> > > > >>>> because we could have some of the data in a read in
> > > > >>>> fscache, and some not.  That is the reason for the netfs
> > > > >>>> design, and why we need to be able to call the normal
> > > > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > > > >>>> back via netfs_subreq_terminated)?
> > > > >>>>
> > > > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > >>>>       struct pnfs_layout_segment *pg_lseg;
> > > > >>>>       struct nfs_io_completion *pg_io_completion;
> > > > >>>>       struct nfs_direct_req   *pg_dreq;
> > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > >>>> +       void                    *pg_netfs;
> > > > >>>> +#endif
> > > > >>>>
> > > > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > >>>>       const struct nfs_rw_ops *rw_ops;
> > > > >>>>       struct nfs_io_completion *io_completion;
> > > > >>>>       struct nfs_direct_req   *dreq;
> > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > >>>> +       void                    *netfs;
> > > > >>>> +#endif
> > > > >>>>
> > > > >>>>
> > > > >>>> And these additions to pagelist.c?
> > > > >>>>
> > > > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > >>>> nfs_pageio_descriptor *desc,
> > > > >>>>       hdr->good_bytes = mirror->pg_count;
> > > > >>>>       hdr->io_completion = desc->pg_io_completion;
> > > > >>>>       hdr->dreq = desc->pg_dreq;
> > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > >>>> +       if (desc->pg_netfs)
> > > > >>>> +               hdr->netfs = desc->pg_netfs;
> > > > >>>> +#endif
> > > > >>>
> > > > >>> Why the conditional?
> > > > >>>
> > > > >> Not really needed and I was thinking of removing it, so I'll do that.
> > > > >>
> > > > >>>>
> > > > >>>>
> > > > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > >>>> *desc,
> > > > >>>>       desc->pg_lseg = NULL;
> > > > >>>>       desc->pg_io_completion = NULL;
> > > > >>>>       desc->pg_dreq = NULL;
> > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > >>>> +       desc->pg_netfs = NULL;
> > > > >>>> +#endif
> > > > >>>>
> > > > >>>>
> > > > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > >>>> nfs_pageio_descriptor *desc,
> > > > >>>>
> > > > >>>>       desc->pg_io_completion = hdr->io_completion;
> > > > >>>>       desc->pg_dreq = hdr->dreq;
> > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > >>>> +       desc->pg_netfs = hdr->netfs;
> > > > >>>> +#endif
> > > > >>>
> > > > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > > > >>>
> > > > >> Ok.
> > > > >>
> > > > >>
> > > > >>
> > > > >>>>
> > > > >>>>
> > > > >>>>> My expectation is that the standard I/O path should have minimal
> > > > >>>>> overhead, and should certainly not increase the overhead that we
> > > > >>>>> already have. Will this be addressed in future iterations of these
> > > > >>>>> patches?
> > > > >>>>>
> > > > >>>>
> > > > >>>> I will do what I can to satisfy what you want, either by fixing up
> > > > >>>> this patch or follow-on patches.  Hopefully the above questions
> > > > >>>> will clarify the next steps.
> > > > >>>>
> > > > >>>
> > > > >>> --
> > > > >>> Trond Myklebust
> > > > >>> Linux NFS client maintainer, Hammerspace
> > > > >>> trond.myklebust@hammerspace.com
> > > >
> > > >
> > > >
> > > > Trond Myklebust
> > > > CTO, Hammerspace Inc
> > > > 1900 S Norfolk St, Suite 350 - #45
> > > > San Mateo, CA 94403
> > > >
> > > > www.hammer.space
> > > >
> > > >
> > >
> >
>
Benjamin Maynard Nov. 14, 2022, 9:25 p.m. UTC | #18
Thanks Dave, that did the trick!

Building the kernel from
https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
and re-running the exact same tests yielded the expected results. Data
is now being served from /var/cache/fscache.

I also reverted my change to the read ahead, so that read ahead is now
greater than the rsize. Still works as expected.

I am also seeing much better single file read speeds, and culling is
working perfectly (not running into the issue we were seeing pre
5.17).

Thanks a lot Dave, Jeff and Daire for your help.

Kind Regards
Benjamin Maynard



Kind Regards

Benjamin Maynard

Customer Engineer

benmaynard@google.com

Google, Inc.




On Mon, 14 Nov 2022 at 17:35, David Wysochanski <dwysocha@redhat.com> wrote:
>
> On Mon, Nov 14, 2022 at 11:04 AM Benjamin Maynard <benmaynard@google.com> wrote:
> >
> > Hi Dave,
> >
> > I've added responses to your questions inline below.
> >
> > I also tried adding the noatime option to the mount on the source
> > filer as Jeff suggested, but this has not made any difference and the
> > issue is still persisting for me.
> >
> > I created the following diagram that explains my setup, and the exact
> > tests I am performing:
> > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> >
> > Hopefully this is clearer than my explanations below (let me know if
> > you'd prefer me to share an alternative way).
> >
> Yes, that's very helpful.  Let me think about this one as I'm not sure.
> As Jeff says we may need tracepoints to track it down if I cannot repro
> it and/or nothing comes to mind.
>
> > In order to remove the re-exporting layer of complexity, I also
> > performed the tests without the re-export server (architecture:
> > https://drive.google.com/file/d/1DQKhqo_UnQ8ul-z5Iram5LpisDmkKziQ/view?usp=share_link):
> >
> > Source NFS Server <-- Client (with FS-Cache)
> >
> > The same is happening, I cannot get FS-Cache to serve from cache.
> > Heavy writes, but no reads, even when the same file is copied many
> > times.
> >
> I'm pretty sure that in the above you're hitting the drop_caches /
> "fscache read optimisation" issue #1 I mentioned.
>
> I see dhowells just posted a v2 version of his previous patch:
> https://lore.kernel.org/linux-mm/166844174069.1124521.10890506360974169994.stgit@warthog.procyon.org.uk/
>
> I started with 6.1-rc5, added the above dhowells latest patch for that issue,
> and then my 5 patches on top.  Then I added a small patch to utilize
> dhowells patch to ensure the read optimisation is removed.  I ran my
> unit test that has been failing all along and as expected it passes with
> these patches.  I pushed the series to github:
> https://github.com/DaveWysochanskiRH/kernel/commits/nfs-fscache-netfs
> https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
>
> I will also email you the series of patches on top of 6.1-rc5 so you
> can just apply from your mailbox if you want.
>
>
>
> > Hopefully something I am doing wrong on my end, but I can't figure out what.
> >
> > Kind Regards
> > Benjamin Maynard
> >
> >
> > On Mon, 14 Nov 2022 at 13:47, David Wysochanski <dwysocha@redhat.com> wrote:
> > >
> > > I apologize I did not read carefully enough and I missed some details
> > > in your original post.
> > > More below.
> > >
> > > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > >
> > > > Hi all,
> > > >
> > > > I've been doing some more testing with these patches, I applied all of
> > > > the patches (v10 from
> > > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > > >
> > > > I have the following setup:
> > > >
> > > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > >
> > > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > > to the NFS Client via the Re-Export Server.
> > > >
> > > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > > re-export server, and once the file copy completes I see that
> > > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > > >
> > > > I then deleted that file from the NFS Client, and dropped the caches
> > > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > > >
> > > If you delete the file from the NFS client, how does that not delete the
> > > file from the original NFS server?
> >
> > Sorry - to be clear, I never deleted the file from the NFS mount
> > (which I know would in turn delete it from the re-export server and
> > the source filer).
> >
> > In order to perform the performance test, I copied the file from the
> > NFS mount on the NFS Client, to a local directory (cp
> > /mnt/nfs/500gb.img /tmp).
> >
> > When I said "I then deleted that file from the NFS Client", I meant I
> > deleted the local copy of that file. Not the file on the mount (rm
> > /tmp/500gb.img).
> >
> > Just to also stress, I have never dropped the caches on the Re-Export
> > Server (the one with FS-Cache) at any point in any of these tests, so
> > I don't think this is the problem. I have only ever dropped the caches
> > on the NFS client that is mounting the Re-Export Server.
> >
> > > > I then performed another copy of the 500Gb file on the NFS Client,
> > > > again via the Re-Export Server. What I expected would happen is that I
> > > > would see heavy reads from the /var/cache/fscache volume as the file
> > > > should be served from FS-Cache.
> > > >
> > > I don't understand this.  When you say you "performed another copy",
> > > which file do you mean?  Wasn't the file deleted in the above step?
> >
> > As above, only the local copy was deleted.
> >
> > > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > > also see heavy writes to /var/cache/fscache, so it appears that
> > > > FS-Cache is overwriting its existing cache, and never using it.
> > >
> > > That would happen if the file was changed or re-created.
> > >
> > > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > > it is not possible that the file is being served from the page cache.
> > > >
> > > > We saw this behaviour before on an older set of the patches when our
> > > > mount between the Re-Export Server and the Source NFS Filer was using
> > > > the "sync" option, but we are now using the "async" option and the
> > > > same is happening.
> > > >
> > > > Mount options:
> > > >
> > > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > > >
> > > > 10.0.0.49:/files /srv/nfs/files nfs
> > > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > >
> > > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > > >
> > > > 10.0.0.3:/files /mnt/nfs nfs
> > > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > > >
> > > > It is also worth noting this behaviour is not unique to the re-export
> > > > use case. I see FS-Cache not being used with the following setup:
> > > >
> > > > Source NFS Server <-- Client (with FS-Cache).
> > > >
> > >
> > > This points at something more fundamental like something missed
> > > in the test or maybe a mount option.  Can you explain what test
> > > you're doing here when you say "this behavior is not unique"?
> >
> > I've created the following diagram which explains the test I am
> > performing. I think it is a little easier to follow than explaining in
> > text. This should be viewable without any authentication:
> > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> >
> > By "this behaviour is not unique to the re-export use case" I mean
> > that the same happens if I remove the re-export server completely, and
> > just have the following setup:
> >
> > Source NFS Server <-- Client (with FS-Cache).
> >
> > > Can you show the mount options for both:
> > > - fscache filesystem on the re-export server (/var/cache/fscache)
> >
> > root@reexport:~$ mount | grep /var/cache/fscache
> > /dev/md127 on /var/cache/fscache type ext4
> > (rw,relatime,discard,nobarrier,stripe=1024)
> >
> > > - exported filesystem on the NFS server (filesystem in /etc/exports)
> >
> > I have tried both:
> >
> > root@source:~$ mount | grep files
> > /dev/sdb1 on /files type ext4 (rw)
> >
> > root@source:~$ cat /etc/exports
> > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> >
> > and (at Jeff's suggestion):
> >
> > root@source:~$ mount | grep files
> > /dev/sdb1 on /files type ext4 (rw,noatime)
> >
> > root@source:~$ cat /etc/exports
> > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> >
> >
> > > Unfortunately the problem with drop_caches makes it more difficult
> > > to know when fscache is truly working.  But some other unit test
> > > I have shows fscache does work with this patchset so I'm puzzled why
> > > you're not seeing it work at all.
> > >
> > > I pinged dhowells on the drop_caches issue so maybe we can get
> > > that one sorted out soon but I'm not sure since it's part of a series
> > > and proposes changes in mm.
> >
> > Just to be clear, I have never used drop_caches on the re-export
> > server in any of these tests. I have only ever done this on the NFS
> > Client.
> >
> > >
> > > > Thanks,
> > > > Ben
> > > >
> > > >
> > > > Kind Regards
> > > >
> > > > Benjamin Maynard
> > > >
> > > > Customer Engineer
> > > >
> > > > benmaynard@google.com
> > > >
> > > > Google, Inc.
> > > >
> > > >
> > > >
> > > >
> > > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > >
> > > > >
> > > > >
> > > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > >
> > > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > >>
> > > > > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > > >>>
> > > > > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > > >>>> wrote:
> > > > > >>>>>
> > > > > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > > > > >>>>>> APIs,
> > > > > >>>>>> but only when fscache is configured and enabled.
> > > > > >>>>>>
> > > > > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > > > > >>>>>> filled
> > > > > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > > > > >>>>>> of
> > > > > >>>>>> the functions, the main one being the issue_read() function.
> > > > > >>>>>> The issue_read() function is called by the netfs layer when a
> > > > > >>>>>> read
> > > > > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > > > > >>>>>> (either
> > > > > >>>>>> the cache is not active, or it is active but the data is not
> > > > > >>>>>> available).
> > > > > >>>>>> Once the read from the server is complete, netfs requires a call
> > > > > >>>>>> to
> > > > > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > > > > >>>>>> were
> > > > > >>>>>> read
> > > > > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > > > > >>>>>> a
> > > > > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > > > > >>>>>> and
> > > > > >>>>>> contains a start and a length (both in bytes), and assumes the
> > > > > >>>>>> underlying
> > > > > > >>>>>> netfs will return either an error on the whole region, or the
> > > > > >>>>>> number
> > > > > >>>>>> of bytes successfully read.
> > > > > >>>>>>
> > > > > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > >>>>>> defined
> > > > > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > >>>>>> to
> > > > > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > > > > >>>>>> up
> > > > > >>>>>> into underlying RPCs, each of which will have their own
> > > > > >>>>>> completion
> > > > > >>>>>> and
> > > > > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > > > > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > > > > >>>>>> initiated with issue_read() and terminated with
> > > > > >>>>>> netfs_subreq_terminated().
> > > > > > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > >>>>>> the netfs API requirement on the single response to the whole
> > > > > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > > > > >>>>>> pgio layer.
> > > > > >>>>>>
> > > > > >>>>>> The approach taken with this patch is to allocate a small
> > > > > >>>>>> structure
> > > > > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > > > > >>>>>> number
> > > > > >>>>>> of bytes successfully transferred in the structure, and update
> > > > > >>>>>> these
> > > > > >>>>>> values
> > > > > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > > > > >>>>>> a
> > > > > >>>>>> marker
> > > > > >>>>>> for the last RPC completion, is incremented in
> > > > > > >>>>>> nfs_netfs_initiate_read(),
> > > > > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > > > > >>>>>> nfs_pgio_header
> > > > > >>>>>> contains a valid pointer to the data.  On the final put (which
> > > > > >>>>>> signals
> > > > > >>>>>> the final outstanding RPC is complete) in
> > > > > >>>>>> nfs_netfs_read_completion(),
> > > > > >>>>>> call netfs_subreq_terminated() with either the final error value
> > > > > >>>>>> (if
> > > > > >>>>>> one or more READs complete with an error) or the number of bytes
> > > > > >>>>>> successfully transferred (if all RPCs complete successfully).
> > > > > >>>>>> Note
> > > > > >>>>>> that when all RPCs complete successfully, the number of bytes
> > > > > >>>>>> transferred
> > > > > >>>>>> is capped to the length of the subrequest.  Capping the
> > > > > >>>>>> transferred
> > > > > >>>>>> length
> > > > > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > > > > >>>>>> netfs.
> > > > > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > >>>>>> the
> > > > > >>>>>> corner case where NFS requests a full page at the end of the
> > > > > >>>>>> file,
> > > > > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > > > > >>>>>>
> > > > > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > > >>>>>
> > > > > >>>>>
> > > > > >>>>> This is not doing what I asked for, which was to separate out the
> > > > > >>>>> fscache functionality, so that we can call that if and when it is
> > > > > >>>>> available.
> > > > > >>>>>
> > > > > >>>> I must have misunderstood then.
> > > > > >>>>
> > > > > >>>> The last feedback I have from you was that you wanted it to be
> > > > > >>>> an opt-in feature, and it was a comment on a previous patch
> > > > > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > > > > >>>> let me try to get back on track.
> > > > > >>>>
> > > > > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > > > > >>>>> requests. As
> > > > > >>>>> it stands, that means it is just duplicating information, and
> > > > > >>>>> adding
> > > > > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > > > > >>>>> extra
> > > > > >>>>> indirect calls, and extra bloat to the inode).
> > > > > >>>>>
> > > > > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > > > > >>>> ask some clarifying questions.
> > > > > >>>>
> > > > > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > > > > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > > > > >>>> Am I right?
> > > > > >>>>
> > > > > > >>>> Also, are you objecting to the design that to use fscache we now
> > > > > >>>> have to use netfs, specifically:
> > > > > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > > > > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > > > > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > > >>>> from the cache, then NFS is called back via netfs_issue_read
> > > > > >>>> and we use the normal NFS read pageio interface.  This requires
> > > > > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > > > > >>>> which is the reason for the small changes to pagelist.c
> > > > > >>>
> > > > > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > > > > >>> NFS I/O paths.
> > > > > >>>
> > > > > >> Got it.
> > > > > >>
> > > > > >>> I'm willing to consider solutions that are specific only to the fscache
> > > > > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > > > > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > > >>> extra memory allocations, extra indirect calls and larger inode
> > > > > >>> footprints.
> > > > > >>>
> > > > > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > > > > >>> the case of 'NFS with cachefs additions'.
> > > > > >>>
> > > > > >> I agree completely.  Are you seeing extra memory allocations
> > > > > > >> happen on mounts without 'fsc' or is it more a concern of how
> > > > > >> some of the patches look?  We should not be calling any netfs or
> > > > > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > > >> testing. So either there's a misunderstanding here, or there's a
> > > > > >> bug I'm missing.
> > > > > >>
> > > > > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > > >> If it's configured but not enabled, then the checks for
> > > > > >> netfs_inode(inode)->cache should skip over any netfs code.
> > > > > >> But maybe there's a non-obvious bug you're seeing and
> > > > > >> somehow netfs is still getting called?  Because I cannot
> > > > > >> see netfs getting called if 'fsc' is not on the mount in my
> > > > > >> tests.
> > > > > >>
> > > > > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > > >> {
> > > > > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > > > > >>               return -ENOBUFS;
> > > > > >>
> > > > > >>       return netfs_read_folio(file, folio);
> > > > > >> }
> > > > > >>
> > > > > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > > >> {
> > > > > >>       struct inode *inode = ractl->mapping->host;
> > > > > >>
> > > > > >>       if (!netfs_inode(inode)->cache)
> > > > > >>               return -ENOBUFS;
> > > > > >>
> > > > > >>       netfs_readahead(ractl);
> > > > > >>       return 0;
> > > > > >> }
> > > > > >>
> > > > > >>
> > > > > >>>>
> > > > > >>>> Can you be more specific as to the portions of the patch you don't
> > > > > >>>> like
> > > > > >>>> so I can move it in the right direction?
> > > > > >>>>
> > > > > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > > > > >>>> you're
> > > > > >>>> ok with it though, since you mention "extra bloat to the inode".
> > > > > >>>> Do you object to this even though it's wrapped in an
> > > > > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > > >>>> extra size be added to nfs_inode?
> > > > > >>>>
> > > > > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > >>>>       __u64 write_io;
> > > > > >>>>       __u64 read_io;
> > > > > >>>> #ifdef CONFIG_NFS_FSCACHE
> > > > > >>>> -       struct fscache_cookie   *fscache;
> > > > > >>>> -#endif
> > > > > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > > >>>> */
> > > > > >>>> +#else
> > > > > >>>>       struct inode            vfs_inode;
> > > > > >>>> +#endif
> > > > > >>>> +
> > > > > >>>
> > > > > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > > >>> point, however for now NFS is not unconditionally opting into the netfs
> > > > > >>> project. If we're to ever do that, then I want to see streamlined code
> > > > > >>> for the standard I/O case.
> > > > > >>>
> > > > > >> Ok and understood about standard I/O case.
> > > > > >>
> > > > > >> I was thinking how we might not increase the size, but I don't think
> > > > > >> I can make it work.
> > > > > >>
> > > > > >> I thought we could change to something like the below, without an
> > > > > >> embedded struct inode:
> > > > > >>
> > > > > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > >>       __u64 write_io;
> > > > > >>       __u64 read_io;
> > > > > >> #ifdef CONFIG_NFS_FSCACHE
> > > > > >> -       struct fscache_cookie   *fscache;
> > > > > >> -#endif
> > > > > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > > >> +#else
> > > > > >>       struct inode            vfs_inode;
> > > > > >> +#endif
> > > > > >> +
> > > > > >>
> > > > > >> Then I would need to alloc/free a netfs_inode at the time of
> > > > > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > > >> macro cannot work, because it requires an embedded "struct inode"
> > > > > >> due to "container_of" use:
> > > > > >>
> > > > > >> +#ifdef CONFIG_NFS_FSCACHE
> > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > >> +{
> > > > > >> +       return &nfsi->netfs.inode;
> > > > > >> +}
> > > > > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > >> +{
> > > > > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > > >> +}
> > > > > >> +#else
> > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > >> +{
> > > > > >> +       return &nfsi->vfs_inode;
> > > > > >> +}
> > > > > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > >> {
> > > > > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > > >> }
> > > > > >> +#endif
> > > > > >>
> > > > > >>
> > > > > >
> > > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > > patch below.  What do you think?
> > > > >
> > > > > That works for me.
> > > > >
> > > > > >
> > > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > > think it's an ok idea I can try to work out what is needed across
> > > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > > case for NFS where fscache is "configured but not enabled",
> > > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > > each time, it will add up so it is worth at least a discussion.
> > > > > >
> > > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > > --- a/include/linux/netfs.h
> > > > > > +++ b/include/linux/netfs.h
> > > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > > >                                     bool was_async);
> > > > > >
> > > > > > -/*
> > > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > > - */
> > > > > > -struct netfs_inode {
> > > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > > +struct netfs_info {
> > > > > >       const struct netfs_request_ops *ops;
> > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > >       struct fscache_cookie   *cache;
> > > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > > };
> > > > > >
> > > > > > +/*
> > > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > > + */
> > > > > > +struct netfs_inode {
> > > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > > +};
> > > > > > +
> > > > > > /*
> > > > > > * Resources required to do operations on a cache.
> > > > > > */
> > > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > > *netfs_inode(struct inode *inode)
> > > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > > >                                   const struct netfs_request_ops *ops)
> > > > > > {
> > > > > > -       ctx->ops = ops;
> > > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > > +       /* FIXME: Check for NULL */
> > > > > > +       ctx->netfs->ops = ops;
> > > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > -       ctx->cache = NULL;
> > > > > > +       ctx->netfs->cache = NULL;
> > > > > > #endif
> > > > > > }
> > > > > >
> > > > > >
> > > > > >
> > > > > >>
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > > > > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > > > > >>>>
> > > > > >>>> #else /* CONFIG_NFS_FSCACHE */
> > > > > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > > >>>> *hdr) {}
> > > > > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > > >>>> *hdr) {}
> > > > > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > >>>> +{
> > > > > >>>> +       unlock_page(req->wb_page);
> > > > > >>>> +}
> > > > > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > > > > >>>> super_block *sb) {}
> > > > > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> Do you object to the below?  If so, then do you want
> > > > > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > > > > >>>>
> > > > > >>>> -- a/fs/nfs/inode.c
> > > > > >>>> +++ b/fs/nfs/inode.c
> > > > > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > > >>>> super_block *sb)
> > > > > >>>> #ifdef CONFIG_NFS_V4_2
> > > > > >>>>       nfsi->xattr_cache = NULL;
> > > > > >>>> #endif
> > > > > >>>> +       nfs_netfs_inode_init(nfsi);
> > > > > >>>> +
> > > > > >>>>       return VFS_I(nfsi);
> > > > > >>>> }
> > > > > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_i
> > > > > >>>> node);
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > > >>>> how about the below calls to netfs from nfs_read_folio and
> > > > > >>>> nfs_readahead into equivalent netfs calls?  So when
> > > > > > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > > > > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > > > > >>>> check to see if fscache is enabled on the mount, and skip
> > > > > >>>> over to satisfy what you want.  Am I understanding what you
> > > > > >>>> want?
> > > > > >>>
> > > > > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > > > > >>> is needed from the netfs code so that it can be optimised. However I'm
> > > > > >>> not interested enough in the cachefs functionality to work on that
> > > > > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > > > > >>> netfs project, once the overhead can be made to disappear.
> > > > > >>>
> > > > > >> Understood.
> > > > > >>
> > > > > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > > > > >> be done in a future patchset?
> > > > > >>
> > > > > >> For now I was equating netfs and fscache together so we can
> > > > > >> move on from the much older and single-page limiting fscache
> > > > > >> interface that is likely to go away soon.
> > > > > >>
> > > > > >>>>
> > > > > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > > >>>> folio *folio)
> > > > > >>>>       if (NFS_STALE(inode))
> > > > > >>>>               goto out_unlock;
> > > > > >>>>
> > > > > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > > > > >>>> +       if (!ret)
> > > > > >>>> +               goto out;
> > > > > >>>> +
> > > > > >>>>
> > > > > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > > >>>> *ractl)
> > > > > >>>>       if (NFS_STALE(inode))
> > > > > >>>>               goto out;
> > > > > >>>>
> > > > > >>>> +       ret = nfs_netfs_readahead(ractl);
> > > > > >>>> +       if (!ret)
> > > > > >>>> +               goto out;
> > > > > >>>> +
> > > > > >>>>
> > > > > >> The above wrappers should prevent any additional overhead when fscache
> > > > > >> is not enabled.  As far as I know these work to avoid calling netfs
> > > > > >> when 'fsc' is not on the mount.
> > > > > >>
> > > > > >>>>
> > > > > >>>> And how about these calls from different points in the read
> > > > > >>>> path to the earlier mentioned stub functions?
> > > > > >>>>
> > > > > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > > >>>>
> > > > > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > >>>> {
> > > > > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > >>>>       struct page *page = req->wb_page;
> > > > > >>>>
> > > > > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > > >>>>> s_id,
> > > > > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > > >>>> -               (long long)req_offset(req));
> > > > > >>>> -
> > > > > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > > >>>> ETIMEDOUT)
> > > > > >>>>               SetPageError(page);
> > > > > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > > >>>> -               if (PageUptodate(page))
> > > > > >>>> -                       nfs_fscache_write_page(inode, page);
> > > > > >>>> -               unlock_page(page);
> > > > > >>>> -       }
> > > > > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > >>>> +               nfs_netfs_readpage_release(req);
> > > > > >>>> +
> > > > > >>>
> > > > > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > > >>> going to need to change when we move it to use folios natively anyway.
> > > > > >>>
> > > > > >> Ok, how about I make it conditional on whether fscache is configured
> > > > > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > > > > >> nfs_netfs_readahead()?  Below is what that would look like.
> > > > > >> I could inline the code in nfs_netfs_readpage_release() if you
> > > > > >> think it would be clearer.
> > > > > >>
> > > > > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > >> {
> > > > > >>       struct page *page = req->wb_page;
> > > > > >>
> > > > > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > > >>               SetPageError(page);
> > > > > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > >> #ifndef CONFIG_NFS_FSCACHE
> > > > > >>               unlock_page(req->wb_page);
> > > > > >> #else
> > > > > >>               nfs_netfs_readpage_release(req);
> > > > > >> #endif
> > > > > >>       nfs_release_request(req);
> > > > > >> }
> > > > > >>
> > > > > >>
> > > > > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > >> {
> > > > > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > >>
> > > > > >>   /*
> > > > > >>    * If fscache is enabled, netfs will unlock pages.
> > > > > >>    */
> > > > > >>   if (netfs_inode(inode)->cache)
> > > > > >>       return;
> > > > > >>
> > > > > >>   unlock_page(req->wb_page);
> > > > > >> }
> > > > > >>
> > > > > >>
> > > > > >>>>       nfs_release_request(req);
> > > > > >>>> }
> > > > > >>>>
> > > > > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > > >>>> nfs_pgio_header *hdr)
> > > > > >>>>               nfs_list_remove_request(req);
> > > > > >>>>               nfs_readpage_release(req, error);
> > > > > >>>>       }
> > > > > >>>> +       nfs_netfs_read_completion(hdr);
> > > > > >>>> +
> > > > > >>>> out:
> > > > > >>>>       hdr->release(hdr);
> > > > > >>>> }
> > > > > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > > >>>> nfs_pgio_header *hdr,
> > > > > >>>>                             struct rpc_task_setup *task_setup_data,
> > > > > >>>> int how)
> > > > > >>>> {
> > > > > >>>>       rpc_ops->read_setup(hdr, msg);
> > > > > >>>> +       nfs_netfs_initiate_read(hdr);
> > > > > >>>>       trace_nfs_initiate_read(hdr);
> > > > > >>>> }
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> Are you ok with these additions?  Something like this would
> > > > > >>>> be required in the case of fscache configured and enabled,
> > > > > >>>> because we could have some of the data in a read in
> > > > > >>>> fscache, and some not.  That is the reason for the netfs
> > > > > >>>> design, and why we need to be able to call the normal
> > > > > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > > > > >>>> back via netfs_subreq_terminated)?
> > > > > >>>>
> > > > > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > > >>>>       struct pnfs_layout_segment *pg_lseg;
> > > > > >>>>       struct nfs_io_completion *pg_io_completion;
> > > > > >>>>       struct nfs_direct_req   *pg_dreq;
> > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > >>>> +       void                    *pg_netfs;
> > > > > >>>> +#endif
> > > > > >>>>
> > > > > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > > >>>>       const struct nfs_rw_ops *rw_ops;
> > > > > >>>>       struct nfs_io_completion *io_completion;
> > > > > >>>>       struct nfs_direct_req   *dreq;
> > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > >>>> +       void                    *netfs;
> > > > > >>>> +#endif
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> And these additions to pagelist.c?
> > > > > >>>>
> > > > > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > >>>>       hdr->good_bytes = mirror->pg_count;
> > > > > >>>>       hdr->io_completion = desc->pg_io_completion;
> > > > > >>>>       hdr->dreq = desc->pg_dreq;
> > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > >>>> +       if (desc->pg_netfs)
> > > > > >>>> +               hdr->netfs = desc->pg_netfs;
> > > > > >>>> +#endif
> > > > > >>>
> > > > > >>> Why the conditional?
> > > > > >>>
> > > > > >> Not really needed and I was thinking of removing it, so I'll do that.
> > > > > >>
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > > >>>> *desc,
> > > > > >>>>       desc->pg_lseg = NULL;
> > > > > >>>>       desc->pg_io_completion = NULL;
> > > > > >>>>       desc->pg_dreq = NULL;
> > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > >>>> +       desc->pg_netfs = NULL;
> > > > > >>>> +#endif
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > >>>>
> > > > > >>>>       desc->pg_io_completion = hdr->io_completion;
> > > > > >>>>       desc->pg_dreq = hdr->dreq;
> > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > >>>> +       desc->pg_netfs = hdr->netfs;
> > > > > >>>> +#endif
> > > > > >>>
> > > > > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > > > > >>>
> > > > > >> Ok.
> > > > > >>
> > > > > >>
> > > > > >>
> > > > > >>>>
> > > > > >>>>
> > > > > >>>>> My expectation is that the standard I/O path should have minimal
> > > > > >>>>> overhead, and should certainly not increase the overhead that we
> > > > > >>>>> already have. Will this be addressed in future iterations of these
> > > > > >>>>> patches?
> > > > > >>>>>
> > > > > >>>>
> > > > > >>>> I will do what I can to satisfy what you want, either by fixing up
> > > > > >>>> this patch or follow-on patches.  Hopefully the above questions
> > > > > >>>> will clarify the next steps.
> > > > > >>>>
> > > > > >>>
> > > > > >>> --
> > > > > >>> Trond Myklebust
> > > > > >>> Linux NFS client maintainer, Hammerspace
> > > > > >>> trond.myklebust@hammerspace.com
> > > > >
> > > > >
> > > > >
> > > > > Trond Myklebust
> > > > > CTO, Hammerspace Inc
> > > > > 1900 S Norfolk St, Suite 350 - #45
> > > > > San Mateo, CA 94403
> > > > >
> > > > > www.hammer.space
> > > > >
> > > > >
> > > >
> > >
> >
>
Daire Byrne Nov. 17, 2022, 11:03 a.m. UTC | #19
Hi,

I just wanted to take the opportunity to reiterate why these patches
are important to me (and others like Benjamin).

The "new" fscache that is now in mainline has a major NFS performance
regression from the previous fscache code in pre 5.17 kernels - single
file reads from cache.

Even if you have the fastest local disk (nvme/ssd) for your fscache,
reading back a cached file (via NFS) now tops out at around 40MB/s
whereas before (old fscache) the local fscache disk speed was the only
limit (e.g. 5000MB/s for NVMe).

So, in many cases, depending on what you are using fscache for, it can
be faster to read the file over the (gigabit) network than from the
local disk cache which somewhat negates its usefulness. As such, we
mostly use pre-5.17 kernels in production and the old fscache code
which maintains high cache read performance (but has other annoying
issues).

Now this performance regression might not be noticed too much by
desktop users looking to use fscache on their systems, but it sure
does affect servers (e.g. re-export servers) that want to use fscache
to achieve very high performance.

I can't really comment on these patches or the approach taken, but I
do hope that we can restore/improve the fscache read performance for
NFS in the mainline kernel as soon as possible (like these patches
do).

Daire


On Mon, 14 Nov 2022 at 21:26, Benjamin Maynard <benmaynard@google.com> wrote:
>
> Thanks Dave, that did the trick!
>
> Building the kernel from
> https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
> and re-running the exact same tests yielded the expected results. Data
> is now being served from /var/cache/fscache.
>
> I also reverted my change to the read ahead, so that read ahead is now
> greater than the rsize. Still works as expected.
>
> I am also seeing much better single file read speeds, and culling is
> working perfectly (not running into the issue we were seeing pre
> 5.17).
>
> Thanks a lot Dave, Jeff and Daire for your help.
>
> Kind Regards
> Benjamin Maynard
>
>
>
> Kind Regards
>
> Benjamin Maynard
>
> Customer Engineer
>
> benmaynard@google.com
>
> Google, Inc.
>
>
>
>
> On Mon, 14 Nov 2022 at 17:35, David Wysochanski <dwysocha@redhat.com> wrote:
> >
> > On Mon, Nov 14, 2022 at 11:04 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > >
> > > Hi Dave,
> > >
> > > I've added responses to your questions inline below.
> > >
> > > I also tried adding the noatime option to the mount on the source
> > > filer as Jeff suggested, but this has not made any difference and the
> > > issue is still persisting for me.
> > >
> > > I created the following diagram that explains my setup, and the exact
> > > tests I am performing:
> > > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> > >
> > > Hopefully this is clearer than my explanations below (let me know if
> > > you'd prefer me to share an alternative way).
> > >
> > Yes, that's very helpful.  Let me think about this one as I'm not sure.
> > As Jeff says we may need tracepoints to track it down if I cannot repro
> > it and/or nothing comes to mind.
> >
> > > In order to remove the re-exporting layer of complexity, I also
> > > performed the tests without the re-export server (architecture:
> > > https://drive.google.com/file/d/1DQKhqo_UnQ8ul-z5Iram5LpisDmkKziQ/view?usp=share_link):
> > >
> > > Source NFS Server <-- Client (with FS-Cache)
> > >
> > > The same is happening, I cannot get FS-Cache to serve from cache.
> > > Heavy writes, but no reads, even when the same file is copied many
> > > times.
> > >
> > I'm pretty sure that in the above you're hitting the drop_caches /
> > "fscache read optimisation" issue #1 I mentioned.
> >
> > I see dhowells just posted a v2 version of his previous patch:
> > https://lore.kernel.org/linux-mm/166844174069.1124521.10890506360974169994.stgit@warthog.procyon.org.uk/
> >
> > I started with 6.1-rc5, added the above dhowells latest patch for that issue,
> > and then my 5 patches on top.  Then I added a small patch to utilize
> > dhowells patch to ensure the read optimisation is removed.  I ran my
> > unit test that has been failing all along and as expected it passes with
> > these patches.  I pushed the series to github:
> > https://github.com/DaveWysochanskiRH/kernel/commits/nfs-fscache-netfs
> > https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
> >
> > I will also email you the series of patches on top of 6.1-rc5 so you
> > can just apply from your mailbox if you want.
> >
> >
> >
> > > Hopefully something I am doing wrong on my end, but I can't figure out what.
> > >
> > > Kind Regards
> > > Benjamin Maynard
> > >
> > >
> > > On Mon, 14 Nov 2022 at 13:47, David Wysochanski <dwysocha@redhat.com> wrote:
> > > >
> > > > I apologize; I did not read carefully enough and I missed some details
> > > > in your original post.
> > > > More below.
> > > >
> > > > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > > >
> > > > > Hi all,
> > > > >
> > > > > I've been doing some more testing with these patches, I applied all of
> > > > > the patches (v10 from
> > > > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > > > >
> > > > > I have the following setup:
> > > > >
> > > > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > > >
> > > > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > > > to the NFS Client via the Re-Export Server.
> > > > >
> > > > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > > > re-export server, and once the file copy completes I see that
> > > > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > > > >
> > > > > I then deleted that file from the NFS Client, and dropped the caches
> > > > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > > > >
> > > > If you delete the file from the NFS client, how does that not delete the
> > > > file from the original NFS server?
> > >
> > > Sorry - to be clear, I never deleted the file from the NFS mount
> > > (which I know would in turn delete it from the re-export server and
> > > the source filer).
> > >
> > > In order to perform the performance test, I copied the file from the
> > > NFS mount on the NFS Client, to a local directory (cp
> > > /mnt/nfs/500gb.img /tmp).
> > >
> > > When I said "I then deleted that file from the NFS Client", I meant I
> > > deleted the local copy of that file. Not the file on the mount (rm
> > > /tmp/500gb.img).
> > >
> > > Just to also stress, I have never dropped the caches on the Re-Export
> > > Server (the one with FS-Cache) at any point in any of these tests, so
> > > I don't think this is the problem. I have only ever dropped the caches
> > > on the NFS client that is mounting the Re-Export Server.
> > >
> > > > > I then performed another copy of the 500Gb file on the NFS Client,
> > > > > again via the Re-Export Server. What I expected would happen is that I
> > > > > would see heavy reads from the /var/cache/fscache volume as the file
> > > > > should be served from FS-Cache.
> > > > >
> > > > I don't understand this.  When you say you "performed another copy"
> > > > of what file?  Wasn't the file deleted in the above step?
> > >
> > > As above, only the local copy was deleted.
> > >
> > > > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > > > also see heavy writes to /var/cache/fscache, so it appears that
> > > > > FS-Cache is overwriting its existing cache, and never using it.
> > > >
> > > > That would happen if the file was changed or re-created.
> > > >
> > > > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > > > it is not possible that the file is being served from the page cache.
> > > > >
> > > > > We saw this behaviour before on an older set of the patches when our
> > > > > mount between the Re-Export Server and the Source NFS Filer was using
> > > > > the "sync" option, but we are now using the "async" option and the
> > > > > same is happening.
> > > > >
> > > > > Mount options:
> > > > >
> > > > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > > > >
> > > > > 10.0.0.49:/files /srv/nfs/files nfs
> > > > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > > >
> > > > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > > > >
> > > > > 10.0.0.3:/files /mnt/nfs nfs
> > > > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > > > >
> > > > > It is also worth noting this behaviour is not unique to the re-export
> > > > > use case. I see FS-Cache not being used with the following setup:
> > > > >
> > > > > Source NFS Server <-- Client (with FS-Cache).
> > > > >
> > > >
> > > > This points at something more fundamental like something missed
> > > > in the test or maybe a mount option.  Can you explain what test
> > > > you're doing here when you say "this behavior is not unique"?
> > >
> > > I've created the following diagram which explains the test I am
> > > performing. I think it is a little easier to follow than explaining in
> > > text. This should be viewable without any authentication:
> > > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> > >
> > > By "this behaviour is not unique to the re-export use case" I mean
> > > that the same happens if I remove the re-export server completely, and
> > > just have the following setup:
> > >
> > > Source NFS Server <-- Client (with FS-Cache).
> > >
> > > > Can you show the mount options for both:
> > > > - fscache filesystem on the re-export server (/var/cache/fscache)
> > >
> > > root@reexport:~$ mount | grep /var/cache/fscache
> > > /dev/md127 on /var/cache/fscache type ext4
> > > (rw,relatime,discard,nobarrier,stripe=1024)
> > >
> > > > - exported filesystem on the NFS server (filesystem in /etc/exports)
> > >
> > > I have tried both:
> > >
> > > root@source:~$ mount | grep files
> > > /dev/sdb1 on /files type ext4 (rw)
> > >
> > > root@source:~$ cat /etc/exports
> > > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> > >
> > > and (at Jeff's suggestion):
> > >
> > > root@source:~$ mount | grep files
> > > /dev/sdb1 on /files type ext4 (rw,noatime)
> > >
> > > root@source:~$ cat /etc/exports
> > > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> > >
> > >
> > > > Unfortunately the problem with drop_caches makes it more difficult
> > > > to know when fscache is truly working.  But some other unit test
> > > > I have shows fscache does work with this patchset so I'm puzzled why
> > > > you're not seeing it work at all.
> > > >
> > > > I pinged dhowells on the drop_caches issue so maybe we can get
> > > > that one sorted out soon but I'm not sure since it's part of a series
> > > > and proposes changes in mm.
> > >
> > > Just to be clear, I have never used drop_caches on the re-export
> > > server in any of these tests. I have only ever done this on the NFS
> > > Client.
> > >
> > > >
> > > > > Thanks,
> > > > > Ben
> > > > >
> > > > >
> > > > > Kind Regards
> > > > >
> > > > > Benjamin Maynard
> > > > >
> > > > > Customer Engineer
> > > > >
> > > > > benmaynard@google.com
> > > > >
> > > > > Google, Inc.
> > > > >
> > > > >
> > > > >
> > > > >
> > > > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > > >
> > > > > >
> > > > > >
> > > > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > >
> > > > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > >>
> > > > > > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > > > >>>
> > > > > > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > > > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > > > >>>> wrote:
> > > > > > >>>>>
> > > > > > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > > > > > >>>>>> APIs,
> > > > > > >>>>>> but only when fscache is configured and enabled.
> > > > > > >>>>>>
> > > > > > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > > > > > >>>>>> filled
> > > > > > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > > > > > >>>>>> of
> > > > > > >>>>>> the functions, the main one being the issue_read() function.
> > > > > > >>>>>> The issue_read() function is called by the netfs layer when a
> > > > > > >>>>>> read
> > > > > > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > > > > > >>>>>> (either
> > > > > > >>>>>> the cache is not active, or it is active but the data is not
> > > > > > >>>>>> available).
> > > > > > >>>>>> Once the read from the server is complete, netfs requires a call
> > > > > > >>>>>> to
> > > > > > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > > > > > >>>>>> were
> > > > > > >>>>>> read
> > > > > > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > > > > > >>>>>> a
> > > > > > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > > > > > >>>>>> and
> > > > > > >>>>>> contains a start and a length (both in bytes), and assumes the
> > > > > > >>>>>> underlying
> > > > > > > >>>>>> netfs will return either an error on the whole region, or the
> > > > > > >>>>>> number
> > > > > > >>>>>> of bytes successfully read.
> > > > > > >>>>>>
> > > > > > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > > >>>>>> defined
> > > > > > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > > >>>>>> to
> > > > > > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > > > > > >>>>>> up
> > > > > > >>>>>> into underlying RPCs, each of which will have their own
> > > > > > >>>>>> completion
> > > > > > >>>>>> and
> > > > > > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > > > > > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > > > > > >>>>>> initiated with issue_read() and terminated with
> > > > > > >>>>>> netfs_subreq_terminated().
> > > > > > > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > > >>>>>> the netfs API requirement on the single response to the whole
> > > > > > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > > > > > >>>>>> pgio layer.
> > > > > > >>>>>>
> > > > > > >>>>>> The approach taken with this patch is to allocate a small
> > > > > > >>>>>> structure
> > > > > > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > > > > > >>>>>> number
> > > > > > >>>>>> of bytes successfully transferred in the structure, and update
> > > > > > >>>>>> these
> > > > > > >>>>>> values
> > > > > > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > > > > > >>>>>> a
> > > > > > >>>>>> marker
> > > > > > >>>>>> for the last RPC completion, is incremented in
> > > > > > >>>>>> nfs_netfs_read_initiate(),
> > > > > > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > > > > > >>>>>> nfs_pgio_header
> > > > > > >>>>>> contains a valid pointer to the data.  On the final put (which
> > > > > > >>>>>> signals
> > > > > > >>>>>> the final outstanding RPC is complete) in
> > > > > > >>>>>> nfs_netfs_read_completion(),
> > > > > > >>>>>> call netfs_subreq_terminated() with either the final error value
> > > > > > >>>>>> (if
> > > > > > >>>>>> one or more READs complete with an error) or the number of bytes
> > > > > > >>>>>> successfully transferred (if all RPCs complete successfully).
> > > > > > >>>>>> Note
> > > > > > >>>>>> that when all RPCs complete successfully, the number of bytes
> > > > > > >>>>>> transferred
> > > > > > >>>>>> is capped to the length of the subrequest.  Capping the
> > > > > > >>>>>> transferred
> > > > > > >>>>>> length
> > > > > > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > > > > > >>>>>> netfs.
> > > > > > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > > >>>>>> the
> > > > > > >>>>>> corner case where NFS requests a full page at the end of the
> > > > > > >>>>>> file,
> > > > > > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > > > > > >>>>>>
> > > > > > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > > > >>>>>
> > > > > > >>>>>
> > > > > > >>>>> This is not doing what I asked for, which was to separate out the
> > > > > > >>>>> fscache functionality, so that we can call that if and when it is
> > > > > > >>>>> available.
> > > > > > >>>>>
> > > > > > >>>> I must have misunderstood then.
> > > > > > >>>>
> > > > > > >>>> The last feedback I have from you was that you wanted it to be
> > > > > > >>>> an opt-in feature, and it was a comment on a previous patch
> > > > > > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > > > > > >>>> let me try to get back on track.
> > > > > > >>>>
> > > > > > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > > > > > >>>>> requests. As
> > > > > > >>>>> it stands, that means it is just duplicating information, and
> > > > > > >>>>> adding
> > > > > > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > > > > > >>>>> extra
> > > > > > >>>>> indirect calls, and extra bloat to the inode).
> > > > > > >>>>>
> > > > > > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > > > > > >>>> ask some clarifying questions.
> > > > > > >>>>
> > > > > > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > > > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > > > > > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > > > > > >>>> Am I right?
> > > > > > >>>>
> > > > > > > >>>> Also, are you objecting to the design that to use fscache we now
> > > > > > >>>> have to use netfs, specifically:
> > > > > > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > > > > > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > > > > > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > > > >>>> from the cache, then NFS is called back via netfs_issue_read
> > > > > > >>>> and we use the normal NFS read pageio interface.  This requires
> > > > > > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > > > > > >>>> which is the reason for the small changes to pagelist.c
> > > > > > >>>
> > > > > > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > > > > > >>> NFS I/O paths.
> > > > > > >>>
> > > > > > >> Got it.
> > > > > > >>
> > > > > > >>> I'm willing to consider solutions that are specific only to the fscache
> > > > > > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > > > > > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > > > >>> extra memory allocations, extra indirect calls and larger inode
> > > > > > >>> footprints.
> > > > > > >>>
> > > > > > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > > > > > >>> the case of 'NFS with cachefs additions'.
> > > > > > >>>
> > > > > > >> I agree completely.  Are you seeing extra memory allocations
> > > > > > >> happen on mounts without 'fsc' or is it more a concern of how
> > > > > > >> some of the patches look?  We should not be calling any netfs or
> > > > > > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > > > >> testing. So either there's a misunderstanding here, or there's a
> > > > > > >> bug I'm missing.
> > > > > > >>
> > > > > > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > > > >> If it's configured but not enabled, then the checks for
> > > > > > >> netfs_inode(inode)->cache should skip over any netfs code.
> > > > > > >> But maybe there's a non-obvious bug you're seeing and
> > > > > > >> somehow netfs is still getting called?  Because I cannot
> > > > > > >> see netfs getting called if 'fsc' is not on the mount in my
> > > > > > >> tests.
> > > > > > >>
> > > > > > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > > > >> {
> > > > > > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > > > > > >>               return -ENOBUFS;
> > > > > > >>
> > > > > > >>       return netfs_read_folio(file, folio);
> > > > > > >> }
> > > > > > >>
> > > > > > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > > > >> {
> > > > > > >>       struct inode *inode = ractl->mapping->host;
> > > > > > >>
> > > > > > >>       if (!netfs_inode(inode)->cache)
> > > > > > >>               return -ENOBUFS;
> > > > > > >>
> > > > > > >>       netfs_readahead(ractl);
> > > > > > >>       return 0;
> > > > > > >> }
> > > > > > >>
> > > > > > >>
> > > > > > >>>>
> > > > > > >>>> Can you be more specific as to the portions of the patch you don't
> > > > > > >>>> like
> > > > > > >>>> so I can move it in the right direction?
> > > > > > >>>>
> > > > > > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > > > > > >>>> you're
> > > > > > >>>> ok with it though, since you mention "extra bloat to the inode".
> > > > > > >>>> Do you object to this even though it's wrapped in an
> > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > > > >>>> extra size be added to nfs_inode?
> > > > > > >>>>
> > > > > > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > >>>>       __u64 write_io;
> > > > > > >>>>       __u64 read_io;
> > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE
> > > > > > >>>> -       struct fscache_cookie   *fscache;
> > > > > > >>>> -#endif
> > > > > > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > > > >>>> */
> > > > > > >>>> +#else
> > > > > > >>>>       struct inode            vfs_inode;
> > > > > > >>>> +#endif
> > > > > > >>>> +
> > > > > > >>>
> > > > > > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > > > >>> point, however for now NFS is not unconditionally opting into the netfs
> > > > > > >>> project. If we're to ever do that, then I want to see streamlined code
> > > > > > >>> for the standard I/O case.
> > > > > > >>>
> > > > > > >> Ok and understood about standard I/O case.
> > > > > > >>
> > > > > > >> I was thinking how we might not increase the size, but I don't think
> > > > > > >> I can make it work.
> > > > > > >>
> > > > > > >> I thought we could change to something like the below, without an
> > > > > > >> embedded struct inode:
> > > > > > >>
> > > > > > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > >>       __u64 write_io;
> > > > > > >>       __u64 read_io;
> > > > > > >> #ifdef CONFIG_NFS_FSCACHE
> > > > > > >> -       struct fscache_cookie   *fscache;
> > > > > > >> -#endif
> > > > > > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > > > >> +#else
> > > > > > >>       struct inode            vfs_inode;
> > > > > > >> +#endif
> > > > > > >> +
> > > > > > >>
> > > > > > >> Then I would need to alloc/free a netfs_inode at the time of
> > > > > > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > > > >> macro cannot work, because it requires an embedded "struct inode"
> > > > > > >> due to "container_of" use:
> > > > > > >>
> > > > > > >> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > >> +{
> > > > > > >> +       return &nfsi->netfs.inode;
> > > > > > >> +}
> > > > > > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > >> +{
> > > > > > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > > > >> +}
> > > > > > >> +#else
> > > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > >> +{
> > > > > > >> +       return &nfsi->vfs_inode;
> > > > > > >> +}
> > > > > > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > >> {
> > > > > > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > > > >> }
> > > > > > >> +#endif
> > > > > > >>
> > > > > > >>
> > > > > > >
> > > > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > > > patch below.  What do you think?
> > > > > >
> > > > > > That works for me.
> > > > > >
> > > > > > >
> > > > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > > > think it's an ok idea I can try to work out what is needed across
> > > > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > > > case for NFS where fscache is "configured but not enabled",
> > > > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > > > each time, it will add up so it is worth at least a discussion.
> > > > > > >
> > > > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > > > --- a/include/linux/netfs.h
> > > > > > > +++ b/include/linux/netfs.h
> > > > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > > > >                                     bool was_async);
> > > > > > >
> > > > > > > -/*
> > > > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > > > - */
> > > > > > > -struct netfs_inode {
> > > > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > > > +struct netfs_info {
> > > > > > >       const struct netfs_request_ops *ops;
> > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > >       struct fscache_cookie   *cache;
> > > > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > > > };
> > > > > > >
> > > > > > > +/*
> > > > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > > > + */
> > > > > > > +struct netfs_inode {
> > > > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > > > +};
> > > > > > > +
> > > > > > > /*
> > > > > > > * Resources required to do operations on a cache.
> > > > > > > */
> > > > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > > > *netfs_inode(struct inode *inode)
> > > > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > > > >                                   const struct netfs_request_ops *ops)
> > > > > > > {
> > > > > > > -       ctx->ops = ops;
> > > > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > > > +       /* FIXME: Check for NULL */
> > > > > > > +       ctx->netfs->ops = ops;
> > > > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > > -       ctx->cache = NULL;
> > > > > > > +       ctx->netfs->cache = NULL;
> > > > > > > #endif
> > > > > > > }
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >>
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > > > > > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > > > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > > > > > >>>>
> > > > > > >>>> #else /* CONFIG_NFS_FSCACHE */
> > > > > > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > > > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > > > >>>> *hdr) {}
> > > > > > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > > > >>>> *hdr) {}
> > > > > > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > >>>> +{
> > > > > > >>>> +       unlock_page(req->wb_page);
> > > > > > >>>> +}
> > > > > > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > > > > > >>>> super_block *sb) {}
> > > > > > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> Do you object to the below?  If so, then do you want
> > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > > > > > >>>>
> > > > > > >>>> -- a/fs/nfs/inode.c
> > > > > > >>>> +++ b/fs/nfs/inode.c
> > > > > > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > > > >>>> super_block *sb)
> > > > > > >>>> #ifdef CONFIG_NFS_V4_2
> > > > > > >>>>       nfsi->xattr_cache = NULL;
> > > > > > >>>> #endif
> > > > > > >>>> +       nfs_netfs_inode_init(nfsi);
> > > > > > >>>> +
> > > > > > >>>>       return VFS_I(nfsi);
> > > > > > >>>> }
> > > > > > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_inode);
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > > > >>>> how about the below calls to netfs from nfs_read_folio and
> > > > > > >>>> nfs_readahead into equivalent netfs calls?  So when
> > > > > > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > > > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > > > > > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > > > > > >>>> check to see if fscache is enabled on the mount, and skip
> > > > > > >>>> over to satisfy what you want.  Am I understanding what you
> > > > > > >>>> want?
> > > > > > >>>
> > > > > > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > > > > > >>> is needed from the netfs code so that it can be optimised. However I'm
> > > > > > >>> not interested enough in the cachefs functionality to work on that
> > > > > > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > > > > > >>> netfs project, once the overhead can be made to disappear.
> > > > > > >>>
> > > > > > >> Understood.
> > > > > > >>
> > > > > > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > > > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > > > > > >> be done in a future patchset?
> > > > > > >>
> > > > > > >> For now I was equating netfs and fscache together so we can
> > > > > > >> move on from the much older and single-page limiting fscache
> > > > > > >> interface that is likely to go away soon.
> > > > > > >>
> > > > > > >>>>
> > > > > > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > > > >>>> folio *folio)
> > > > > > >>>>       if (NFS_STALE(inode))
> > > > > > >>>>               goto out_unlock;
> > > > > > >>>>
> > > > > > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > > > > > >>>> +       if (!ret)
> > > > > > >>>> +               goto out;
> > > > > > >>>> +
> > > > > > >>>>
> > > > > > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > > > >>>> *ractl)
> > > > > > >>>>       if (NFS_STALE(inode))
> > > > > > >>>>               goto out;
> > > > > > >>>>
> > > > > > >>>> +       ret = nfs_netfs_readahead(ractl);
> > > > > > >>>> +       if (!ret)
> > > > > > >>>> +               goto out;
> > > > > > >>>> +
> > > > > > >>>>
> > > > > > >> The above wrappers should prevent any additional overhead when fscache
> > > > > > >> is not enabled.  As far as I know these work to avoid calling netfs
> > > > > > >> when 'fsc' is not on the mount.
> > > > > > >>
> > > > > > >>>>
> > > > > > >>>> And how about these calls from different points in the read
> > > > > > >>>> path to the earlier mentioned stub functions?
> > > > > > >>>>
> > > > > > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > > > >>>>
> > > > > > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > >>>> {
> > > > > > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > >>>>       struct page *page = req->wb_page;
> > > > > > >>>>
> > > > > > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > > > >>>>> s_id,
> > > > > > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > > > >>>> -               (long long)req_offset(req));
> > > > > > >>>> -
> > > > > > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > > > >>>> ETIMEDOUT)
> > > > > > >>>>               SetPageError(page);
> > > > > > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > > > >>>> -               if (PageUptodate(page))
> > > > > > >>>> -                       nfs_fscache_write_page(inode, page);
> > > > > > >>>> -               unlock_page(page);
> > > > > > >>>> -       }
> > > > > > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > >>>> +               nfs_netfs_readpage_release(req);
> > > > > > >>>> +
> > > > > > >>>
> > > > > > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > > > >>> going to need to change when we move it to use folios natively anyway.
> > > > > > >>>
> > > > > > >> Ok, how about I make it conditional on whether fscache is configured
> > > > > > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > > > > > >> nfs_netfs_readahead()?  Below is what that would look like.
> > > > > > >> I could inline the code in nfs_netfs_readpage_release() if you
> > > > > > >> think it would be clearer.
> > > > > > >>
> > > > > > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > >> {
> > > > > > >>       struct page *page = req->wb_page;
> > > > > > >>
> > > > > > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > > > >>               SetPageError(page);
> > > > > > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > >> #ifndef CONFIG_NFS_FSCACHE
> > > > > > >>               unlock_page(req->wb_page);
> > > > > > >> #else
> > > > > > >>               nfs_netfs_readpage_release(req);
> > > > > > >> #endif
> > > > > > >>       nfs_release_request(req);
> > > > > > >> }
> > > > > > >>
> > > > > > >>
> > > > > > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > >> {
> > > > > > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > >>
> > > > > > >>   /*
> > > > > > >>    * If fscache is enabled, netfs will unlock pages.
> > > > > > >>    */
> > > > > > >>   if (netfs_inode(inode)->cache)
> > > > > > >>       return;
> > > > > > >>
> > > > > > >>   unlock_page(req->wb_page);
> > > > > > >> }
> > > > > > >>
> > > > > > >>
> > > > > > >>>>       nfs_release_request(req);
> > > > > > >>>> }
> > > > > > >>>>
> > > > > > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > > > >>>> nfs_pgio_header *hdr)
> > > > > > >>>>               nfs_list_remove_request(req);
> > > > > > >>>>               nfs_readpage_release(req, error);
> > > > > > >>>>       }
> > > > > > >>>> +       nfs_netfs_read_completion(hdr);
> > > > > > >>>> +
> > > > > > >>>> out:
> > > > > > >>>>       hdr->release(hdr);
> > > > > > >>>> }
> > > > > > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > > > >>>> nfs_pgio_header *hdr,
> > > > > > >>>>                             struct rpc_task_setup *task_setup_data,
> > > > > > >>>> int how)
> > > > > > >>>> {
> > > > > > >>>>       rpc_ops->read_setup(hdr, msg);
> > > > > > >>>> +       nfs_netfs_initiate_read(hdr);
> > > > > > >>>>       trace_nfs_initiate_read(hdr);
> > > > > > >>>> }
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> Are you ok with these additions?  Something like this would
> > > > > > >>>> be required in the case of fscache configured and enabled,
> > > > > > >>>> because we could have some of the data in a read in
> > > > > > >>>> fscache, and some not.  That is the reason for the netfs
> > > > > > >>>> design, and why we need to be able to call the normal
> > > > > > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > > > > > >>>> back via netfs_subreq_terminated)?
> > > > > > >>>>
> > > > > > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > > > >>>>       struct pnfs_layout_segment *pg_lseg;
> > > > > > >>>>       struct nfs_io_completion *pg_io_completion;
> > > > > > >>>>       struct nfs_direct_req   *pg_dreq;
> > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > >>>> +       void                    *pg_netfs;
> > > > > > >>>> +#endif
> > > > > > >>>>
> > > > > > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > > > >>>>       const struct nfs_rw_ops *rw_ops;
> > > > > > >>>>       struct nfs_io_completion *io_completion;
> > > > > > >>>>       struct nfs_direct_req   *dreq;
> > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > >>>> +       void                    *netfs;
> > > > > > >>>> +#endif
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> And these additions to pagelist.c?
> > > > > > >>>>
> > > > > > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > > >>>>       hdr->good_bytes = mirror->pg_count;
> > > > > > >>>>       hdr->io_completion = desc->pg_io_completion;
> > > > > > >>>>       hdr->dreq = desc->pg_dreq;
> > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > >>>> +       if (desc->pg_netfs)
> > > > > > >>>> +               hdr->netfs = desc->pg_netfs;
> > > > > > >>>> +#endif
> > > > > > >>>
> > > > > > >>> Why the conditional?
> > > > > > >>>
> > > > > > >> Not really needed and I was thinking of removing it, so I'll do that.
> > > > > > >>
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > > > >>>> *desc,
> > > > > > >>>>       desc->pg_lseg = NULL;
> > > > > > >>>>       desc->pg_io_completion = NULL;
> > > > > > >>>>       desc->pg_dreq = NULL;
> > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > >>>> +       desc->pg_netfs = NULL;
> > > > > > >>>> +#endif
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > > >>>>
> > > > > > >>>>       desc->pg_io_completion = hdr->io_completion;
> > > > > > >>>>       desc->pg_dreq = hdr->dreq;
> > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > >>>> +       desc->pg_netfs = hdr->netfs;
> > > > > > >>>> +#endif
> > > > > > >>>
> > > > > > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > > > > > >>>
> > > > > > >> Ok.
> > > > > > >>
> > > > > > >>
> > > > > > >>
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>>> My expectation is that the standard I/O path should have minimal
> > > > > > >>>>> overhead, and should certainly not increase the overhead that we
> > > > > > >>>>> already have. Will this be addressed in future iterations of these
> > > > > > >>>>> patches?
> > > > > > >>>>>
> > > > > > >>>>
> > > > > > >>>> I will do what I can to satisfy what you want, either by fixing up
> > > > > > >>>> this patch or follow-on patches.  Hopefully the above questions
> > > > > > >>>> will clarify the next steps.
> > > > > > >>>>
> > > > > > >>>
> > > > > > >>> --
> > > > > > >>> Trond Myklebust
> > > > > > >>> Linux NFS client maintainer, Hammerspace
> > > > > > >>> trond.myklebust@hammerspace.com
> > > > > >
> > > > > >
> > > > > >
> > > > > > Trond Myklebust
> > > > > > CTO, Hammerspace Inc
> > > > > > 1900 S Norfolk St, Suite 350 - #45
> > > > > > San Mateo, CA 94403
> > > > > >
> > > > > > www.hammer.space
> > > > > >
> > > > > >
> > > > >
> > > >
> > >
> >
Benjamin Maynard Jan. 3, 2023, 8:33 p.m. UTC | #20
Hi all,

I just wanted to follow up on this set of patches. As Daire explained
below, these patches are really important to a number of us using
FS-Cache due to the significant performance regression introduced in
5.17 and above.

I'd love to see these patches merged, or some feedback on what changes
might be needed.

Kind Regards
Benjamin Maynard

On Thu, 17 Nov 2022 at 11:04, Daire Byrne <daire@dneg.com> wrote:
>
> Hi,
>
> I just wanted to take the opportunity to reiterate why these patches
> are important to me (and others like Benjamin).
>
> The "new" fscache that is now in mainline has a major NFS performance
> regression from the previous fscache code in pre 5.17 kernels - single
> file reads from cache.
>
> Even if you have the fastest local disk (nvme/ssd) for your fscache,
> reading back a cached file (via NFS) now tops out at around 40MB/s
> whereas before (old fscache) the local fscache disk speed was the only
> limit (e.g. 5000MB/s for NVMe).
>
> So, in many cases, depending on what you are using fscache for, it can
> be faster to read the file over the (gigabit) network than from the
> local disk cache which somewhat negates its usefulness. As such, we
> mostly use pre-5.17 kernels in production and the old fscache code
> which maintains high cache read performance (but has other annoying
> issues).
>
> Now this performance regression might not be noticed too much by
> desktop users looking to use fscache on their systems, but it sure
> does affect servers (e.g. re-export servers) that want to use fscache
> to achieve very high performance.
>
> I can't really comment on these patches or the approach taken, but I
> do hope that we can restore/improve the fscache read performance for
> NFS in the mainline kernel as soon as possible (like these patches
> do).
>
> Daire
>
>
> On Mon, 14 Nov 2022 at 21:26, Benjamin Maynard <benmaynard@google.com> wrote:
> >
> > Thanks Dave, that did the trick!
> >
> > Building the kernel from
> > https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
> > and re-running the exact same tests yielded the expected results. Data
> > is now being served from /var/cache/fscache.
> >
> > I also reverted my change to the read ahead, so that read ahead is now
> > greater than the rsize. Still works as expected.
> >
> > I am also seeing much better single file read speeds, and culling is
> > working perfectly (not running into the issue we were seeing pre
> > 5.17).
> >
> > Thanks a lot Dave, Jeff and Daire for your help.
> >
> > Kind Regards
> > Benjamin Maynard
> >
> >
> >
> > Kind Regards
> >
> > Benjamin Maynard
> >
> > Customer Engineer
> >
> > benmaynard@google.com
> >
> > Google, Inc.
> >
> >
> >
> >
> > On Mon, 14 Nov 2022 at 17:35, David Wysochanski <dwysocha@redhat.com> wrote:
> > >
> > > On Mon, Nov 14, 2022 at 11:04 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > >
> > > > Hi Dave,
> > > >
> > > > I've added responses to your questions inline below.
> > > >
> > > > I also tried adding the noatime option to the mount on the source
> > > > filer as Jeff suggested, but this has not made any difference and the
> > > > issue is still persisting for me.
> > > >
> > > > I created the following diagram that explains my setup, and the exact
> > > > tests I am performing:
> > > > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> > > >
> > > > Hopefully this is clearer than my explanations below (let me know if
> > > > you'd prefer me to share an alternative way).
> > > >
> > > Yes, that's very helpful.  Let me think about this one as I'm not sure.
> > > As Jeff says we may need tracepoints to track it down if I cannot repro
> > > it and/or nothing comes to mind.
> > >
> > > > In order to remove the re-exporting layer of complexity, I also
> > > > performed the tests without the re-export server (architecture:
> > > > https://drive.google.com/file/d/1DQKhqo_UnQ8ul-z5Iram5LpisDmkKziQ/view?usp=share_link):
> > > >
> > > > Source NFS Server <-- Client (with FS-Cache)
> > > >
> > > > The same is happening, I cannot get FS-Cache to serve from cache.
> > > > Heavy writes, but no reads, even when the same file is copied many
> > > > times.
> > > >
> > > > I'm pretty sure that in the above you're hitting the drop_caches /
> > > "fscache read optimisation" issue #1 I mentioned.
> > >
> > > I see dhowells just posted a v2 version of his previous patch:
> > > https://lore.kernel.org/linux-mm/166844174069.1124521.10890506360974169994.stgit@warthog.procyon.org.uk/
> > >
> > > I started with 6.1-rc5, added the above dhowells latest patch for that issue,
> > > and then my 5 patches on top.  Then I added a small patch to utilize
> > > dhowells patch to ensure the read optimisation is removed.  I ran my
> > > unit test that has been failing all along and as expected it passes with
> > > these patches.  I pushed the series to github:
> > > https://github.com/DaveWysochanskiRH/kernel/commits/nfs-fscache-netfs
> > > https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
> > >
> > > I will also email you the series of patches on top of 6.1-rc5 so you
> > > can just apply from your mailbox if you want.
> > >
> > >
> > >
> > > > > Hopefully it is something I am doing wrong on my end, but I can't figure out what.
> > > >
> > > > Kind Regards
> > > > Benjamin Maynard
> > > >
> > > >
> > > > On Mon, 14 Nov 2022 at 13:47, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > >
> > > > > I apologize I did not read carefully enough and I missed some details
> > > > > in your original post.
> > > > > More below.
> > > > >
> > > > > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > > > >
> > > > > > Hi all,
> > > > > >
> > > > > > I've been doing some more testing with these patches, I applied all of
> > > > > > the patches (v10 from
> > > > > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > > > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > > > > >
> > > > > > I have the following setup:
> > > > > >
> > > > > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > > > >
> > > > > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > > > > to the NFS Client via the Re-Export Server.
> > > > > >
> > > > > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > > > > re-export server, and once the file copy completes I see that
> > > > > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > > > > >
> > > > > > I then deleted that file from the NFS Client, and dropped the caches
> > > > > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > > > > >
> > > > > If you delete the file from the NFS client, how does that not delete the
> > > > > file from the original NFS server?
> > > >
> > > > Sorry - to be clear, I never deleted the file from the NFS mount
> > > > (which I know would in turn delete it from the re-export server and
> > > > the source filer).
> > > >
> > > > In order to perform the performance test, I copied the file from the
> > > > NFS mount on the NFS Client, to a local directory (cp
> > > > /mnt/nfs/500gb.img /tmp).
> > > >
> > > > When I said "I then deleted that file from the NFS Client", I meant I
> > > > deleted the local copy of that file. Not the file on the mount (rm
> > > > /tmp/500gb.img).
> > > >
> > > > Just to also stress, I have never dropped the caches on the Re-Export
> > > > Server (the one with FS-Cache) at any point in any of these tests, so
> > > > I don't think this is the problem. I have only ever dropped the caches
> > > > on the NFS client that is mounting the Re-Export Server.
> > > >
> > > > > > I then performed another copy of the 500Gb file on the NFS Client,
> > > > > > again via the Re-Export Server. What I expected would happen is that I
> > > > > > would see heavy reads from the /var/cache/fscache volume as the file
> > > > > > should be served from FS-Cache.
> > > > > >
> > > > > I don't understand this.  When you say you "performed another copy"
> > > > > of what file?  Wasn't the file deleted in the above step?
> > > >
> > > > As above, only the local copy was deleted.
> > > >
> > > > > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > > > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > > > > also see heavy writes to /var/cache/fscache, so it appears that
> > > > > > FS-Cache is overwriting its existing cache, and never using it.
> > > > >
> > > > > That would happen if the file was changed or re-created.
> > > > >
> > > > > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > > > > it is not possible that the file is being served from the page cache.
> > > > > >
> > > > > > We saw this behaviour before on an older set of the patches when our
> > > > > > mount between the Re-Export Server and the Source NFS Filer was using
> > > > > > the "sync" option, but we are now using the "async" option and the
> > > > > > same is happening.
> > > > > >
> > > > > > Mount options:
> > > > > >
> > > > > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > > > > >
> > > > > > 10.0.0.49:/files /srv/nfs/files nfs
> > > > > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > > > >
> > > > > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > > > > >
> > > > > > 10.0.0.3:/files /mnt/nfs nfs
> > > > > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > > > > >
> > > > > > It is also worth noting this behaviour is not unique to the re-export
> > > > > > use case. I see FS-Cache not being used with the following setup:
> > > > > >
> > > > > > Source NFS Server <-- Client (with FS-Cache).
> > > > > >
> > > > >
> > > > > This points at something more fundamental like something missed
> > > > > in the test or maybe a mount option.  Can you explain what test
> > > > > you're doing here when you say "this behavior is not unique"?
> > > >
> > > > I've created the following diagram which explains the test I am
> > > > performing. I think it is a little easier to follow than explaining in
> > > > text. This should be viewable without any authentication:
> > > > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> > > >
> > > > By "this behaviour is not unique to the re-export use case" I mean
> > > > that the same happens if I remove the re-export server completely, and
> > > > just have the following setup:
> > > >
> > > > Source NFS Server <-- Client (with FS-Cache).
> > > >
> > > > > Can you show the mount options for both:
> > > > > - fscache filesystem on the re-export server (/var/cache/fscache)
> > > >
> > > > root@reexport:~$ mount | grep /var/cache/fscache
> > > > /dev/md127 on /var/cache/fscache type ext4
> > > > (rw,relatime,discard,nobarrier,stripe=1024)
> > > >
> > > > > - exported filesystem on the NFS server (filesystem in /etc/exports)
> > > >
> > > > I have tried both:
> > > >
> > > > root@source:~$ mount | grep files
> > > > /dev/sdb1 on /files type ext4 (rw)
> > > >
> > > > root@source:~$ cat /etc/exports
> > > > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> > > >
> > > > and (at Jeff's suggestion):
> > > >
> > > > root@source:~$ mount | grep files
> > > > /dev/sdb1 on /files type ext4 (rw,noatime)
> > > >
> > > > root@source:~$ cat /etc/exports
> > > > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> > > >
> > > >
> > > > > Unfortunately the problem with drop_caches makes it more difficult
> > > > > to know when fscache is truly working.  But some other unit test
> > > > > I have shows fscache does work with this patchset so I'm puzzled why
> > > > > you're not seeing it work at all.
> > > > >
> > > > > I pinged dhowells on the drop_caches issue so maybe we can get
> > > > > that one sorted out soon but I'm not sure since it's part of a series
> > > > > and proposes changes in mm.
> > > >
> > > > Just to be clear, I have never used drop_caches on the re-export
> > > > server in any of these tests. I have only ever done this on the NFS
> > > > Client.
> > > >
> > > > >
> > > > > > Thanks,
> > > > > > Ben
> > > > > >
> > > > > >
> > > > > > Kind Regards
> > > > > >
> > > > > > Benjamin Maynard
> > > > > >
> > > > > > Customer Engineer
> > > > > >
> > > > > > benmaynard@google.com
> > > > > >
> > > > > > Google, Inc.
> > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > > >>
> > > > > > > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > > > > >>>
> > > > > > > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > > > > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > > > > >>>> wrote:
> > > > > > > >>>>>
> > > > > > > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > > > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > > > > > > >>>>>> APIs,
> > > > > > > >>>>>> but only when fscache is configured and enabled.
> > > > > > > >>>>>>
> > > > > > > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > > > > > > >>>>>> filled
> > > > > > > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > > > > > > >>>>>> of
> > > > > > > >>>>>> the functions, the main one being the issue_read() function.
> > > > > > > >>>>>> The issue_read() function is called by the netfs layer when a
> > > > > > > >>>>>> read
> > > > > > > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > > > > > > >>>>>> (either
> > > > > > > >>>>>> the cache is not active, or it is active but the data is not
> > > > > > > >>>>>> available).
> > > > > > > >>>>>> Once the read from the server is complete, netfs requires a call
> > > > > > > >>>>>> to
> > > > > > > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > > > > > > >>>>>> were
> > > > > > > >>>>>> read
> > > > > > > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > > > > > > >>>>>> a
> > > > > > > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > > > > > > >>>>>> and
> > > > > > > >>>>>> contains a start and a length (both in bytes), and assumes the
> > > > > > > >>>>>> underlying
> > > > > > > > >>>>>> netfs will return either an error on the whole region, or the
> > > > > > > >>>>>> number
> > > > > > > >>>>>> of bytes successfully read.
> > > > > > > >>>>>>
> > > > > > > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > > > >>>>>> defined
> > > > > > > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > > > >>>>>> to
> > > > > > > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > > > > > > >>>>>> up
> > > > > > > >>>>>> into underlying RPCs, each of which will have their own
> > > > > > > >>>>>> completion
> > > > > > > >>>>>> and
> > > > > > > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > > > > > > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > > > > > > >>>>>> initiated with issue_read() and terminated with
> > > > > > > >>>>>> netfs_subreq_terminated().
> > > > > > > > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > > > >>>>>> the netfs API requirement on the single response to the whole
> > > > > > > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > > > > > > >>>>>> pgio layer.
> > > > > > > >>>>>>
> > > > > > > >>>>>> The approach taken with this patch is to allocate a small
> > > > > > > >>>>>> structure
> > > > > > > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > > > > > > >>>>>> number
> > > > > > > >>>>>> of bytes successfully transferred in the structure, and update
> > > > > > > >>>>>> these
> > > > > > > >>>>>> values
> > > > > > > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > > > > > > >>>>>> a
> > > > > > > >>>>>> marker
> > > > > > > >>>>>> for the last RPC completion, is incremented in
> > > > > > > >>>>>> nfs_netfs_read_initiate(),
> > > > > > > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > > > > > > >>>>>> nfs_pgio_header
> > > > > > > >>>>>> contains a valid pointer to the data.  On the final put (which
> > > > > > > >>>>>> signals
> > > > > > > >>>>>> the final outstanding RPC is complete) in
> > > > > > > >>>>>> nfs_netfs_read_completion(),
> > > > > > > >>>>>> call netfs_subreq_terminated() with either the final error value
> > > > > > > >>>>>> (if
> > > > > > > >>>>>> one or more READs complete with an error) or the number of bytes
> > > > > > > >>>>>> successfully transferred (if all RPCs complete successfully).
> > > > > > > >>>>>> Note
> > > > > > > >>>>>> that when all RPCs complete successfully, the number of bytes
> > > > > > > >>>>>> transferred
> > > > > > > >>>>>> is capped to the length of the subrequest.  Capping the
> > > > > > > >>>>>> transferred
> > > > > > > >>>>>> length
> > > > > > > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > > > > > > >>>>>> netfs.
> > > > > > > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > > > >>>>>> the
> > > > > > > >>>>>> corner case where NFS requests a full page at the end of the
> > > > > > > >>>>>> file,
> > > > > > > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > > > > > > >>>>>>
> > > > > > > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > > > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > >>>>>
> > > > > > > >>>>>
> > > > > > > >>>>> This is not doing what I asked for, which was to separate out the
> > > > > > > >>>>> fscache functionality, so that we can call that if and when it is
> > > > > > > >>>>> available.
> > > > > > > >>>>>
> > > > > > > >>>> I must have misunderstood then.
> > > > > > > >>>>
> > > > > > > >>>> The last feedback I have from you was that you wanted it to be
> > > > > > > >>>> an opt-in feature, and it was a comment on a previous patch
> > > > > > > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > > > > > > >>>> let me try to get back on track.
> > > > > > > >>>>
> > > > > > > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > > > > > > >>>>> requests. As
> > > > > > > >>>>> it stands, that means it is just duplicating information, and
> > > > > > > >>>>> adding
> > > > > > > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > > > > > > >>>>> extra
> > > > > > > >>>>> indirect calls, and extra bloat to the inode).
> > > > > > > >>>>>
> > > > > > > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > > > > > > >>>> ask some clarifying questions.
> > > > > > > >>>>
> > > > > > > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > > > > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > > > > > > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > > > > > > >>>> Am I right?
> > > > > > > >>>>
> > > > > > > > >>>> Also, are you objecting to the design that to use fscache we now
> > > > > > > >>>> have to use netfs, specifically:
> > > > > > > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > > > > > > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > > > > > > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > > > > >>>> from the cache, then NFS is called back via netfs_issue_read
> > > > > > > >>>> and we use the normal NFS read pageio interface.  This requires
> > > > > > > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > > > > > > >>>> which is the reason for the small changes to pagelist.c
> > > > > > > >>>
> > > > > > > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > > > > > > >>> NFS I/O paths.
> > > > > > > >>>
> > > > > > > >> Got it.
> > > > > > > >>
> > > > > > > >>> I'm willing to consider solutions that are specific only to the fscache
> > > > > > > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > > > > > > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > > > > >>> extra memory allocations, extra indirect calls and larger inode
> > > > > > > >>> footprints.
> > > > > > > >>>
> > > > > > > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > > > > > > >>> the case of 'NFS with cachefs additions'.
> > > > > > > >>>
> > > > > > > >> I agree completely.  Are you seeing extra memory allocations
> > > > > > > > >> happen on mounts without 'fsc' or is it more a concern of how
> > > > > > > >> some of the patches look?  We should not be calling any netfs or
> > > > > > > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > > > > >> testing. So either there's a misunderstanding here, or there's a
> > > > > > > >> bug I'm missing.
> > > > > > > >>
> > > > > > > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > > > > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > > > > >> If it's configured but not enabled, then the checks for
> > > > > > > >> netfs_inode(inode)->cache should skip over any netfs code.
> > > > > > > >> But maybe there's a non-obvious bug you're seeing and
> > > > > > > >> somehow netfs is still getting called?  Because I cannot
> > > > > > > >> see netfs getting called if 'fsc' is not on the mount in my
> > > > > > > >> tests.
> > > > > > > >>
> > > > > > > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > > > > >> {
> > > > > > > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > > > > > > >>               return -ENOBUFS;
> > > > > > > >>
> > > > > > > >>       return netfs_read_folio(file, folio);
> > > > > > > >> }
> > > > > > > >>
> > > > > > > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > > > > >> {
> > > > > > > >>       struct inode *inode = ractl->mapping->host;
> > > > > > > >>
> > > > > > > >>       if (!netfs_inode(inode)->cache)
> > > > > > > >>               return -ENOBUFS;
> > > > > > > >>
> > > > > > > >>       netfs_readahead(ractl);
> > > > > > > >>       return 0;
> > > > > > > >> }
> > > > > > > >>
> > > > > > > >>
> > > > > > > >>>>
> > > > > > > >>>> Can you be more specific as to the portions of the patch you don't
> > > > > > > >>>> like
> > > > > > > >>>> so I can move it in the right direction?
> > > > > > > >>>>
> > > > > > > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > > > > > > >>>> you're
> > > > > > > >>>> ok with it though, since you mention "extra bloat to the inode".
> > > > > > > >>>> Do you object to this even though it's wrapped in an
> > > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > > > > >>>> extra size be added to nfs_inode?
> > > > > > > >>>>
> > > > > > > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > >>>>       __u64 write_io;
> > > > > > > >>>>       __u64 read_io;
> > > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE
> > > > > > > >>>> -       struct fscache_cookie   *fscache;
> > > > > > > >>>> -#endif
> > > > > > > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > > > > >>>> */
> > > > > > > >>>> +#else
> > > > > > > >>>>       struct inode            vfs_inode;
> > > > > > > >>>> +#endif
> > > > > > > >>>> +
> > > > > > > >>>
> > > > > > > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > > > > >>> point, however for now NFS is not unconditionally opting into the netfs
> > > > > > > >>> project. If we're to ever do that, then I want to see streamlined code
> > > > > > > >>> for the standard I/O case.
> > > > > > > >>>
> > > > > > > >> Ok and understood about standard I/O case.
> > > > > > > >>
> > > > > > > >> I was thinking how we might not increase the size, but I don't think
> > > > > > > >> I can make it work.
> > > > > > > >>
> > > > > > > >> I thought we could change to something like the below, without an
> > > > > > > >> embedded struct inode:
> > > > > > > >>
> > > > > > > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > >>       __u64 write_io;
> > > > > > > >>       __u64 read_io;
> > > > > > > >> #ifdef CONFIG_NFS_FSCACHE
> > > > > > > >> -       struct fscache_cookie   *fscache;
> > > > > > > >> -#endif
> > > > > > > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > > > > >> +#else
> > > > > > > >>       struct inode            vfs_inode;
> > > > > > > >> +#endif
> > > > > > > >> +
> > > > > > > >>
> > > > > > > >> Then I would need to alloc/free a netfs_inode at the time of
> > > > > > > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > > > > >> macro cannot work, because it requires an embedded "struct inode"
> > > > > > > >> due to "container_of" use:
> > > > > > > >>
> > > > > > > >> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > >> +{
> > > > > > > >> +       return &nfsi->netfs.inode;
> > > > > > > >> +}
> > > > > > > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > >> +{
> > > > > > > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > > > > >> +}
> > > > > > > >> +#else
> > > > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > >> +{
> > > > > > > >> +       return &nfsi->vfs_inode;
> > > > > > > >> +}
> > > > > > > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > >> {
> > > > > > > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > > > > >> }
> > > > > > > >> +#endif
> > > > > > > >>
> > > > > > > >>
> > > > > > > >
> > > > > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > > > > patch below.  What do you think?
> > > > > > >
> > > > > > > That works for me.
> > > > > > >
> > > > > > > >
> > > > > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > > > > think it's an ok idea I can try to work out what is needed across
> > > > > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > > > > case for NFS where fscache is "configured but not enabled",
> > > > > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > > > > each time, it will add up so it is worth at least a discussion.
> > > > > > > >
> > > > > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > > > > --- a/include/linux/netfs.h
> > > > > > > > +++ b/include/linux/netfs.h
> > > > > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > > > > >                                     bool was_async);
> > > > > > > >
> > > > > > > > -/*
> > > > > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > > > > - */
> > > > > > > > -struct netfs_inode {
> > > > > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > > > > +struct netfs_info {
> > > > > > > >       const struct netfs_request_ops *ops;
> > > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > > >       struct fscache_cookie   *cache;
> > > > > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > > > > };
> > > > > > > >
> > > > > > > > +/*
> > > > > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > > > > + */
> > > > > > > > +struct netfs_inode {
> > > > > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > /*
> > > > > > > > * Resources required to do operations on a cache.
> > > > > > > > */
> > > > > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > > > > *netfs_inode(struct inode *inode)
> > > > > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > > > > >                                   const struct netfs_request_ops *ops)
> > > > > > > > {
> > > > > > > > -       ctx->ops = ops;
> > > > > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > > > > +       /* FIXME: Check for NULL */
> > > > > > > > +       ctx->netfs->ops = ops;
> > > > > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > > > -       ctx->cache = NULL;
> > > > > > > > +       ctx->netfs->cache = NULL;
> > > > > > > > #endif
> > > > > > > > }
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >>
> > > > > > > >>>>
> > > > > > > >>>>
> > > > > > > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > > > > > > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > > > > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > > > > > > >>>>
> > > > > > > >>>> #else /* CONFIG_NFS_FSCACHE */
> > > > > > > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > > > > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > > > > >>>> *hdr) {}
> > > > > > > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > > > > >>>> *hdr) {}
> > > > > > > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > >>>> +{
> > > > > > > >>>> +       unlock_page(req->wb_page);
> > > > > > > >>>> +}
> > > > > > > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > > > > > > >>>> super_block *sb) {}
> > > > > > > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > > > > >>>>
> > > > > > > >>>>
> > > > > > > >>>> Do you object to the below?  If so, then do you want
> > > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > > > > > > >>>>
> > > > > > > >>>> -- a/fs/nfs/inode.c
> > > > > > > >>>> +++ b/fs/nfs/inode.c
> > > > > > > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > > > > >>>> super_block *sb)
> > > > > > > >>>> #ifdef CONFIG_NFS_V4_2
> > > > > > > >>>>       nfsi->xattr_cache = NULL;
> > > > > > > >>>> #endif
> > > > > > > >>>> +       nfs_netfs_inode_init(nfsi);
> > > > > > > >>>> +
> > > > > > > >>>>       return VFS_I(nfsi);
> > > > > > > >>>> }
> > > > > > > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_i
> > > > > > > >>>> node);
> > > > > > > >>>>
> > > > > > > >>>>
> > > > > > > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > > > > >>>> how about the below calls to netfs from nfs_read_folio and
> > > > > > > >>>> nfs_readahead into equivalent netfs calls?  So when
> > > > > > > > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > > > > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > > > > > > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > > > > > > >>>> check to see if fscache is enabled on the mount, and skip
> > > > > > > >>>> over to satisfy what you want.  Am I understanding what you
> > > > > > > >>>> want?
> > > > > > > >>>
> > > > > > > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > > > > > > >>> is needed from the netfs code so that it can be optimised. However I'm
> > > > > > > >>> not interested enough in the cachefs functionality to work on that
> > > > > > > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > > > > > > >>> netfs project, once the overhead can be made to disappear.
> > > > > > > >>>
> > > > > > > >> Understood.
> > > > > > > >>
> > > > > > > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > > > > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > > > > > > >> be done in a future patchset?
> > > > > > > >>
> > > > > > > >> For now I was equating netfs and fscache together so we can
> > > > > > > >> move on from the much older and single-page limiting fscache
> > > > > > > >> interface that is likely to go away soon.
> > > > > > > >>
> > > > > > > >>>>
> > > > > > > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > > > > >>>> folio *folio)
> > > > > > > >>>>       if (NFS_STALE(inode))
> > > > > > > >>>>               goto out_unlock;
> > > > > > > >>>>
> > > > > > > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > > > > > > >>>> +       if (!ret)
> > > > > > > >>>> +               goto out;
> > > > > > > >>>> +
> > > > > > > >>>>
> > > > > > > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > > > > >>>> *ractl)
> > > > > > > >>>>       if (NFS_STALE(inode))
> > > > > > > >>>>               goto out;
> > > > > > > >>>>
> > > > > > > >>>> +       ret = nfs_netfs_readahead(ractl);
> > > > > > > >>>> +       if (!ret)
> > > > > > > >>>> +               goto out;
> > > > > > > >>>> +
> > > > > > > >>>>
> > > > > > > >> The above wrappers should prevent any additional overhead when fscache
> > > > > > > >> is not enabled.  As far as I know these work to avoid calling netfs
> > > > > > > >> when 'fsc' is not on the mount.
> > > > > > > >>
> > > > > > > >>>>
> > > > > > > >>>> And how about these calls from different points in the read
> > > > > > > >>>> path to the earlier mentioned stub functions?
> > > > > > > >>>>
> > > > > > > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > > > > >>>>
> > > > > > > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > >>>> {
> > > > > > > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > >>>>       struct page *page = req->wb_page;
> > > > > > > >>>>
> > > > > > > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > > > > >>>>> s_id,
> > > > > > > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > > > > >>>> -               (long long)req_offset(req));
> > > > > > > >>>> -
> > > > > > > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > > > > >>>> ETIMEDOUT)
> > > > > > > >>>>               SetPageError(page);
> > > > > > > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > > > > >>>> -               if (PageUptodate(page))
> > > > > > > >>>> -                       nfs_fscache_write_page(inode, page);
> > > > > > > >>>> -               unlock_page(page);
> > > > > > > >>>> -       }
> > > > > > > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > >>>> +               nfs_netfs_readpage_release(req);
> > > > > > > >>>> +
> > > > > > > >>>
> > > > > > > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > > > > >>> going to need to change when we move it to use folios natively anyway.
> > > > > > > >>>
> > > > > > > >> Ok, how about I make it conditional on whether fscache is configured
> > > > > > > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > > > > > > >> nfs_netfs_readahead()?  Below is what that would look like.
> > > > > > > >> I could inline the code in nfs_netfs_readpage_release() if you
> > > > > > > >> think it would be clearer.
> > > > > > > >>
> > > > > > > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > >> {
> > > > > > > >>       struct page *page = req->wb_page;
> > > > > > > >>
> > > > > > > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > > > > >>               SetPageError(page);
> > > > > > > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > >> #ifndef CONFIG_NFS_FSCACHE
> > > > > > > >>               unlock_page(req->wb_page);
> > > > > > > >> #else
> > > > > > > >>               nfs_netfs_readpage_release(req);
> > > > > > > >> #endif
> > > > > > > >>       nfs_release_request(req);
> > > > > > > >> }
> > > > > > > >>
> > > > > > > >>
> > > > > > > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > >> {
> > > > > > > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > >>
> > > > > > > >>   /*
> > > > > > > >>    * If fscache is enabled, netfs will unlock pages.
> > > > > > > >>    */
> > > > > > > >>   if (netfs_inode(inode)->cache)
> > > > > > > >>       return;
> > > > > > > >>
> > > > > > > >>   unlock_page(req->wb_page);
> > > > > > > >> }
> > > > > > > >>
> > > > > > > >>
> > > > > > > >>>>       nfs_release_request(req);
> > > > > > > >>>> }
> > > > > > > >>>>
> > > > > > > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > > > > >>>> nfs_pgio_header *hdr)
> > > > > > > >>>>               nfs_list_remove_request(req);
> > > > > > > >>>>               nfs_readpage_release(req, error);
> > > > > > > >>>>       }
> > > > > > > >>>> +       nfs_netfs_read_completion(hdr);
> > > > > > > >>>> +
> > > > > > > >>>> out:
> > > > > > > >>>>       hdr->release(hdr);
> > > > > > > >>>> }
> > > > > > > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > > > > >>>> nfs_pgio_header *hdr,
> > > > > > > >>>>                             struct rpc_task_setup *task_setup_data,
> > > > > > > >>>> int how)
> > > > > > > >>>> {
> > > > > > > >>>>       rpc_ops->read_setup(hdr, msg);
> > > > > > > >>>> +       nfs_netfs_initiate_read(hdr);
> > > > > > > >>>>       trace_nfs_initiate_read(hdr);
> > > > > > > >>>> }
> > > > > > > >>>>
> > > > > > > >>>>
> > > > > > > >>>> Are you ok with these additions?  Something like this would
> > > > > > > >>>> be required in the case of fscache configured and enabled,
> > > > > > > >>>> because we could have some of the data in a read in
> > > > > > > >>>> fscache, and some not.  That is the reason for the netfs
> > > > > > > >>>> design, and why we need to be able to call the normal
> > > > > > > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > > > > > > >>>> back via netfs_subreq_terminated)?
> > > > > > > >>>>
> > > > > > > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > > > > >>>>       struct pnfs_layout_segment *pg_lseg;
> > > > > > > >>>>       struct nfs_io_completion *pg_io_completion;
> > > > > > > >>>>       struct nfs_direct_req   *pg_dreq;
> > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > >>>> +       void                    *pg_netfs;
> > > > > > > >>>> +#endif
> > > > > > > >>>>
> > > > > > > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > > > > >>>>       const struct nfs_rw_ops *rw_ops;
> > > > > > > >>>>       struct nfs_io_completion *io_completion;
> > > > > > > >>>>       struct nfs_direct_req   *dreq;
> > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > >>>> +       void                    *netfs;
> > > > > > > >>>> +#endif
> > > > > > > >>>>
> > > > > > > >>>>
> > > > > > > >>>> And these additions to pagelist.c?
> > > > > > > >>>>
> > > > > > > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > > > >>>>       hdr->good_bytes = mirror->pg_count;
> > > > > > > >>>>       hdr->io_completion = desc->pg_io_completion;
> > > > > > > >>>>       hdr->dreq = desc->pg_dreq;
> > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > >>>> +       if (desc->pg_netfs)
> > > > > > > >>>> +               hdr->netfs = desc->pg_netfs;
> > > > > > > >>>> +#endif
> > > > > > > >>>
> > > > > > > >>> Why the conditional?
> > > > > > > >>>
> > > > > > > >> Not really needed and I was thinking of removing it, so I'll do that.
> > > > > > > >>
> > > > > > > >>>>
> > > > > > > >>>>
> > > > > > > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > > > > >>>> *desc,
> > > > > > > >>>>       desc->pg_lseg = NULL;
> > > > > > > >>>>       desc->pg_io_completion = NULL;
> > > > > > > >>>>       desc->pg_dreq = NULL;
> > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > >>>> +       desc->pg_netfs = NULL;
> > > > > > > >>>> +#endif
> > > > > > > >>>>
> > > > > > > >>>>
> > > > > > > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > > > >>>>
> > > > > > > >>>>       desc->pg_io_completion = hdr->io_completion;
> > > > > > > >>>>       desc->pg_dreq = hdr->dreq;
> > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > >>>> +       desc->pg_netfs = hdr->netfs;
> > > > > > > >>>> +#endif
> > > > > > > >>>
> > > > > > > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > > > > > > >>>
> > > > > > > >> Ok.
> > > > > > > >>
> > > > > > > >>
> > > > > > > >>
> > > > > > > >>>>
> > > > > > > >>>>
> > > > > > > >>>>> My expectation is that the standard I/O path should have minimal
> > > > > > > >>>>> overhead, and should certainly not increase the overhead that we
> > > > > > > >>>>> already have. Will this be addressed in future iterations of these
> > > > > > > >>>>> patches?
> > > > > > > >>>>>
> > > > > > > >>>>
> > > > > > > >>>> I will do what I can to satisfy what you want, either by fixing up
> > > > > > > >>>> this patch or follow-on patches.  Hopefully the above questions
> > > > > > > >>>> will clarify the next steps.
> > > > > > > >>>>
> > > > > > > >>>
> > > > > > > >>> --
> > > > > > > >>> Trond Myklebust
> > > > > > > >>> Linux NFS client maintainer, Hammerspace
> > > > > > > >>> trond.myklebust@hammerspace.com
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > Trond Myklebust
> > > > > > > CTO, Hammerspace Inc
> > > > > > > 1900 S Norfolk St, Suite 350 - #45
> > > > > > > San Mateo, CA 94403
> > > > > > >
> > > > > > > www.hammer.space
> > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
> > >
Benjamin Maynard Feb. 6, 2023, 5:32 p.m. UTC | #21
Hi all,

Just pinging this thread for any further updates.

Kind Regards
Benjamin Maynard



Kind Regards

Benjamin Maynard

Customer Engineer

benmaynard@google.com

Google, Inc.




On Tue, 3 Jan 2023 at 20:33, Benjamin Maynard <benmaynard@google.com> wrote:
>
> Hi all,
>
> I just wanted to follow up on this set of patches. As Daire explained
> below, these patches are really important to a number of us using
> FS-Cache due to the significant performance regression introduced in
> 5.17 and above.
>
> I'd love to see these patches merged, or some feedback on what changes
> might be needed.
>
> Kind Regards
> Benjamin Maynard
>
> On Thu, 17 Nov 2022 at 11:04, Daire Byrne <daire@dneg.com> wrote:
> >
> > Hi,
> >
> > I just wanted to take the opportunity to reiterate why these patches
> > are important to me (and others like Benjamin).
> >
> > The "new" fscache that is now in mainline has a major NFS performance
> > regression from the previous fscache code in pre 5.17 kernels - single
> > file reads from cache.
> >
> > Even if you have the fastest local disk (nvme/ssd) for your fscache,
> > reading back a cached file (via NFS) now tops out at around 40MB/s
> > whereas before (old fscache) the local fscache disk speed was the only
> > limit (e.g. 5000MB/s for NVMe).
> >
> > So, in many cases, depending on what you are using fscache for, it can
> > be faster to read the file over the (gigabit) network than from the
> > local disk cache which somewhat negates its usefulness. As such, we
> > mostly use pre-5.17 kernels in production and the old fscache code
> > which maintains high cache read performance (but has other annoying
> > issues).
> >
> > Now this performance regression might not be noticed too much by
> > desktop users looking to use fscache on their systems, but it sure
> > does affect servers (e.g. re-export servers) that want to use fscache
> > to achieve very high performance.
> >
> > I can't really comment on these patches or the approach taken, but I
> > do hope that we can restore/improve the fscache read performance for
> > NFS in the mainline kernel as soon as possible (like these patches
> > do).
> >
> > Daire
> >
> >
> > On Mon, 14 Nov 2022 at 21:26, Benjamin Maynard <benmaynard@google.com> wrote:
> > >
> > > Thanks Dave, that did the trick!
> > >
> > > Building the kernel from
> > > https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
> > > and re-running the exact same tests yielded the expected results. Data
> > > is now being served from /var/cache/fscache.
> > >
> > > I also reverted my change to the read ahead, so that read ahead is now
> > > greater than the rsize. Still works as expected.
> > >
> > > I am also seeing much better single file read speeds, and culling is
> > > working perfectly (not running into the issue we were seeing pre
> > > 5.17).
> > >
> > > Thanks a lot Dave, Jeff and Daire for your help.
> > >
> > > Kind Regards
> > > Benjamin Maynard
> > >
> > >
> > >
> > > Kind Regards
> > >
> > > Benjamin Maynard
> > >
> > > Customer Engineer
> > >
> > > benmaynard@google.com
> > >
> > > Google, Inc.
> > >
> > >
> > >
> > >
> > > On Mon, 14 Nov 2022 at 17:35, David Wysochanski <dwysocha@redhat.com> wrote:
> > > >
> > > > On Mon, Nov 14, 2022 at 11:04 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > > >
> > > > > Hi Dave,
> > > > >
> > > > > I've added responses to your questions inline below.
> > > > >
> > > > > I also tried adding the noatime option to the mount on the source
> > > > > filer as Jeff suggested, but this has not made any difference and the
> > > > > issue is still persisting for me.
> > > > >
> > > > > I created the following diagram that explains my setup, and the exact
> > > > > tests I am performing:
> > > > > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> > > > >
> > > > > Hopefully this is clearer than my explanations below (let me know if
> > > > > you'd prefer me to share an alternative way).
> > > > >
> > > > Yes, that's very helpful.  Let me think about this one as I'm not sure.
> > > > As Jeff says we may need tracepoints to track it down if I cannot repro
> > > > it and/or nothing comes to mind.
> > > >
> > > > > In order to remove the re-exporting layer of complexity, I also
> > > > > performed the tests without the re-export server (architecture:
> > > > > https://drive.google.com/file/d/1DQKhqo_UnQ8ul-z5Iram5LpisDmkKziQ/view?usp=share_link):
> > > > >
> > > > > Source NFS Server <-- Client (with FS-Cache)
> > > > >
> > > > > The same is happening, I cannot get FS-Cache to serve from cache.
> > > > > Heavy writes, but no reads, even when the same file is copied many
> > > > > times.
> > > > >
> > > > I'm pretty sure that with the above you're hitting the drop_caches /
> > > > "fscache read optimisation" issue #1 I mentioned.
> > > >
> > > > I see dhowells just posted a v2 version of his previous patch:
> > > > https://lore.kernel.org/linux-mm/166844174069.1124521.10890506360974169994.stgit@warthog.procyon.org.uk/
> > > >
> > > > I started with 6.1-rc5, added the above dhowells latest patch for that issue,
> > > > and then my 5 patches on top.  Then I added a small patch to utilize
> > > > dhowells patch to ensure the read optimisation is removed.  I ran my
> > > > unit test that has been failing all along and as expected it passes with
> > > > these patches.  I pushed the series to github:
> > > > https://github.com/DaveWysochanskiRH/kernel/commits/nfs-fscache-netfs
> > > > https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
> > > >
> > > > I will also email you the series of patches on top of 6.1-rc5 so you
> > > > can just apply from your mailbox if you want.
> > > >
> > > >
> > > >
> > > > > Hopefully something I am doing wrong on my end, but I can't figure out what.
> > > > >
> > > > > Kind Regards
> > > > > Benjamin Maynard
> > > > >
> > > > >
> > > > > On Mon, 14 Nov 2022 at 13:47, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > >
> > > > > > I apologize I did not read carefully enough and I missed some details
> > > > > > in your original post.
> > > > > > More below.
> > > > > >
> > > > > > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > > > > >
> > > > > > > Hi all,
> > > > > > >
> > > > > > > I've been doing some more testing with these patches, I applied all of
> > > > > > > the patches (v10 from
> > > > > > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > > > > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > > > > > >
> > > > > > > I have the following setup:
> > > > > > >
> > > > > > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > > > > >
> > > > > > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > > > > > to the NFS Client via the Re-Export Server.
> > > > > > >
> > > > > > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > > > > > re-export server, and once the file copy completes I see that
> > > > > > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > > > > > >
> > > > > > > I then deleted that file from the NFS Client, and dropped the caches
> > > > > > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > > > > > >
> > > > > > If you delete the file from the NFS client, how does that not delete the
> > > > > > file from the original NFS server?
> > > > >
> > > > > Sorry - to be clear, I never deleted the file from the NFS mount
> > > > > (which I know would in turn delete it from the re-export server and
> > > > > the source filer).
> > > > >
> > > > > In order to perform the performance test, I copied the file from the
> > > > > NFS mount on the NFS Client, to a local directory (cp
> > > > > /mnt/nfs/500gb.img /tmp).
> > > > >
> > > > > When I said "I then deleted that file from the NFS Client", I meant I
> > > > > deleted the local copy of that file. Not the file on the mount (rm
> > > > > /tmp/500gb.img).
> > > > >
> > > > > Just to also stress, I have never dropped the caches on the Re-Export
> > > > > Server (the one with FS-Cache) at any point in any of these tests, so
> > > > > I don't think this is the problem. I have only ever dropped the caches
> > > > > on the NFS client that is mounting the Re-Export Server.
> > > > >
> > > > > > > I then performed another copy of the 500Gb file on the NFS Client,
> > > > > > > again via the Re-Export Server. What I expected would happen is that I
> > > > > > > would see heavy reads from the /var/cache/fscache volume as the file
> > > > > > > should be served from FS-Cache.
> > > > > > >
> > > > > > I don't understand this.  When you say you "performed another copy"
> > > > > > of what file?  Wasn't the file deleted in the above step?
> > > > >
> > > > > As above, only the local copy was deleted.
> > > > >
> > > > > > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > > > > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > > > > > also see heavy writes to /var/cache/fscache, so it appears that
> > > > > > > FS-Cache is overwriting its existing cache, and never using it.
> > > > > >
> > > > > > That would happen if the file was changed or re-created.
> > > > > >
> > > > > > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > > > > > it is not possible that the file is being served from the page cache.
> > > > > > >
> > > > > > > We saw this behaviour before on an older set of the patches when our
> > > > > > > mount between the Re-Export Server and the Source NFS Filer was using
> > > > > > > the "sync" option, but we are now using the "async" option and the
> > > > > > > same is happening.
> > > > > > >
> > > > > > > Mount options:
> > > > > > >
> > > > > > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > > > > > >
> > > > > > > 10.0.0.49:/files /srv/nfs/files nfs
> > > > > > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > > > > >
> > > > > > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > > > > > >
> > > > > > > 10.0.0.3:/files /mnt/nfs nfs
> > > > > > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > > > > > >
> > > > > > > It is also worth noting this behaviour is not unique to the re-export
> > > > > > > use case. I see FS-Cache not being used with the following setup:
> > > > > > >
> > > > > > > Source NFS Server <-- Client (with FS-Cache).
> > > > > > >
> > > > > >
> > > > > > This points at something more fundamental like something missed
> > > > > > in the test or maybe a mount option.  Can you explain what test
> > > > > > you're doing here when you say "this behavior is not unique"?
> > > > >
> > > > > I've created the following diagram which explains the test I am
> > > > > performing. I think it is a little easier to follow than explaining in
> > > > > text. This should be viewable without any authentication:
> > > > > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> > > > >
> > > > > By "this behaviour is not unique to the re-export use case" I mean
> > > > > that the same happens if I remove the re-export server completely, and
> > > > > just have the following setup:
> > > > >
> > > > > Source NFS Server <-- Client (with FS-Cache).
> > > > >
> > > > > > Can you show the mount options for both:
> > > > > > - fscache filesystem on the re-export server (/var/cache/fscache)
> > > > >
> > > > > root@reexport:~$ mount | grep /var/cache/fscache
> > > > > /dev/md127 on /var/cache/fscache type ext4
> > > > > (rw,relatime,discard,nobarrier,stripe=1024)
> > > > >
> > > > > > - exported filesystem on the NFS server (filesystem in /etc/exports)
> > > > >
> > > > > I have tried both:
> > > > >
> > > > > root@source:~$ mount | grep files
> > > > > /dev/sdb1 on /files type ext4 (rw)
> > > > >
> > > > > root@source:~$ cat /etc/exports
> > > > > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> > > > >
> > > > > and (at Jeff's suggestion):
> > > > >
> > > > > root@source:~$ mount | grep files
> > > > > /dev/sdb1 on /files type ext4 (rw,noatime)
> > > > >
> > > > > root@source:~$ cat /etc/exports
> > > > > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> > > > >
> > > > >
> > > > > > Unfortunately the problem with drop_caches makes it more difficult
> > > > > > to know when fscache is truly working.  But some other unit test
> > > > > > I have shows fscache does work with this patchset so I'm puzzled why
> > > > > > you're not seeing it work at all.
> > > > > >
> > > > > > I pinged dhowells on the drop_caches issue so maybe we can get
> > > > > > that one sorted out soon but I'm not sure since it's part of a series
> > > > > > and proposes changes in mm.
> > > > >
> > > > > Just to be clear, I have never used drop_caches on the re-export
> > > > > server in any of these tests. I have only ever done this on the NFS
> > > > > Client.
> > > > >
> > > > > >
> > > > > > > Thanks,
> > > > > > > Ben
> > > > > > >
> > > > > > >
> > > > > > > Kind Regards
> > > > > > >
> > > > > > > Benjamin Maynard
> > > > > > >
> > > > > > > Customer Engineer
> > > > > > >
> > > > > > > benmaynard@google.com
> > > > > > >
> > > > > > > Google, Inc.
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > > > >>
> > > > > > > > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > > > > > >>>
> > > > > > > > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > > > > > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > > > > > >>>> wrote:
> > > > > > > > >>>>>
> > > > > > > > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > > > > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > > > > > > > >>>>>> APIs,
> > > > > > > > >>>>>> but only when fscache is configured and enabled.
> > > > > > > > >>>>>>
> > > > > > > > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > > > > > > > >>>>>> filled
> > > > > > > > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > > > > > > > >>>>>> of
> > > > > > > > >>>>>> the functions, the main one being the issue_read() function.
> > > > > > > > >>>>>> The issue_read() function is called by the netfs layer when a
> > > > > > > > >>>>>> read
> > > > > > > > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > > > > > > > >>>>>> (either
> > > > > > > > >>>>>> the cache is not active, or it is active but the data is not
> > > > > > > > >>>>>> available).
> > > > > > > > >>>>>> Once the read from the server is complete, netfs requires a call
> > > > > > > > >>>>>> to
> > > > > > > > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > > > > > > > >>>>>> were
> > > > > > > > >>>>>> read
> > > > > > > > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > > > > > > > >>>>>> a
> > > > > > > > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > > > > > > > >>>>>> and
> > > > > > > > >>>>>> contains a start and a length (both in bytes), and assumes the
> > > > > > > > >>>>>> underlying
> > > > > > > > > >>>>>> netfs will return either an error on the whole region, or the
> > > > > > > > >>>>>> number
> > > > > > > > >>>>>> of bytes successfully read.
> > > > > > > > >>>>>>
> > > > > > > > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > > > > >>>>>> defined
> > > > > > > > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > > > > >>>>>> to
> > > > > > > > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > > > > > > > >>>>>> up
> > > > > > > > >>>>>> into underlying RPCs, each of which will have their own
> > > > > > > > >>>>>> completion
> > > > > > > > >>>>>> and
> > > > > > > > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > > > > > > > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > > > > > > > >>>>>> initiated with issue_read() and terminated with
> > > > > > > > >>>>>> netfs_subreq_terminated().
> > > > > > > > > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > > > > >>>>>> the netfs API requirement on the single response to the whole
> > > > > > > > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > > > > > > > >>>>>> pgio layer.
> > > > > > > > >>>>>>
> > > > > > > > >>>>>> The approach taken with this patch is to allocate a small
> > > > > > > > >>>>>> structure
> > > > > > > > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > > > > > > > >>>>>> number
> > > > > > > > >>>>>> of bytes successfully transferred in the structure, and update
> > > > > > > > >>>>>> these
> > > > > > > > >>>>>> values
> > > > > > > > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > > > > > > > >>>>>> a
> > > > > > > > >>>>>> marker
> > > > > > > > >>>>>> for the last RPC completion, is incremented in
> > > > > > > > >>>>>> nfs_netfs_read_initiate(),
> > > > > > > > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > > > > > > > >>>>>> nfs_pgio_header
> > > > > > > > >>>>>> contains a valid pointer to the data.  On the final put (which
> > > > > > > > >>>>>> signals
> > > > > > > > >>>>>> the final outstanding RPC is complete) in
> > > > > > > > >>>>>> nfs_netfs_read_completion(),
> > > > > > > > >>>>>> call netfs_subreq_terminated() with either the final error value
> > > > > > > > >>>>>> (if
> > > > > > > > >>>>>> one or more READs complete with an error) or the number of bytes
> > > > > > > > >>>>>> successfully transferred (if all RPCs complete successfully).
> > > > > > > > >>>>>> Note
> > > > > > > > >>>>>> that when all RPCs complete successfully, the number of bytes
> > > > > > > > >>>>>> transferred
> > > > > > > > >>>>>> is capped to the length of the subrequest.  Capping the
> > > > > > > > >>>>>> transferred
> > > > > > > > >>>>>> length
> > > > > > > > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > > > > > > > >>>>>> netfs.
> > > > > > > > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > > > > >>>>>> the
> > > > > > > > >>>>>> corner case where NFS requests a full page at the end of the
> > > > > > > > >>>>>> file,
> > > > > > > > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > > > > > > > >>>>>>
> > > > > > > > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > > > > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > >>>>>
> > > > > > > > >>>>>
> > > > > > > > >>>>> This is not doing what I asked for, which was to separate out the
> > > > > > > > >>>>> fscache functionality, so that we can call that if and when it is
> > > > > > > > >>>>> available.
> > > > > > > > >>>>>
> > > > > > > > >>>> I must have misunderstood then.
> > > > > > > > >>>>
> > > > > > > > >>>> The last feedback I have from you was that you wanted it to be
> > > > > > > > >>>> an opt-in feature, and it was a comment on a previous patch
> > > > > > > > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > > > > > > > >>>> let me try to get back on track.
> > > > > > > > >>>>
> > > > > > > > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > > > > > > > >>>>> requests. As
> > > > > > > > >>>>> it stands, that means it is just duplicating information, and
> > > > > > > > >>>>> adding
> > > > > > > > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > > > > > > > >>>>> extra
> > > > > > > > >>>>> indirect calls, and extra bloat to the inode).
> > > > > > > > >>>>>
> > > > > > > > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > > > > > > > >>>> ask some clarifying questions.
> > > > > > > > >>>>
> > > > > > > > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > > > > > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > > > > > > > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > > > > > > > >>>> Am I right?
> > > > > > > > >>>>
> > > > > > > > > >>>> Also, are you objecting to the design that to use fscache we now
> > > > > > > > >>>> have to use netfs, specifically:
> > > > > > > > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > > > > > > > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > > > > > > > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > > > > > >>>> from the cache, then NFS is called back via netfs_issue_read
> > > > > > > > >>>> and we use the normal NFS read pageio interface.  This requires
> > > > > > > > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > > > > > > > >>>> which is the reason for the small changes to pagelist.c
> > > > > > > > >>>
> > > > > > > > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > > > > > > > >>> NFS I/O paths.
> > > > > > > > >>>
> > > > > > > > >> Got it.
> > > > > > > > >>
> > > > > > > > >>> I'm willing to consider solutions that are specific only to the fscache
> > > > > > > > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > > > > > > > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > > > > > >>> extra memory allocations, extra indirect calls and larger inode
> > > > > > > > >>> footprints.
> > > > > > > > >>>
> > > > > > > > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > > > > > > > >>> the case of 'NFS with cachefs additions'.
> > > > > > > > >>>
> > > > > > > > >> I agree completely.  Are you seeing extra memory allocations
> > > > > > > > > >> happen on mounts without 'fsc' or is it more a concern about how
> > > > > > > > >> some of the patches look?  We should not be calling any netfs or
> > > > > > > > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > > > > > >> testing. So either there's a misunderstanding here, or there's a
> > > > > > > > >> bug I'm missing.
> > > > > > > > >>
> > > > > > > > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > > > > > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > > > > > >> If it's configured but not enabled, then the checks for
> > > > > > > > >> netfs_inode(inode)->cache should skip over any netfs code.
> > > > > > > > >> But maybe there's a non-obvious bug you're seeing and
> > > > > > > > >> somehow netfs is still getting called?  Because I cannot
> > > > > > > > >> see netfs getting called if 'fsc' is not on the mount in my
> > > > > > > > >> tests.
> > > > > > > > >>
> > > > > > > > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > > > > > >> {
> > > > > > > > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > > > > > > > >>               return -ENOBUFS;
> > > > > > > > >>
> > > > > > > > >>       return netfs_read_folio(file, folio);
> > > > > > > > >> }
> > > > > > > > >>
> > > > > > > > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > > > > > >> {
> > > > > > > > >>       struct inode *inode = ractl->mapping->host;
> > > > > > > > >>
> > > > > > > > >>       if (!netfs_inode(inode)->cache)
> > > > > > > > >>               return -ENOBUFS;
> > > > > > > > >>
> > > > > > > > >>       netfs_readahead(ractl);
> > > > > > > > >>       return 0;
> > > > > > > > >> }
> > > > > > > > >>
> > > > > > > > >>
> > > > > > > > >>>>
> > > > > > > > >>>> Can you be more specific as to the portions of the patch you don't
> > > > > > > > >>>> like
> > > > > > > > >>>> so I can move it in the right direction?
> > > > > > > > >>>>
> > > > > > > > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > > > > > > > >>>> you're
> > > > > > > > >>>> ok with it though, since you mention "extra bloat to the inode".
> > > > > > > > >>>> Do you object to this even though it's wrapped in an
> > > > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > > > > > >>>> extra size be added to nfs_inode?
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > > >>>>       __u64 write_io;
> > > > > > > > >>>>       __u64 read_io;
> > > > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE
> > > > > > > > >>>> -       struct fscache_cookie   *fscache;
> > > > > > > > >>>> -#endif
> > > > > > > > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > > > > > >>>> */
> > > > > > > > >>>> +#else
> > > > > > > > >>>>       struct inode            vfs_inode;
> > > > > > > > >>>> +#endif
> > > > > > > > >>>> +
> > > > > > > > >>>
> > > > > > > > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > > > > > >>> point, however for now NFS is not unconditionally opting into the netfs
> > > > > > > > >>> project. If we're to ever do that, then I want to see streamlined code
> > > > > > > > >>> for the standard I/O case.
> > > > > > > > >>>
> > > > > > > > >> Ok and understood about standard I/O case.
> > > > > > > > >>
> > > > > > > > >> I was thinking how we might not increase the size, but I don't think
> > > > > > > > >> I can make it work.
> > > > > > > > >>
> > > > > > > > >> I thought we could change to something like the below, without an
> > > > > > > > >> embedded struct inode:
> > > > > > > > >>
> > > > > > > > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > > >>       __u64 write_io;
> > > > > > > > >>       __u64 read_io;
> > > > > > > > >> #ifdef CONFIG_NFS_FSCACHE
> > > > > > > > >> -       struct fscache_cookie   *fscache;
> > > > > > > > >> -#endif
> > > > > > > > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > > > > > >> +#else
> > > > > > > > >>       struct inode            vfs_inode;
> > > > > > > > >> +#endif
> > > > > > > > >> +
> > > > > > > > >>
> > > > > > > > >> Then I would need to alloc/free a netfs_inode at the time of
> > > > > > > > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > > > > > >> macro cannot work, because it requires an embedded "struct inode"
> > > > > > > > >> due to "container_of" use:
> > > > > > > > >>
> > > > > > > > >> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > > >> +{
> > > > > > > > >> +       return &nfsi->netfs.inode;
> > > > > > > > >> +}
> > > > > > > > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > > >> +{
> > > > > > > > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > > > > > >> +}
> > > > > > > > >> +#else
> > > > > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > > >> +{
> > > > > > > > >> +       return &nfsi->vfs_inode;
> > > > > > > > >> +}
> > > > > > > > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > > >> {
> > > > > > > > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > > > > > >> }
> > > > > > > > >> +#endif
> > > > > > > > >>
> > > > > > > > >>
> > > > > > > > >
> > > > > > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > > > > > patch below.  What do you think?
> > > > > > > >
> > > > > > > > That works for me.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > > > > > think it's an ok idea I can try to work out what is needed across
> > > > > > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > > > > > case for NFS where fscache is "configured but not enabled",
> > > > > > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > > > > > each time, it will add up so it is worth at least a discussion.
> > > > > > > > >
> > > > > > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > > > > > --- a/include/linux/netfs.h
> > > > > > > > > +++ b/include/linux/netfs.h
> > > > > > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > > > > > >                                     bool was_async);
> > > > > > > > >
> > > > > > > > > -/*
> > > > > > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > > > > > - */
> > > > > > > > > -struct netfs_inode {
> > > > > > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > > > > > +struct netfs_info {
> > > > > > > > >       const struct netfs_request_ops *ops;
> > > > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > > > >       struct fscache_cookie   *cache;
> > > > > > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > > > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > > > > > };
> > > > > > > > >
> > > > > > > > > +/*
> > > > > > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > > > > > + */
> > > > > > > > > +struct netfs_inode {
> > > > > > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > /*
> > > > > > > > > * Resources required to do operations on a cache.
> > > > > > > > > */
> > > > > > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > > > > > *netfs_inode(struct inode *inode)
> > > > > > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > > > > > >                                   const struct netfs_request_ops *ops)
> > > > > > > > > {
> > > > > > > > > -       ctx->ops = ops;
> > > > > > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > > > > > +       /* FIXME: Check for NULL */
> > > > > > > > > +       ctx->netfs->ops = ops;
> > > > > > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > > > > -       ctx->cache = NULL;
> > > > > > > > > +       ctx->netfs->cache = NULL;
> > > > > > > > > #endif
> > > > > > > > > }
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >>
> > > > > > > > >>>>
> > > > > > > > >>>>
> > > > > > > > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > > > > > > > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > > > > > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > > > > > > > >>>>
> > > > > > > > >>>> #else /* CONFIG_NFS_FSCACHE */
> > > > > > > > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > > > > > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > > > > > >>>> *hdr) {}
> > > > > > > > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > > > > > >>>> *hdr) {}
> > > > > > > > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > > >>>> +{
> > > > > > > > >>>> +       unlock_page(req->wb_page);
> > > > > > > > >>>> +}
> > > > > > > > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > > > > > > > >>>> super_block *sb) {}
> > > > > > > > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > > > > > >>>>
> > > > > > > > >>>>
> > > > > > > > >>>> Do you object to the below?  If so, then do you want
> > > > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > > > > > > > >>>>
> > > > > > > > >>>> -- a/fs/nfs/inode.c
> > > > > > > > >>>> +++ b/fs/nfs/inode.c
> > > > > > > > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > > > > > >>>> super_block *sb)
> > > > > > > > >>>> #ifdef CONFIG_NFS_V4_2
> > > > > > > > >>>>       nfsi->xattr_cache = NULL;
> > > > > > > > >>>> #endif
> > > > > > > > >>>> +       nfs_netfs_inode_init(nfsi);
> > > > > > > > >>>> +
> > > > > > > > >>>>       return VFS_I(nfsi);
> > > > > > > > >>>> }
> > > > > > > > > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_inode);
> > > > > > > > >>>>
> > > > > > > > >>>>
> > > > > > > > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > > > > > >>>> how about the below calls to netfs from nfs_read_folio and
> > > > > > > > >>>> nfs_readahead into equivalent netfs calls?  So when
> > > > > > > > > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > > > > > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > > > > > > > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > > > > > > > >>>> check to see if fscache is enabled on the mount, and skip
> > > > > > > > >>>> over to satisfy what you want.  Am I understanding what you
> > > > > > > > >>>> want?
> > > > > > > > >>>
> > > > > > > > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > > > > > > > >>> is needed from the netfs code so that it can be optimised. However I'm
> > > > > > > > >>> not interested enough in the cachefs functionality to work on that
> > > > > > > > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > > > > > > > >>> netfs project, once the overhead can be made to disappear.
> > > > > > > > >>>
> > > > > > > > >> Understood.
> > > > > > > > >>
> > > > > > > > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > > > > > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > > > > > > > >> be done in a future patchset?
> > > > > > > > >>
> > > > > > > > >> For now I was equating netfs and fscache together so we can
> > > > > > > > >> move on from the much older and single-page limiting fscache
> > > > > > > > >> interface that is likely to go away soon.
> > > > > > > > >>
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > > > > > >>>> folio *folio)
> > > > > > > > >>>>       if (NFS_STALE(inode))
> > > > > > > > >>>>               goto out_unlock;
> > > > > > > > >>>>
> > > > > > > > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > > > > > > > >>>> +       if (!ret)
> > > > > > > > >>>> +               goto out;
> > > > > > > > >>>> +
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > > > > > >>>> *ractl)
> > > > > > > > >>>>       if (NFS_STALE(inode))
> > > > > > > > >>>>               goto out;
> > > > > > > > >>>>
> > > > > > > > >>>> +       ret = nfs_netfs_readahead(ractl);
> > > > > > > > >>>> +       if (!ret)
> > > > > > > > >>>> +               goto out;
> > > > > > > > >>>> +
> > > > > > > > >>>>
> > > > > > > > >> The above wrappers should prevent any additional overhead when fscache
> > > > > > > > >> is not enabled.  As far as I know these work to avoid calling netfs
> > > > > > > > >> when 'fsc' is not on the mount.
> > > > > > > > >>
> > > > > > > > >>>>
> > > > > > > > >>>> And how about these calls from different points in the read
> > > > > > > > >>>> path to the earlier mentioned stub functions?
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > > > > > >>>>
> > > > > > > > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > > >>>> {
> > > > > > > > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > > >>>>       struct page *page = req->wb_page;
> > > > > > > > >>>>
> > > > > > > > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > > > > > >>>>> s_id,
> > > > > > > > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > > > > > >>>> -               (long long)req_offset(req));
> > > > > > > > >>>> -
> > > > > > > > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > > > > > >>>> ETIMEDOUT)
> > > > > > > > >>>>               SetPageError(page);
> > > > > > > > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > > > > > >>>> -               if (PageUptodate(page))
> > > > > > > > >>>> -                       nfs_fscache_write_page(inode, page);
> > > > > > > > >>>> -               unlock_page(page);
> > > > > > > > >>>> -       }
> > > > > > > > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > > >>>> +               nfs_netfs_readpage_release(req);
> > > > > > > > >>>> +
> > > > > > > > >>>
> > > > > > > > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > > > > > >>> going to need to change when we move it to use folios natively anyway.
> > > > > > > > >>>
> > > > > > > > >> Ok, how about I make it conditional on whether fscache is configured
> > > > > > > > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > > > > > > > >> nfs_netfs_readahead()?  Below is what that would look like.
> > > > > > > > >> I could inline the code in nfs_netfs_readpage_release() if you
> > > > > > > > >> think it would be clearer.
> > > > > > > > >>
> > > > > > > > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > > >> {
> > > > > > > > >>       struct page *page = req->wb_page;
> > > > > > > > >>
> > > > > > > > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > > > > > >>               SetPageError(page);
> > > > > > > > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > > >> #ifndef CONFIG_NFS_FSCACHE
> > > > > > > > >>               unlock_page(req->wb_page);
> > > > > > > > >> #else
> > > > > > > > >>               nfs_netfs_readpage_release(req);
> > > > > > > > >> #endif
> > > > > > > > >>       nfs_release_request(req);
> > > > > > > > >> }
> > > > > > > > >>
> > > > > > > > >>
> > > > > > > > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > > >> {
> > > > > > > > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > > >>
> > > > > > > > >>   /*
> > > > > > > > >>    * If fscache is enabled, netfs will unlock pages.
> > > > > > > > >>    */
> > > > > > > > >>   if (netfs_inode(inode)->cache)
> > > > > > > > >>       return;
> > > > > > > > >>
> > > > > > > > >>   unlock_page(req->wb_page);
> > > > > > > > >> }
> > > > > > > > >>
> > > > > > > > >>
> > > > > > > > >>>>       nfs_release_request(req);
> > > > > > > > >>>> }
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > > > > > >>>> nfs_pgio_header *hdr)
> > > > > > > > >>>>               nfs_list_remove_request(req);
> > > > > > > > >>>>               nfs_readpage_release(req, error);
> > > > > > > > >>>>       }
> > > > > > > > >>>> +       nfs_netfs_read_completion(hdr);
> > > > > > > > >>>> +
> > > > > > > > >>>> out:
> > > > > > > > >>>>       hdr->release(hdr);
> > > > > > > > >>>> }
> > > > > > > > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > > > > > >>>> nfs_pgio_header *hdr,
> > > > > > > > >>>>                             struct rpc_task_setup *task_setup_data,
> > > > > > > > >>>> int how)
> > > > > > > > >>>> {
> > > > > > > > >>>>       rpc_ops->read_setup(hdr, msg);
> > > > > > > > >>>> +       nfs_netfs_initiate_read(hdr);
> > > > > > > > >>>>       trace_nfs_initiate_read(hdr);
> > > > > > > > >>>> }
> > > > > > > > >>>>
> > > > > > > > >>>>
> > > > > > > > >>>> Are you ok with these additions?  Something like this would
> > > > > > > > >>>> be required in the case of fscache configured and enabled,
> > > > > > > > >>>> because we could have some of the data in a read in
> > > > > > > > >>>> fscache, and some not.  That is the reason for the netfs
> > > > > > > > >>>> design, and why we need to be able to call the normal
> > > > > > > > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > > > > > > > >>>> back via netfs_subreq_terminated)?
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > > > > > >>>>       struct pnfs_layout_segment *pg_lseg;
> > > > > > > > >>>>       struct nfs_io_completion *pg_io_completion;
> > > > > > > > >>>>       struct nfs_direct_req   *pg_dreq;
> > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > >>>> +       void                    *pg_netfs;
> > > > > > > > >>>> +#endif
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > > > > > >>>>       const struct nfs_rw_ops *rw_ops;
> > > > > > > > >>>>       struct nfs_io_completion *io_completion;
> > > > > > > > >>>>       struct nfs_direct_req   *dreq;
> > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > >>>> +       void                    *netfs;
> > > > > > > > >>>> +#endif
> > > > > > > > >>>>
> > > > > > > > >>>>
> > > > > > > > >>>> And these additions to pagelist.c?
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > > > > >>>>       hdr->good_bytes = mirror->pg_count;
> > > > > > > > >>>>       hdr->io_completion = desc->pg_io_completion;
> > > > > > > > >>>>       hdr->dreq = desc->pg_dreq;
> > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > >>>> +       if (desc->pg_netfs)
> > > > > > > > >>>> +               hdr->netfs = desc->pg_netfs;
> > > > > > > > >>>> +#endif
> > > > > > > > >>>
> > > > > > > > >>> Why the conditional?
> > > > > > > > >>>
> > > > > > > > >> Not really needed and I was thinking of removing it, so I'll do that.
> > > > > > > > >>
> > > > > > > > >>>>
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > > > > > >>>> *desc,
> > > > > > > > >>>>       desc->pg_lseg = NULL;
> > > > > > > > >>>>       desc->pg_io_completion = NULL;
> > > > > > > > >>>>       desc->pg_dreq = NULL;
> > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > >>>> +       desc->pg_netfs = NULL;
> > > > > > > > >>>> +#endif
> > > > > > > > >>>>
> > > > > > > > >>>>
> > > > > > > > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > > > > >>>>
> > > > > > > > >>>>       desc->pg_io_completion = hdr->io_completion;
> > > > > > > > >>>>       desc->pg_dreq = hdr->dreq;
> > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > >>>> +       desc->pg_netfs = hdr->netfs;
> > > > > > > > >>>> +#endif
> > > > > > > > >>>
> > > > > > > > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > > > > > > > >>>
> > > > > > > > >> Ok.
> > > > > > > > >>
> > > > > > > > >>
> > > > > > > > >>
> > > > > > > > >>>>
> > > > > > > > >>>>
> > > > > > > > >>>>> My expectation is that the standard I/O path should have minimal
> > > > > > > > >>>>> overhead, and should certainly not increase the overhead that we
> > > > > > > > >>>>> already have. Will this be addressed in future iterations of these
> > > > > > > > >>>>> patches?
> > > > > > > > >>>>>
> > > > > > > > >>>>
> > > > > > > > >>>> I will do what I can to satisfy what you want, either by fixing up
> > > > > > > > >>>> this patch or follow-on patches.  Hopefully the above questions
> > > > > > > > >>>> will clarify the next steps.
> > > > > > > > >>>>
> > > > > > > > >>>
> > > > > > > > >>> --
> > > > > > > > >>> Trond Myklebust
> > > > > > > > >>> Linux NFS client maintainer, Hammerspace
> > > > > > > > >>> trond.myklebust@hammerspace.com
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > Trond Myklebust
> > > > > > > > CTO, Hammerspace Inc
> > > > > > > > 1900 S Norfolk St, Suite 350 - #45
> > > > > > > > San Mateo, CA 94403
> > > > > > > >
> > > > > > > > www.hammer.space
> > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
> > > >
David Wysochanski Feb. 9, 2023, 3:09 p.m. UTC | #22
Ben,

Thanks for your interest and sorry for the delay.
I just replied on the v10 patchset thread.


On Mon, Feb 6, 2023 at 12:33 PM Benjamin Maynard <benmaynard@google.com> wrote:
>
> Hi all,
>
> Just pinging this thread for any further updates.
>
> Kind Regards
> Benjamin Maynard
>
>
>
> Kind Regards
>
> Benjamin Maynard
>
> Customer Engineer
>
> benmaynard@google.com
>
> Google, Inc.
>
>
>
>
> On Tue, 3 Jan 2023 at 20:33, Benjamin Maynard <benmaynard@google.com> wrote:
> >
> > Hi all,
> >
> > I just wanted to follow up on this set of patches. As Daire explained
> > below, these patches are really important to a number of us using
> > FS-Cache due to the significant performance regression introduced in
> > 5.17 and above.
> >
> > I'd love to see these patches merged, or some feedback on what changes
> > might be needed.
> >
> > Kind Regards
> > Benjamin Maynard
> >
> > On Thu, 17 Nov 2022 at 11:04, Daire Byrne <daire@dneg.com> wrote:
> > >
> > > Hi,
> > >
> > > I just wanted to take the opportunity to reiterate why these patches
> > > are important to me (and others like Benjamin).
> > >
> > > The "new" fscache that is now in mainline has a major NFS performance
> > > regression from the previous fscache code in pre 5.17 kernels - single
> > > file reads from cache.
> > >
> > > Even if you have the fastest local disk (nvme/ssd) for your fscache,
> > > reading back a cached file (via NFS) now tops out at around 40MB/s
> > > whereas before (old fscache) the local fscache disk speed was the only
> > > limit (e.g. 5000MB/s for NVMe).
> > >
> > > So, in many cases, depending on what you are using fscache for, it can
> > > be faster to read the file over the (gigabit) network than from the
> > > local disk cache which somewhat negates its usefulness. As such, we
> > > mostly use pre-5.17 kernels in production and the old fscache code
> > > which maintains high cache read performance (but has other annoying
> > > issues).
> > >
> > > Now this performance regression might not be noticed too much by
> > > desktop users looking to use fscache on their systems, but it sure
> > > does affect servers (e.g. re-export servers) that want to use fscache
> > > to achieve very high performance.
> > >
> > > I can't really comment on these patches or the approach taken, but I
> > > do hope that we can restore/improve the fscache read performance for
> > > NFS in the mainline kernel as soon as possible (like these patches
> > > do).
> > >
> > > Daire
> > >
> > >
> > > On Mon, 14 Nov 2022 at 21:26, Benjamin Maynard <benmaynard@google.com> wrote:
> > > >
> > > > Thanks Dave, that did the trick!
> > > >
> > > > Building the kernel from
> > > > https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
> > > > and re-running the exact same tests yielded the expected results. Data
> > > > is now being served from /var/cache/fscache.
> > > >
> > > > I also reverted my change to the read ahead, so that read ahead is now
> > > > greater than the rsize. Still works as expected.
> > > >
> > > > I am also seeing much better single file read speeds, and culling is
> > > > working perfectly (not running into the issue we were seeing pre
> > > > 5.17).
> > > >
> > > > Thanks a lot Dave, Jeff and Daire for your help.
> > > >
> > > > Kind Regards
> > > > Benjamin Maynard
> > > >
> > > >
> > > >
> > > > Kind Regards
> > > >
> > > > Benjamin Maynard
> > > >
> > > > Customer Engineer
> > > >
> > > > benmaynard@google.com
> > > >
> > > > Google, Inc.
> > > >
> > > >
> > > >
> > > >
> > > > On Mon, 14 Nov 2022 at 17:35, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > >
> > > > > On Mon, Nov 14, 2022 at 11:04 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > > > >
> > > > > > Hi Dave,
> > > > > >
> > > > > > I've added responses to your questions inline below.
> > > > > >
> > > > > > I also tried adding the noatime option to the mount on the source
> > > > > > filer as Jeff suggested, but this has not made any difference and the
> > > > > > issue is still persisting for me.
> > > > > >
> > > > > > I created the following diagram that explains my setup, and the exact
> > > > > > tests I am performing:
> > > > > > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> > > > > >
> > > > > > Hopefully this is clearer than my explanations below (let me know if
> > > > > > you'd prefer me to share an alternative way).
> > > > > >
> > > > > Yes, that's very helpful.  Let me think about this one as I'm not sure.
> > > > > As Jeff says we may need tracepoints to track it down if I cannot repro
> > > > > it and/or nothing comes to mind.
> > > > >
> > > > > > In order to remove the re-exporting layer of complexity, I also
> > > > > > performed the tests without the re-export server (architecture:
> > > > > > https://drive.google.com/file/d/1DQKhqo_UnQ8ul-z5Iram5LpisDmkKziQ/view?usp=share_link):
> > > > > >
> > > > > > Source NFS Server <-- Client (with FS-Cache)
> > > > > >
> > > > > > The same is happening, I cannot get FS-Cache to serve from cache.
> > > > > > Heavy writes, but no reads, even when the same file is copied many
> > > > > > times.
> > > > > >
> > > > > > I'm pretty sure that with the above you're hitting the drop_caches /
> > > > > "fscache read optimisation" issue #1 I mentioned.
> > > > >
> > > > > I see dhowells just posted a v2 version of his previous patch:
> > > > > https://lore.kernel.org/linux-mm/166844174069.1124521.10890506360974169994.stgit@warthog.procyon.org.uk/
> > > > >
> > > > > I started with 6.1-rc5, added the above dhowells latest patch for that issue,
> > > > > and then my 5 patches on top.  Then I added a small patch to utilize
> > > > > dhowells patch to ensure the read optimisation is removed.  I ran my
> > > > > unit test that has been failing all along and as expected it passes with
> > > > > these patches.  I pushed the series to github:
> > > > > https://github.com/DaveWysochanskiRH/kernel/commits/nfs-fscache-netfs
> > > > > https://github.com/DaveWysochanskiRH/kernel/commit/42f58f3d36d83839022dc2617bb6c2d1b09db65f
> > > > >
> > > > > I will also email you the series of patches on top of 6.1-rc5 so you
> > > > > can just apply from your mailbox if you want.
> > > > >
> > > > >
> > > > >
> > > > > > Hopefully something I am doing wrong on my end, but I can't figure out what.
> > > > > >
> > > > > > Kind Regards
> > > > > > Benjamin Maynard
> > > > > >
> > > > > >
> > > > > > On Mon, 14 Nov 2022 at 13:47, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > >
> > > > > > > I apologize I did not read carefully enough and I missed some details
> > > > > > > in your original post.
> > > > > > > More below.
> > > > > > >
> > > > > > > On Sat, Nov 12, 2022 at 7:47 AM Benjamin Maynard <benmaynard@google.com> wrote:
> > > > > > > >
> > > > > > > > Hi all,
> > > > > > > >
> > > > > > > > I've been doing some more testing with these patches, I applied all of
> > > > > > > > the patches (v10 from
> > > > > > > > https://patchwork.kernel.org/project/linux-nfs/list/?series=691729)
> > > > > > > > apart from Patch 6 (the RFC patch) to version 6.0.8 of the kernel.
> > > > > > > >
> > > > > > > > I have the following setup:
> > > > > > > >
> > > > > > > > Source NFS Server <-- Re-Export Server (with FS-Cache) <-- NFS Client.
> > > > > > > >
> > > > > > > > I have a 500Gb file on the Source NFS Server, which I am then copying
> > > > > > > > to the NFS Client via the Re-Export Server.
> > > > > > > >
> > > > > > > > On the first copy, I see heavy writes to /var/cache/fscache on the
> > > > > > > > re-export server, and once the file copy completes I see that
> > > > > > > > /var/cache/fscache is approximately 500Gb in size. All good so far.
> > > > > > > >
> > > > > > > > I then deleted that file from the NFS Client, and dropped the caches
> > > > > > > > just to be safe (echo 3 > /proc/sys/vm/drop_caches on the NFS Client).
> > > > > > > >
> > > > > > > If you delete the file from the NFS client, how does that not delete the
> > > > > > > file from the original NFS server?
> > > > > >
> > > > > > Sorry - to be clear, I never deleted the file from the NFS mount
> > > > > > (which I know would in turn delete it from the re-export server and
> > > > > > the source filer).
> > > > > >
> > > > > > In order to perform the performance test, I copied the file from the
> > > > > > NFS mount on the NFS Client, to a local directory (cp
> > > > > > /mnt/nfs/500gb.img /tmp).
> > > > > >
> > > > > > When I said "I then deleted that file from the NFS Client", I meant I
> > > > > > deleted the local copy of that file. Not the file on the mount (rm
> > > > > > /tmp/500gb.img).
> > > > > >
> > > > > > Just to also stress, I have never dropped the caches on the Re-Export
> > > > > > Server (the one with FS-Cache) at any point in any of these tests, so
> > > > > > I don't think this is the problem. I have only ever dropped the caches
> > > > > > on the NFS client that is mounting the Re-Export Server.
> > > > > >
> > > > > > > > I then performed another copy of the 500Gb file on the NFS Client,
> > > > > > > > again via the Re-Export Server. What I expected would happen is that I
> > > > > > > > would see heavy reads from the /var/cache/fscache volume as the file
> > > > > > > > should be served from FS-Cache.
> > > > > > > >
> > > > > > > I don't understand this.  When you say you "performed another copy"
> > > > > > > of what file?  Wasn't the file deleted in the above step?
> > > > > >
> > > > > > As above, only the local copy was deleted.
> > > > > >
> > > > > > > > However what I actually saw was no reads whatsoever, FS-Cache seems to
> > > > > > > > be ignored and the file is pulled from the Source NFS Filer again. I
> > > > > > > > also see heavy writes to /var/cache/fscache, so it appears that
> > > > > > > > FS-Cache is overwriting its existing cache, and never using it.
> > > > > > >
> > > > > > > That would happen if the file was changed or re-created.
> > > > > > >
> > > > > > > > I only have 104Gb of memory on the Re-Export Server (with FS-Cache) so
> > > > > > > > it is not possible that the file is being served from the page cache.
> > > > > > > >
> > > > > > > > We saw this behaviour before on an older set of the patches when our
> > > > > > > > mount between the Re-Export Server and the Source NFS Filer was using
> > > > > > > > the "sync" option, but we are now using the "async" option and the
> > > > > > > > same is happening.
> > > > > > > >
> > > > > > > > Mount options:
> > > > > > > >
> > > > > > > > Source NFS Server <-- Re-Export Server (with FS-Cache):
> > > > > > > >
> > > > > > > > 10.0.0.49:/files /srv/nfs/files nfs
> > > > > > > > rw,noatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,acregmin=600,acregmax=600,acdirmin=600,acdirmax=600,hard,nocto,proto=tcp,nconnect=16,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.49,mountvers=3,mountport=37485,mountproto=tcp,fsc,local_lock=none,addr=10.0.0.49
> > > > > > > >
> > > > > > > > Re-Export Server (with FS-Cache) <-- NFS Client:
> > > > > > > >
> > > > > > > > 10.0.0.3:/files /mnt/nfs nfs
> > > > > > > > rw,relatime,vers=3,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.0.0.3,mountvers=3,mountport=20048,mountproto=tcp,local_lock=none,addr=10.0.0.3
> > > > > > > >
> > > > > > > > It is also worth noting this behaviour is not unique to the re-export
> > > > > > > > use case. I see FS-Cache not being used with the following setup:
> > > > > > > >
> > > > > > > > Source NFS Server <-- Client (with FS-Cache).
> > > > > > > >
> > > > > > >
> > > > > > > This points at something more fundamental like something missed
> > > > > > > in the test or maybe a mount option.  Can you explain what test
> > > > > > > you're doing here when you say "this behavior is not unique"?
> > > > > >
> > > > > > I've created the following diagram which explains the test I am
> > > > > > performing. I think it is a little easier to follow than explaining in
> > > > > > text. This should be viewable without any authentication:
> > > > > > https://drive.google.com/file/d/12Xf-9yHCKM4eMr2YGqdSAVfGcximW4OG/view?usp=sharing.
> > > > > >
> > > > > > By "this behaviour is not unique to the re-export use case" I mean
> > > > > > that the same happens if I remove the re-export server completely, and
> > > > > > just have the following setup:
> > > > > >
> > > > > > Source NFS Server <-- Client (with FS-Cache).
> > > > > >
> > > > > > > Can you show the mount options for both:
> > > > > > > - fscache filesystem on the re-export server (/var/cache/fscache)
> > > > > >
> > > > > > root@reexport:~$ mount | grep /var/cache/fscache
> > > > > > /dev/md127 on /var/cache/fscache type ext4
> > > > > > (rw,relatime,discard,nobarrier,stripe=1024)
> > > > > >
> > > > > > > - exported filesystem on the NFS server (filesystem in /etc/exports)
> > > > > >
> > > > > > I have tried both:
> > > > > >
> > > > > > root@source:~$ mount | grep files
> > > > > > /dev/sdb1 on /files type ext4 (rw)
> > > > > >
> > > > > > root@source:~$ cat /etc/exports
> > > > > > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> > > > > >
> > > > > > and (at Jeff's suggestion):
> > > > > >
> > > > > > root@source:~$ mount | grep files
> > > > > > /dev/sdb1 on /files type ext4 (rw,noatime)
> > > > > >
> > > > > > root@source:~$ cat /etc/exports
> > > > > > /files 10.0.0.0/8(rw,sync,wdelay,no_root_squash,no_all_squash,no_subtree_check,sec=sys,secure,nohide)
> > > > > >
> > > > > >
> > > > > > > Unfortunately the problem with drop_caches makes it more difficult
> > > > > > > to know when fscache is truly working.  But some other unit test
> > > > > > > I have shows fscache does work with this patchset so I'm puzzled why
> > > > > > > you're not seeing it work at all.
> > > > > > >
> > > > > > > I pinged dhowells on the drop_caches issue so maybe we can get
> > > > > > > that one sorted out soon but I'm not sure since it's part of a series
> > > > > > > and proposes changes in mm.
> > > > > >
> > > > > > Just to be clear, I have never used drop_caches on the re-export
> > > > > > server in any of these tests. I have only ever done this on the NFS
> > > > > > Client.
> > > > > >
> > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Ben
> > > > > > > >
> > > > > > > >
> > > > > > > > Kind Regards
> > > > > > > >
> > > > > > > > Benjamin Maynard
> > > > > > > >
> > > > > > > > Customer Engineer
> > > > > > > >
> > > > > > > > benmaynard@google.com
> > > > > > > >
> > > > > > > > Google, Inc.
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > On Mon, 31 Oct 2022 at 22:22, Trond Myklebust <trondmy@hammerspace.com> wrote:
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > On Oct 30, 2022, at 19:25, David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Sat, Oct 29, 2022 at 12:46 PM David Wysochanski <dwysocha@redhat.com> wrote:
> > > > > > > > > >>
> > > > > > > > > >> On Fri, Oct 28, 2022 at 12:59 PM Trond Myklebust <trondmy@kernel.org> wrote:
> > > > > > > > > >>>
> > > > > > > > > >>> On Fri, 2022-10-28 at 07:50 -0400, David Wysochanski wrote:
> > > > > > > > > >>>> On Thu, Oct 27, 2022 at 3:16 PM Trond Myklebust <trondmy@kernel.org>
> > > > > > > > > >>>> wrote:
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> On Mon, 2022-10-17 at 06:52 -0400, Dave Wysochanski wrote:
> > > > > > > > > >>>>>> Convert the NFS buffered read code paths to corresponding netfs
> > > > > > > > > >>>>>> APIs,
> > > > > > > > > >>>>>> but only when fscache is configured and enabled.
> > > > > > > > > >>>>>>
> > > > > > > > > >>>>>> The netfs API defines struct netfs_request_ops which must be
> > > > > > > > > >>>>>> filled
> > > > > > > > > >>>>>> in by the network filesystem.  For NFS, we only need to define 5
> > > > > > > > > >>>>>> of
> > > > > > > > > >>>>>> the functions, the main one being the issue_read() function.
> > > > > > > > > >>>>>> The issue_read() function is called by the netfs layer when a
> > > > > > > > > >>>>>> read
> > > > > > > > > >>>>>> cannot be fulfilled locally, and must be sent to the server
> > > > > > > > > >>>>>> (either
> > > > > > > > > >>>>>> the cache is not active, or it is active but the data is not
> > > > > > > > > >>>>>> available).
> > > > > > > > > >>>>>> Once the read from the server is complete, netfs requires a call
> > > > > > > > > >>>>>> to
> > > > > > > > > >>>>>> netfs_subreq_terminated() which conveys either how many bytes
> > > > > > > > > >>>>>> were
> > > > > > > > > >>>>>> read
> > > > > > > > > >>>>>> successfully, or an error.  Note that issue_read() is called with
> > > > > > > > > >>>>>> a
> > > > > > > > > >>>>>> structure, netfs_io_subrequest, which defines the IO requested,
> > > > > > > > > >>>>>> and
> > > > > > > > > >>>>>> contains a start and a length (both in bytes), and assumes the
> > > > > > > > > >>>>>> underlying
> > > > > > > > > > >>>>>> netfs will return either an error on the whole region, or the
> > > > > > > > > >>>>>> number
> > > > > > > > > >>>>>> of bytes successfully read.
> > > > > > > > > >>>>>>
> > > > > > > > > >>>>>> The NFS IO path is page based and the main APIs are the pgio APIs
> > > > > > > > > >>>>>> defined
> > > > > > > > > >>>>>> in pagelist.c.  For the pgio APIs, there is no way for the caller
> > > > > > > > > >>>>>> to
> > > > > > > > > >>>>>> know how many RPCs will be sent and how the pages will be broken
> > > > > > > > > >>>>>> up
> > > > > > > > > >>>>>> into underlying RPCs, each of which will have their own
> > > > > > > > > >>>>>> completion
> > > > > > > > > >>>>>> and
> > > > > > > > > >>>>>> return code.  In contrast, netfs is subrequest based, a single
> > > > > > > > > >>>>>> subrequest may contain multiple pages, and a single subrequest is
> > > > > > > > > >>>>>> initiated with issue_read() and terminated with
> > > > > > > > > >>>>>> netfs_subreq_terminated().
> > > > > > > > > > >>>>>> Thus, to utilize the netfs APIs, NFS needs some way to accommodate
> > > > > > > > > >>>>>> the netfs API requirement on the single response to the whole
> > > > > > > > > >>>>>> subrequest, while also minimizing disruptive changes to the NFS
> > > > > > > > > >>>>>> pgio layer.
> > > > > > > > > >>>>>>
> > > > > > > > > >>>>>> The approach taken with this patch is to allocate a small
> > > > > > > > > >>>>>> structure
> > > > > > > > > >>>>>> for each nfs_netfs_issue_read() call, store the final error and
> > > > > > > > > >>>>>> number
> > > > > > > > > >>>>>> of bytes successfully transferred in the structure, and update
> > > > > > > > > >>>>>> these
> > > > > > > > > >>>>>> values
> > > > > > > > > >>>>>> as each RPC completes.  The refcount on the structure is used as
> > > > > > > > > >>>>>> a
> > > > > > > > > >>>>>> marker
> > > > > > > > > >>>>>> for the last RPC completion, is incremented in
> > > > > > > > > >>>>>> nfs_netfs_read_initiate(),
> > > > > > > > > >>>>>> and decremented inside nfs_netfs_read_completion(), when a
> > > > > > > > > >>>>>> nfs_pgio_header
> > > > > > > > > >>>>>> contains a valid pointer to the data.  On the final put (which
> > > > > > > > > >>>>>> signals
> > > > > > > > > >>>>>> the final outstanding RPC is complete) in
> > > > > > > > > >>>>>> nfs_netfs_read_completion(),
> > > > > > > > > >>>>>> call netfs_subreq_terminated() with either the final error value
> > > > > > > > > >>>>>> (if
> > > > > > > > > >>>>>> one or more READs complete with an error) or the number of bytes
> > > > > > > > > >>>>>> successfully transferred (if all RPCs complete successfully).
> > > > > > > > > >>>>>> Note
> > > > > > > > > >>>>>> that when all RPCs complete successfully, the number of bytes
> > > > > > > > > >>>>>> transferred
> > > > > > > > > >>>>>> is capped to the length of the subrequest.  Capping the
> > > > > > > > > >>>>>> transferred
> > > > > > > > > >>>>>> length
> > > > > > > > > >>>>>> to the subrequest length prevents "Subreq overread" warnings from
> > > > > > > > > >>>>>> netfs.
> > > > > > > > > >>>>>> This is due to the "aligned_len" in nfs_pageio_add_page(), and
> > > > > > > > > >>>>>> the
> > > > > > > > > >>>>>> corner case where NFS requests a full page at the end of the
> > > > > > > > > >>>>>> file,
> > > > > > > > > >>>>>> even when i_size reflects only a partial page (NFS overread).
> > > > > > > > > >>>>>>
> > > > > > > > > >>>>>> Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
> > > > > > > > > >>>>>> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > > > >>>>>
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> This is not doing what I asked for, which was to separate out the
> > > > > > > > > >>>>> fscache functionality, so that we can call that if and when it is
> > > > > > > > > >>>>> available.
> > > > > > > > > >>>>>
> > > > > > > > > >>>> I must have misunderstood then.
> > > > > > > > > >>>>
> > > > > > > > > >>>> The last feedback I have from you was that you wanted it to be
> > > > > > > > > >>>> an opt-in feature, and it was a comment on a previous patch
> > > > > > > > > >>>> to Kconfig.  I was proceeding the best I knew how, but
> > > > > > > > > >>>> let me try to get back on track.
> > > > > > > > > >>>>
> > > > > > > > > >>>>> Instead, it is just wrapping the NFS requests inside netfs
> > > > > > > > > >>>>> requests. As
> > > > > > > > > >>>>> it stands, that means it is just duplicating information, and
> > > > > > > > > >>>>> adding
> > > > > > > > > >>>>> unnecessary overhead to the standard I/O path (extra allocations,
> > > > > > > > > >>>>> extra
> > > > > > > > > >>>>> indirect calls, and extra bloat to the inode).
> > > > > > > > > >>>>>
> > > > > > > > > >>>> I think I understand what you're saying but I'm not sure.  Let me
> > > > > > > > > >>>> ask some clarifying questions.
> > > > > > > > > >>>>
> > > > > > > > > >>>> Are you objecting to the code when CONFIG_NFS_FSCACHE is
> > > > > > > > > >>>> configured?  Or when it is not?  Or both?  I think you're objecting
> > > > > > > > > >>>> when it's configured, but not enabled (we mount without 'fsc').
> > > > > > > > > >>>> Am I right?
> > > > > > > > > >>>>
> > > > > > > > > > >>>> Also, are you objecting to the design that to use fscache we now
> > > > > > > > > >>>> have to use netfs, specifically:
> > > > > > > > > >>>> - call into netfs via either netfs_read_folio or netfs_readahead
> > > > > > > > > >>>> - if fscache is enabled, then the IO can be satisfied from fscache
> > > > > > > > > >>>> - if fscache is not enabled, or some of the IO cannot be satisfied
> > > > > > > > > >>>> from the cache, then NFS is called back via netfs_issue_read
> > > > > > > > > >>>> and we use the normal NFS read pageio interface.  This requires
> > > > > > > > > >>>> we call netfs_subreq_terminated() when all the RPCs complete,
> > > > > > > > > >>>> which is the reason for the small changes to pagelist.c
> > > > > > > > > >>>
> > > > > > > > > >>> I'm objecting to any middle layer "solution" that adds overhead to the
> > > > > > > > > >>> NFS I/O paths.
> > > > > > > > > >>>
> > > > > > > > > >> Got it.
> > > > > > > > > >>
> > > > > > > > > >>> I'm willing to consider solutions that are specific only to the fscache
> > > > > > > > > >>> use case (i.e. when the 'fsc' mount option is specified). However when
> > > > > > > > > >>> I perform a normal NFS mount, and do I/O, then I don't want to see
> > > > > > > > > >>> extra memory allocations, extra indirect calls and larger inode
> > > > > > > > > >>> footprints.
> > > > > > > > > >>>
> > > > > > > > > >>> IOW: I want the code to optimise for the case of standard NFS, not for
> > > > > > > > > >>> the case of 'NFS with cachefs additions'.
> > > > > > > > > >>>
> > > > > > > > > >> I agree completely.  Are you seeing extra memory allocations
> > > > > > > > > > >> happen on mounts without 'fsc' or is it more a concern of how
> > > > > > > > > >> some of the patches look?  We should not be calling any netfs or
> > > > > > > > > >> fscache code if 'fsc' is not on the mount and I don't see any in my
> > > > > > > > > >> testing. So either there's a misunderstanding here, or there's a
> > > > > > > > > >> bug I'm missing.
> > > > > > > > > >>
> > > > > > > > > >> If fscache is not configured, then nfs_netfs_read_folio() and
> > > > > > > > > > >> nfs_netfs_readahead() are wrappers that return -ENOBUFS.
> > > > > > > > > >> If it's configured but not enabled, then the checks for
> > > > > > > > > >> netfs_inode(inode)->cache should skip over any netfs code.
> > > > > > > > > >> But maybe there's a non-obvious bug you're seeing and
> > > > > > > > > >> somehow netfs is still getting called?  Because I cannot
> > > > > > > > > >> see netfs getting called if 'fsc' is not on the mount in my
> > > > > > > > > >> tests.
> > > > > > > > > >>
> > > > > > > > > >> int nfs_netfs_read_folio(struct file *file, struct folio *folio)
> > > > > > > > > >> {
> > > > > > > > > >>       if (!netfs_inode(folio_inode(folio))->cache)
> > > > > > > > > >>               return -ENOBUFS;
> > > > > > > > > >>
> > > > > > > > > >>       return netfs_read_folio(file, folio);
> > > > > > > > > >> }
> > > > > > > > > >>
> > > > > > > > > >> int nfs_netfs_readahead(struct readahead_control *ractl)
> > > > > > > > > >> {
> > > > > > > > > >>       struct inode *inode = ractl->mapping->host;
> > > > > > > > > >>
> > > > > > > > > >>       if (!netfs_inode(inode)->cache)
> > > > > > > > > >>               return -ENOBUFS;
> > > > > > > > > >>
> > > > > > > > > >>       netfs_readahead(ractl);
> > > > > > > > > >>       return 0;
> > > > > > > > > >> }
> > > > > > > > > >>
> > > > > > > > > >>
> > > > > > > > > >>>>
> > > > > > > > > >>>> Can you be more specific as to the portions of the patch you don't
> > > > > > > > > >>>> like
> > > > > > > > > >>>> so I can move it in the right direction?
> > > > > > > > > >>>>
> > > > > > > > > >>>> This is from patch #2 which you didn't comment on.  I'm not sure
> > > > > > > > > >>>> you're
> > > > > > > > > >>>> ok with it though, since you mention "extra bloat to the inode".
> > > > > > > > > >>>> Do you object to this even though it's wrapped in an
> > > > > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE?  If so, do you require no
> > > > > > > > > >>>> extra size be added to nfs_inode?
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > > > >>>>       __u64 write_io;
> > > > > > > > > >>>>       __u64 read_io;
> > > > > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > >>>> -       struct fscache_cookie   *fscache;
> > > > > > > > > >>>> -#endif
> > > > > > > > > >>>> +       struct netfs_inode      netfs; /* netfs context and VFS inode
> > > > > > > > > >>>> */
> > > > > > > > > >>>> +#else
> > > > > > > > > >>>>       struct inode            vfs_inode;
> > > > > > > > > >>>> +#endif
> > > > > > > > > >>>> +
> > > > > > > > > >>>
> > > > > > > > > >>> Ideally, I'd prefer no extra size. I can live with it up to a certain
> > > > > > > > > >>> point, however for now NFS is not unconditionally opting into the netfs
> > > > > > > > > >>> project. If we're to ever do that, then I want to see streamlined code
> > > > > > > > > >>> for the standard I/O case.
> > > > > > > > > >>>
> > > > > > > > > >> Ok and understood about standard I/O case.
> > > > > > > > > >>
> > > > > > > > > >> I was thinking how we might not increase the size, but I don't think
> > > > > > > > > >> I can make it work.
> > > > > > > > > >>
> > > > > > > > > >> I thought we could change to something like the below, without an
> > > > > > > > > >> embedded struct inode:
> > > > > > > > > >>
> > > > > > > > > >> @@ -204,9 +208,11 @@ struct nfs_inode {
> > > > > > > > > >>       __u64 write_io;
> > > > > > > > > >>       __u64 read_io;
> > > > > > > > > >> #ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > >> -       struct fscache_cookie   *fscache;
> > > > > > > > > >> -#endif
> > > > > > > > > >> +       struct netfs_inode      *netfs; /* netfs context and VFS inode */
> > > > > > > > > >> +#else
> > > > > > > > > >>       struct inode            vfs_inode;
> > > > > > > > > >> +#endif
> > > > > > > > > >> +
> > > > > > > > > >>
> > > > > > > > > >> Then I would need to alloc/free a netfs_inode at the time of
> > > > > > > > > >> nfs_inode initiation.  Unfortunately this has the issue that the NFS_I()
> > > > > > > > > >> macro cannot work, because it requires an embedded "struct inode"
> > > > > > > > > >> due to "container_of" use:
> > > > > > > > > >>
> > > > > > > > > >> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > > > >> +{
> > > > > > > > > >> +       return &nfsi->netfs.inode;
> > > > > > > > > >> +}
> > > > > > > > > >> +static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > > > >> +{
> > > > > > > > > >> +       return container_of(inode, struct nfs_inode, netfs.inode);
> > > > > > > > > >> +}
> > > > > > > > > >> +#else
> > > > > > > > > >> +static inline struct inode *VFS_I(struct nfs_inode *nfsi)
> > > > > > > > > >> +{
> > > > > > > > > >> +       return &nfsi->vfs_inode;
> > > > > > > > > >> +}
> > > > > > > > > >> static inline struct nfs_inode *NFS_I(const struct inode *inode)
> > > > > > > > > >> {
> > > > > > > > > >>       return container_of(inode, struct nfs_inode, vfs_inode);
> > > > > > > > > >> }
> > > > > > > > > >> +#endif
> > > > > > > > > >>
> > > > > > > > > >>
> > > > > > > > > >
> > > > > > > > > > Actually Trond maybe we can achieve a "0 length increase" of
> > > > > > > > > > nfs_inode if dhowells would take a patch to modify the definition
> > > > > > > > > > of struct netfs_inode and netfs_inode_init(), something like the WIP
> > > > > > > > > > patch below.  What do you think?
> > > > > > > > >
> > > > > > > > > That works for me.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I think maybe this could be a follow-on patch and if you/dhowells
> > > > > > > > > > think it's an ok idea I can try to work out what is needed across
> > > > > > > > > > the tree.  I thought about it more and I kinda agree that in the
> > > > > > > > > > case for NFS where fscache is "configured but not enabled",
> > > > > > > > > > then even though we're only adding 24 bytes to the nfs_inode
> > > > > > > > > > each time, it will add up so it is worth at least a discussion.
> > > > > > > > > >
> > > > > > > > > > diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> > > > > > > > > > index f2402ddeafbf..195714f1c355 100644
> > > > > > > > > > --- a/include/linux/netfs.h
> > > > > > > > > > +++ b/include/linux/netfs.h
> > > > > > > > > > @@ -118,11 +118,7 @@ enum netfs_io_source {
> > > > > > > > > > typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error,
> > > > > > > > > >                                     bool was_async);
> > > > > > > > > >
> > > > > > > > > > -/*
> > > > > > > > > > - * Per-inode context.  This wraps the VFS inode.
> > > > > > > > > > - */
> > > > > > > > > > -struct netfs_inode {
> > > > > > > > > > -       struct inode            inode;          /* The VFS inode */
> > > > > > > > > > +struct netfs_info {
> > > > > > > > > >       const struct netfs_request_ops *ops;
> > > > > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > > > > >       struct fscache_cookie   *cache;
> > > > > > > > > > @@ -130,6 +126,14 @@ struct netfs_inode {
> > > > > > > > > >       loff_t                  remote_i_size;  /* Size of the remote file */
> > > > > > > > > > };
> > > > > > > > > >
> > > > > > > > > > +/*
> > > > > > > > > > + * Per-inode context.  This wraps the VFS inode.
> > > > > > > > > > + */
> > > > > > > > > > +struct netfs_inode {
> > > > > > > > > > +       struct inode            inode;          /* The VFS inode */
> > > > > > > > > > +       struct netfs_info       *netfs;         /* Rest of netfs data */
> > > > > > > > > > +};
> > > > > > > > > > +
> > > > > > > > > > /*
> > > > > > > > > > * Resources required to do operations on a cache.
> > > > > > > > > > */
> > > > > > > > > > @@ -312,10 +316,12 @@ static inline struct netfs_inode
> > > > > > > > > > *netfs_inode(struct inode *inode)
> > > > > > > > > > static inline void netfs_inode_init(struct netfs_inode *ctx,
> > > > > > > > > >                                   const struct netfs_request_ops *ops)
> > > > > > > > > > {
> > > > > > > > > > -       ctx->ops = ops;
> > > > > > > > > > -       ctx->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > > > > +       ctx->netfs = kzalloc(sizeof(struct netfs_info)), GFP_KERNEL);
> > > > > > > > > > +       /* FIXME: Check for NULL */
> > > > > > > > > > +       ctx->netfs->ops = ops;
> > > > > > > > > > +       ctx->netfs->remote_i_size = i_size_read(&ctx->inode);
> > > > > > > > > > #if IS_ENABLED(CONFIG_FSCACHE)
> > > > > > > > > > -       ctx->cache = NULL;
> > > > > > > > > > +       ctx->netfs->cache = NULL;
> > > > > > > > > > #endif
> > > > > > > > > > }
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >>
> > > > > > > > > >>>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> Are you ok with the stub functions which are placed in fscache.h, and
> > > > > > > > > >>>> when CONFIG_NFS_FSCACHE is not set, become either a no-op
> > > > > > > > > >>>> or a 1-liner (nfs_netfs_readpage_release)?
> > > > > > > > > >>>>
> > > > > > > > > >>>> #else /* CONFIG_NFS_FSCACHE */
> > > > > > > > > >>>> +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
> > > > > > > > > >>>> +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header
> > > > > > > > > >>>> *hdr) {}
> > > > > > > > > >>>> +static inline void nfs_netfs_read_completion(struct nfs_pgio_header
> > > > > > > > > >>>> *hdr) {}
> > > > > > > > > >>>> +static inline void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > > > >>>> +{
> > > > > > > > > >>>> +       unlock_page(req->wb_page);
> > > > > > > > > >>>> +}
> > > > > > > > > >>>> static inline void nfs_fscache_release_super_cookie(struct
> > > > > > > > > >>>> super_block *sb) {}
> > > > > > > > > >>>> static inline void nfs_fscache_init_inode(struct inode *inode) {}
> > > > > > > > > >>>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> Do you object to the below?  If so, then do you want
> > > > > > > > > >>>> #ifdef CONFIG_NFS_FSCACHE here?
> > > > > > > > > >>>>
> > > > > > > > > >>>> -- a/fs/nfs/inode.c
> > > > > > > > > >>>> +++ b/fs/nfs/inode.c
> > > > > > > > > >>>> @@ -2249,6 +2249,8 @@ struct inode *nfs_alloc_inode(struct
> > > > > > > > > >>>> super_block *sb)
> > > > > > > > > >>>> #ifdef CONFIG_NFS_V4_2
> > > > > > > > > >>>>       nfsi->xattr_cache = NULL;
> > > > > > > > > >>>> #endif
> > > > > > > > > >>>> +       nfs_netfs_inode_init(nfsi);
> > > > > > > > > >>>> +
> > > > > > > > > >>>>       return VFS_I(nfsi);
> > > > > > > > > >>>> }
> > > > > > > > > >>>> EXPORT_SYMBOL_GPL(nfs_alloc_i
> > > > > > > > > >>>> node);
> > > > > > > > > >>>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> Do you object to the changes in fs/nfs/read.c?  Specifically,
> > > > > > > > > >>>> how about the below calls to netfs from nfs_read_folio and
> > > > > > > > > >>>> nfs_readahead into equivalent netfs calls?  So when
> > > > > > > > > > >>>> CONFIG_NFS_FSCACHE is set, but fscache is not enabled
> > > > > > > > > >>>> ('fsc' not on mount), these netfs functions do immediately call
> > > > > > > > > >>>> netfs_alloc_request().  But I wonder if we could simply add a
> > > > > > > > > >>>> check to see if fscache is enabled on the mount, and skip
> > > > > > > > > >>>> over to satisfy what you want.  Am I understanding what you
> > > > > > > > > >>>> want?
> > > > > > > > > >>>
> > > > > > > > > >>> Quite frankly, I'd prefer that we just split out the functionality that
> > > > > > > > > >>> is needed from the netfs code so that it can be optimised. However I'm
> > > > > > > > > >>> not interested enough in the cachefs functionality to work on that
> > > > > > > > > >>> myself. ...and as I indicated above, I might be OK with opting into the
> > > > > > > > > >>> netfs project, once the overhead can be made to disappear.
> > > > > > > > > >>>
> > > > > > > > > >> Understood.
> > > > > > > > > >>
> > > > > > > > > >> If you think it makes more sense, I can move some of the nfs_netfs_*
> > > > > > > > > >> functions into a netfs.c file as a starting point.  Or that can maybe
> > > > > > > > > >> be done in a future patchset?
> > > > > > > > > >>
> > > > > > > > > >> For now I was equating netfs and fscache together so we can
> > > > > > > > > >> move on from the much older and single-page limiting fscache
> > > > > > > > > >> interface that is likely to go away soon.
> > > > > > > > > >>
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -355,6 +343,10 @@ int nfs_read_folio(struct file *file, struct
> > > > > > > > > >>>> folio *folio)
> > > > > > > > > >>>>       if (NFS_STALE(inode))
> > > > > > > > > >>>>               goto out_unlock;
> > > > > > > > > >>>>
> > > > > > > > > >>>> +       ret = nfs_netfs_read_folio(file, folio);
> > > > > > > > > >>>> +       if (!ret)
> > > > > > > > > >>>> +               goto out;
> > > > > > > > > >>>> +
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -405,6 +399,10 @@ void nfs_readahead(struct readahead_control
> > > > > > > > > >>>> *ractl)
> > > > > > > > > >>>>       if (NFS_STALE(inode))
> > > > > > > > > >>>>               goto out;
> > > > > > > > > >>>>
> > > > > > > > > >>>> +       ret = nfs_netfs_readahead(ractl);
> > > > > > > > > >>>> +       if (!ret)
> > > > > > > > > >>>> +               goto out;
> > > > > > > > > >>>> +
> > > > > > > > > >>>>
> > > > > > > > > >> The above wrappers should prevent any additional overhead when fscache
> > > > > > > > > >> is not enabled.  As far as I know these work to avoid calling netfs
> > > > > > > > > >> when 'fsc' is not on the mount.
> > > > > > > > > >>
> > > > > > > > > >>>>
> > > > > > > > > >>>> And how about these calls from different points in the read
> > > > > > > > > >>>> path to the earlier mentioned stub functions?
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -110,20 +110,13 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
> > > > > > > > > >>>>
> > > > > > > > > >>>> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > > > >>>> {
> > > > > > > > > >>>> -       struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > > > >>>>       struct page *page = req->wb_page;
> > > > > > > > > >>>>
> > > > > > > > > >>>> -       dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb-
> > > > > > > > > >>>>> s_id,
> > > > > > > > > >>>> -               (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
> > > > > > > > > >>>> -               (long long)req_offset(req));
> > > > > > > > > >>>> -
> > > > > > > > > >>>>       if (nfs_error_is_fatal_on_server(error) && error != -
> > > > > > > > > >>>> ETIMEDOUT)
> > > > > > > > > >>>>               SetPageError(page);
> > > > > > > > > >>>> -       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
> > > > > > > > > >>>> -               if (PageUptodate(page))
> > > > > > > > > >>>> -                       nfs_fscache_write_page(inode, page);
> > > > > > > > > >>>> -               unlock_page(page);
> > > > > > > > > >>>> -       }
> > > > > > > > > >>>> +       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > > > >>>> +               nfs_netfs_readpage_release(req);
> > > > > > > > > >>>> +
> > > > > > > > > >>>
> > > > > > > > > >>> I'm not seeing the value of wrapping unlock_page(), no... That code is
> > > > > > > > > >>> going to need to change when we move it to use folios natively anyway.
> > > > > > > > > >>>
> > > > > > > > > >> Ok, how about I make it conditional on whether fscache is configured
> > > > > > > > > >> and enabled then, similar to the nfs_netfs_read_folio() and
> > > > > > > > > >> nfs_netfs_readahead()?  Below is what that would look like.
> > > > > > > > > >> I could inline the code in nfs_netfs_readpage_release() if you
> > > > > > > > > >> think it would be clearer.
> > > > > > > > > >>
> > > > > > > > > >> static void nfs_readpage_release(struct nfs_page *req, int error)
> > > > > > > > > >> {
> > > > > > > > > >>       struct page *page = req->wb_page;
> > > > > > > > > >>
> > > > > > > > > >>       if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
> > > > > > > > > >>               SetPageError(page);
> > > > > > > > > >>       if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
> > > > > > > > > >> #ifndef CONFIG_NFS_FSCACHE
> > > > > > > > > >>               unlock_page(req->wb_page);
> > > > > > > > > >> #else
> > > > > > > > > >>               nfs_netfs_readpage_release(req);
> > > > > > > > > >> #endif
> > > > > > > > > >>       nfs_release_request(req);
> > > > > > > > > >> }
> > > > > > > > > >>
> > > > > > > > > >>
> > > > > > > > > >> void nfs_netfs_readpage_release(struct nfs_page *req)
> > > > > > > > > >> {
> > > > > > > > > >>   struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
> > > > > > > > > >>
> > > > > > > > > >>   /*
> > > > > > > > > >>    * If fscache is enabled, netfs will unlock pages.
> > > > > > > > > >>    */
> > > > > > > > > >>   if (netfs_inode(inode)->cache)
> > > > > > > > > >>       return;
> > > > > > > > > >>
> > > > > > > > > >>   unlock_page(req->wb_page);
> > > > > > > > > >> }
> > > > > > > > > >>
> > > > > > > > > >>
> > > > > > > > > >>>>       nfs_release_request(req);
> > > > > > > > > >>>> }
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -177,6 +170,8 @@ static void nfs_read_completion(struct
> > > > > > > > > >>>> nfs_pgio_header *hdr)
> > > > > > > > > >>>>               nfs_list_remove_request(req);
> > > > > > > > > >>>>               nfs_readpage_release(req, error);
> > > > > > > > > >>>>       }
> > > > > > > > > >>>> +       nfs_netfs_read_completion(hdr);
> > > > > > > > > >>>> +
> > > > > > > > > >>>> out:
> > > > > > > > > >>>>       hdr->release(hdr);
> > > > > > > > > >>>> }
> > > > > > > > > >>>> @@ -187,6 +182,7 @@ static void nfs_initiate_read(struct
> > > > > > > > > >>>> nfs_pgio_header *hdr,
> > > > > > > > > >>>>                             struct rpc_task_setup *task_setup_data,
> > > > > > > > > >>>> int how)
> > > > > > > > > >>>> {
> > > > > > > > > >>>>       rpc_ops->read_setup(hdr, msg);
> > > > > > > > > >>>> +       nfs_netfs_initiate_read(hdr);
> > > > > > > > > >>>>       trace_nfs_initiate_read(hdr);
> > > > > > > > > >>>> }
> > > > > > > > > >>>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> Are you ok with these additions?  Something like this would
> > > > > > > > > >>>> be required in the case of fscache configured and enabled,
> > > > > > > > > >>>> because we could have some of the data in a read in
> > > > > > > > > >>>> fscache, and some not.  That is the reason for the netfs
> > > > > > > > > >>>> design, and why we need to be able to call the normal
> > > > > > > > > >>>> NFS read IO path (netfs calls into issue_read, and we call
> > > > > > > > > >>>> back via netfs_subreq_terminated)?
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -101,6 +101,9 @@ struct nfs_pageio_descriptor {
> > > > > > > > > >>>>       struct pnfs_layout_segment *pg_lseg;
> > > > > > > > > >>>>       struct nfs_io_completion *pg_io_completion;
> > > > > > > > > >>>>       struct nfs_direct_req   *pg_dreq;
> > > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > >>>> +       void                    *pg_netfs;
> > > > > > > > > >>>> +#endif
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -1619,6 +1619,9 @@ struct nfs_pgio_header {
> > > > > > > > > >>>>       const struct nfs_rw_ops *rw_ops;
> > > > > > > > > >>>>       struct nfs_io_completion *io_completion;
> > > > > > > > > >>>>       struct nfs_direct_req   *dreq;
> > > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > >>>> +       void                    *netfs;
> > > > > > > > > >>>> +#endif
> > > > > > > > > >>>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> And these additions to pagelist.c?
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -68,6 +69,10 @@ void nfs_pgheader_init(struct
> > > > > > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > > > > > >>>>       hdr->good_bytes = mirror->pg_count;
> > > > > > > > > >>>>       hdr->io_completion = desc->pg_io_completion;
> > > > > > > > > >>>>       hdr->dreq = desc->pg_dreq;
> > > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > >>>> +       if (desc->pg_netfs)
> > > > > > > > > >>>> +               hdr->netfs = desc->pg_netfs;
> > > > > > > > > >>>> +#endif
> > > > > > > > > >>>
> > > > > > > > > >>> Why the conditional?
> > > > > > > > > >>>
> > > > > > > > > >> Not really needed and I was thinking of removing it, so I'll do that.
> > > > > > > > > >>
> > > > > > > > > >>>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -846,6 +851,9 @@ void nfs_pageio_init(struct nfs_pageio_descriptor
> > > > > > > > > >>>> *desc,
> > > > > > > > > >>>>       desc->pg_lseg = NULL;
> > > > > > > > > >>>>       desc->pg_io_completion = NULL;
> > > > > > > > > >>>>       desc->pg_dreq = NULL;
> > > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > >>>> +       desc->pg_netfs = NULL;
> > > > > > > > > >>>> +#endif
> > > > > > > > > >>>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> @@ -1360,6 +1369,9 @@ int nfs_pageio_resend(struct
> > > > > > > > > >>>> nfs_pageio_descriptor *desc,
> > > > > > > > > >>>>
> > > > > > > > > >>>>       desc->pg_io_completion = hdr->io_completion;
> > > > > > > > > >>>>       desc->pg_dreq = hdr->dreq;
> > > > > > > > > >>>> +#ifdef CONFIG_NFS_FSCACHE
> > > > > > > > > >>>> +       desc->pg_netfs = hdr->netfs;
> > > > > > > > > >>>> +#endif
> > > > > > > > > >>>
> > > > > > > > > >>> Those all need wrapper functions instead of embedding #ifdefs.
> > > > > > > > > >>>
> > > > > > > > > >> Ok.
> > > > > > > > > >>
> > > > > > > > > >>
> > > > > > > > > >>
> > > > > > > > > >>>>
> > > > > > > > > >>>>
> > > > > > > > > >>>>> My expectation is that the standard I/O path should have minimal
> > > > > > > > > >>>>> overhead, and should certainly not increase the overhead that we
> > > > > > > > > >>>>> already have. Will this be addressed in future iterations of these
> > > > > > > > > >>>>> patches?
> > > > > > > > > >>>>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> I will do what I can to satisfy what you want, either by fixing up
> > > > > > > > > >>>> this patch or follow-on patches.  Hopefully the above questions
> > > > > > > > > >>>> will clarify the next steps.
> > > > > > > > > >>>>
> > > > > > > > > >>>
> > > > > > > > > >>> --
> > > > > > > > > >>> Trond Myklebust
> > > > > > > > > >>> Linux NFS client maintainer, Hammerspace
> > > > > > > > > >>> trond.myklebust@hammerspace.com
> > > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Trond Myklebust
> > > > > > > > > CTO, Hammerspace Inc
> > > > > > > > > 1900 S Norfolk St, Suite 350 - #45
> > > > > > > > > San Mateo, CA 94403
> > > > > > > > >
> > > > > > > > > www.hammer.space
> > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > >
> > > > >
>
diff mbox series

Patch

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index a6fc1c8b6644..58462f6579d6 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -15,6 +15,9 @@ 
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/iversion.h>
+#include <linux/xarray.h>
+#include <linux/fscache.h>
+#include <linux/netfs.h>
 
 #include "internal.h"
 #include "iostat.h"
@@ -184,7 +187,7 @@  void nfs_fscache_init_inode(struct inode *inode)
  */
 void nfs_fscache_clear_inode(struct inode *inode)
 {
-	fscache_relinquish_cookie(netfs_i_cookie(&NFS_I(inode)->netfs), false);
+	fscache_relinquish_cookie(netfs_i_cookie(netfs_inode(inode)), false);
 	netfs_inode(inode)->cache = NULL;
 }
 
@@ -210,7 +213,7 @@  void nfs_fscache_clear_inode(struct inode *inode)
 void nfs_fscache_open_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
+	struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
 	bool open_for_write = inode_is_open_for_write(inode);
 
 	if (!fscache_cookie_valid(cookie))
@@ -228,119 +231,160 @@  EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
 void nfs_fscache_release_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
+	struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
 	loff_t i_size = i_size_read(inode);
 
 	nfs_fscache_update_auxdata(&auxdata, inode);
 	fscache_unuse_cookie(cookie, &auxdata, &i_size);
 }
 
-/*
- * Fallback page reading interface.
- */
-static int fscache_fallback_read_page(struct inode *inode, struct page *page)
+int nfs_netfs_read_folio(struct file *file, struct folio *folio)
 {
-	struct netfs_cache_resources cres;
-	struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
-	struct iov_iter iter;
-	struct bio_vec bvec[1];
-	int ret;
-
-	memset(&cres, 0, sizeof(cres));
-	bvec[0].bv_page		= page;
-	bvec[0].bv_offset	= 0;
-	bvec[0].bv_len		= PAGE_SIZE;
-	iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
-
-	ret = fscache_begin_read_operation(&cres, cookie);
-	if (ret < 0)
-		return ret;
-
-	ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL,
-			   NULL, NULL);
-	fscache_end_operation(&cres);
-	return ret;
+	if (!netfs_inode(folio_inode(folio))->cache)
+		return -ENOBUFS;
+
+	return netfs_read_folio(file, folio);
 }
 
-/*
- * Fallback page writing interface.
- */
-static int fscache_fallback_write_page(struct inode *inode, struct page *page,
-				       bool no_space_allocated_yet)
+int nfs_netfs_readahead(struct readahead_control *ractl)
 {
-	struct netfs_cache_resources cres;
-	struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs);
-	struct iov_iter iter;
-	struct bio_vec bvec[1];
-	loff_t start = page_offset(page);
-	size_t len = PAGE_SIZE;
-	int ret;
-
-	memset(&cres, 0, sizeof(cres));
-	bvec[0].bv_page		= page;
-	bvec[0].bv_offset	= 0;
-	bvec[0].bv_len		= PAGE_SIZE;
-	iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
-
-	ret = fscache_begin_write_operation(&cres, cookie);
-	if (ret < 0)
-		return ret;
-
-	ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
-				      no_space_allocated_yet);
-	if (ret == 0)
-		ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL);
-	fscache_end_operation(&cres);
-	return ret;
+	struct inode *inode = ractl->mapping->host;
+
+	if (!netfs_inode(inode)->cache)
+		return -ENOBUFS;
+
+	netfs_readahead(ractl);
+	return 0;
 }
 
-/*
- * Retrieve a page from fscache
- */
-int __nfs_fscache_read_page(struct inode *inode, struct page *page)
+atomic_t nfs_netfs_debug_id;
+static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file)
 {
-	int ret;
+	rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file));
+	rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id);
 
-	trace_nfs_fscache_read_page(inode, page);
-	if (PageChecked(page)) {
-		ClearPageChecked(page);
-		ret = 1;
-		goto out;
-	}
+	return 0;
+}
 
-	ret = fscache_fallback_read_page(inode, page);
-	if (ret < 0) {
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
-		SetPageChecked(page);
-		goto out;
-	}
+static void nfs_netfs_free_request(struct netfs_io_request *rreq)
+{
+	put_nfs_open_context(rreq->netfs_priv);
+}
 
-	/* Read completed synchronously */
-	nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
-	SetPageUptodate(page);
-	ret = 0;
-out:
-	trace_nfs_fscache_read_page_exit(inode, page, ret);
-	return ret;
+static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
+{
+	return fscache_begin_read_operation(&rreq->cache_resources,
+					    netfs_i_cookie(netfs_inode(rreq->inode)));
 }
 
-/*
- * Store a newly fetched page in fscache.  We can be certain there's no page
- * stored in the cache as yet otherwise we would've read it from there.
- */
-void __nfs_fscache_write_page(struct inode *inode, struct page *page)
+static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
 {
-	int ret;
+	struct nfs_netfs_io_data *netfs;
+
+	netfs = kzalloc(sizeof(*netfs), GFP_KERNEL_ACCOUNT);
+	if (!netfs)
+		return NULL;
+	netfs->sreq = sreq;
+	refcount_set(&netfs->refcount, 1);
+	return netfs;
+}
 
-	trace_nfs_fscache_write_page(inode, page);
+static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq)
+{
+	size_t	rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize;
 
-	ret = fscache_fallback_write_page(inode, page, true);
+	sreq->len = min(sreq->len, rsize);
+	return true;
+}
 
-	if (ret != 0) {
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
-	} else {
-		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
+static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
+{
+	struct nfs_netfs_io_data	*netfs;
+	struct nfs_pageio_descriptor	pgio;
+	struct inode *inode = sreq->rreq->inode;
+	struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
+	struct page *page;
+	int err;
+	pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
+	pgoff_t last = ((sreq->start + sreq->len -
+			 sreq->transferred - 1) >> PAGE_SHIFT);
+	XA_STATE(xas, &sreq->rreq->mapping->i_pages, start);
+
+	nfs_pageio_init_read(&pgio, inode, false,
+			     &nfs_async_read_completion_ops);
+
+	netfs = nfs_netfs_alloc(sreq);
+	if (!netfs)
+		return netfs_subreq_terminated(sreq, -ENOMEM, false);
+
+	pgio.pg_netfs = netfs; /* used in completion */
+
+	xas_lock(&xas);
+	xas_for_each(&xas, page, last) {
+		/* nfs_pageio_add_page() may schedule() due to pNFS layout and other RPCs */
+		xas_pause(&xas);
+		xas_unlock(&xas);
+		err = nfs_pageio_add_page(&pgio, ctx, page);
+		if (err < 0) {
+			netfs->error = err;
+			goto out;
+		}
+		xas_lock(&xas);
 	}
-	trace_nfs_fscache_write_page_exit(inode, page, ret);
+	xas_unlock(&xas);
+out:
+	nfs_pageio_complete_read(&pgio);
+	nfs_netfs_put(netfs);
+}
+
+void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr)
+{
+	struct nfs_netfs_io_data        *netfs = hdr->netfs;
+
+	if (!netfs)
+		return;
+
+	nfs_netfs_get(netfs);
+}
+
+void nfs_netfs_readpage_release(struct nfs_page *req)
+{
+	struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
+
+	/*
+	 * If fscache is enabled, netfs will unlock pages.
+	 */
+	if (netfs_inode(inode)->cache)
+		return;
+
+	unlock_page(req->wb_page);
 }
+
+void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
+{
+	struct nfs_netfs_io_data        *netfs = hdr->netfs;
+	struct netfs_io_subrequest      *sreq;
+
+	if (!netfs)
+		return;
+
+	sreq = netfs->sreq;
+	if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
+		__set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags);
+
+	if (hdr->error)
+		netfs->error = hdr->error;
+	else
+		atomic64_add(hdr->res.count, &netfs->transferred);
+
+	nfs_netfs_put(netfs);
+	hdr->netfs = NULL;
+}
+
+const struct netfs_request_ops nfs_netfs_ops = {
+	.init_request		= nfs_netfs_init_request,
+	.free_request		= nfs_netfs_free_request,
+	.begin_cache_operation	= nfs_netfs_begin_cache_operation,
+	.issue_read		= nfs_netfs_issue_read,
+	.clamp_length		= nfs_netfs_clamp_length
+};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 38614ed8f951..8d9d916b3a86 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -34,6 +34,58 @@  struct nfs_fscache_inode_auxdata {
 	u64	change_attr;
 };
 
+struct nfs_netfs_io_data {
+	/*
+	 * NFS may split a netfs_io_subrequest into multiple RPCs, each
+	 * with their own read completion.  In netfs, we can only call
+	 * netfs_subreq_terminated() once for each subrequest.  Use the
+	 * refcount here to double as a marker of the last RPC completion,
+	 * and only call netfs via netfs_subreq_terminated() once.
+	 */
+	refcount_t			refcount;
+	struct netfs_io_subrequest	*sreq;
+
+	/*
+	 * Final disposition of the netfs_io_subrequest, sent in
+	 * netfs_subreq_terminated()
+	 */
+	atomic64_t	transferred;
+	int		error;
+};
+
+static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs)
+{
+	refcount_inc(&netfs->refcount);
+}
+
+static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
+{
+	ssize_t final_len;
+
+	/* Only the last RPC completion should call netfs_subreq_terminated() */
+	if (!refcount_dec_and_test(&netfs->refcount))
+		return;
+
+	/*
+	 * The NFS pageio interface may read a complete page, even when netfs
+	 * only asked for a partial page.  Specifically, this may be seen when
+	 * one thread is truncating a file while another one is reading the last
+	 * page of the file.
+	 * Correct the final length here to be no larger than the netfs subrequest
+	 * length, and prevent netfs from complaining with "Subreq overread".
+	 */
+	final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred));
+	netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false);
+	kfree(netfs);
+}
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
+{
+	netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+}
+extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
+extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
+extern void nfs_netfs_readpage_release(struct nfs_page *req);
+
 /*
  * fscache.c
  */
@@ -44,9 +96,8 @@  extern void nfs_fscache_init_inode(struct inode *);
 extern void nfs_fscache_clear_inode(struct inode *);
 extern void nfs_fscache_open_file(struct inode *, struct file *);
 extern void nfs_fscache_release_file(struct inode *, struct file *);
-
-extern int __nfs_fscache_read_page(struct inode *, struct page *);
-extern void __nfs_fscache_write_page(struct inode *, struct page *);
+extern int nfs_netfs_readahead(struct readahead_control *ractl);
+extern int nfs_netfs_read_folio(struct file *file, struct folio *folio);
 
 static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
 {
@@ -54,34 +105,11 @@  static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
 		if (current_is_kswapd() || !(gfp & __GFP_FS))
 			return false;
 		folio_wait_fscache(folio);
-		fscache_note_page_release(netfs_i_cookie(&NFS_I(folio->mapping->host)->netfs));
-		nfs_inc_fscache_stats(folio->mapping->host,
-				      NFSIOS_FSCACHE_PAGES_UNCACHED);
 	}
+	fscache_note_page_release(netfs_i_cookie(&NFS_I(folio->mapping->host)->netfs));
 	return true;
 }
 
-/*
- * Retrieve a page from an inode data storage object.
- */
-static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
-{
-	if (netfs_inode(inode)->cache)
-		return __nfs_fscache_read_page(inode, page);
-	return -ENOBUFS;
-}
-
-/*
- * Store a page newly fetched from the server in an inode data storage object
- * in the cache.
- */
-static inline void nfs_fscache_write_page(struct inode *inode,
-					   struct page *page)
-{
-	if (netfs_inode(inode)->cache)
-		__nfs_fscache_write_page(inode, page);
-}
-
 static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
 					      struct inode *inode)
 {
@@ -118,6 +146,13 @@  static inline const char *nfs_server_fscache_state(struct nfs_server *server)
 }
 
 #else /* CONFIG_NFS_FSCACHE */
+static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {}
+static inline void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) {}
+static inline void nfs_netfs_readpage_release(struct nfs_page *req)
+{
+	unlock_page(req->wb_page);
+}
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
 
 static inline void nfs_fscache_init_inode(struct inode *inode) {}
@@ -125,16 +160,19 @@  static inline void nfs_fscache_clear_inode(struct inode *inode) {}
 static inline void nfs_fscache_open_file(struct inode *inode,
 					 struct file *filp) {}
 static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {}
-
-static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+static inline int nfs_netfs_readahead(struct readahead_control *ractl)
 {
-	return true; /* may release folio */
+	return -ENOBUFS;
 }
-static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
+static inline int nfs_netfs_read_folio(struct file *file, struct folio *folio)
 {
 	return -ENOBUFS;
 }
-static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {}
+
+static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
+{
+	return true; /* may release folio */
+}
 static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {}
 
 static inline const char *nfs_server_fscache_state(struct nfs_server *server)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index aa2aec785ab5..b36a02b932e8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2249,6 +2249,8 @@  struct inode *nfs_alloc_inode(struct super_block *sb)
 #ifdef CONFIG_NFS_V4_2
 	nfsi->xattr_cache = NULL;
 #endif
+	nfs_netfs_inode_init(nfsi);
+
 	return VFS_I(nfsi);
 }
 EXPORT_SYMBOL_GPL(nfs_alloc_inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4d240ac4430f..e9c0e4abc954 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -453,6 +453,10 @@  extern void nfs_sb_deactive(struct super_block *sb);
 extern int nfs_client_for_each_server(struct nfs_client *clp,
 				      int (*fn)(struct nfs_server *, void *),
 				      void *data);
+#ifdef CONFIG_NFS_FSCACHE
+extern const struct netfs_request_ops nfs_netfs_ops;
+#endif
+
 /* io.c */
 extern void nfs_start_io_read(struct inode *inode);
 extern void nfs_end_io_read(struct inode *inode);
@@ -482,9 +486,14 @@  extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool
 
 struct nfs_pgio_completion_ops;
 /* read.c */
+extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
 extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 			struct inode *inode, bool force_mds,
 			const struct nfs_pgio_completion_ops *compl_ops);
+extern int nfs_pageio_add_page(struct nfs_pageio_descriptor *pgio,
+			       struct nfs_open_context *ctx,
+			       struct page *page);
+extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 317cedfa52bf..e28754476d1b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -25,6 +25,7 @@ 
 #include "internal.h"
 #include "pnfs.h"
 #include "nfstrace.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -68,6 +69,10 @@  void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 	hdr->good_bytes = mirror->pg_count;
 	hdr->io_completion = desc->pg_io_completion;
 	hdr->dreq = desc->pg_dreq;
+#ifdef CONFIG_NFS_FSCACHE
+	if (desc->pg_netfs)
+		hdr->netfs = desc->pg_netfs;
+#endif
 	hdr->release = release;
 	hdr->completion_ops = desc->pg_completion_ops;
 	if (hdr->completion_ops->init_hdr)
@@ -846,6 +851,9 @@  void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_lseg = NULL;
 	desc->pg_io_completion = NULL;
 	desc->pg_dreq = NULL;
+#ifdef CONFIG_NFS_FSCACHE
+	desc->pg_netfs = NULL;
+#endif
 	desc->pg_bsize = bsize;
 
 	desc->pg_mirror_count = 1;
@@ -940,6 +948,7 @@  int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 	/* Set up the argument struct */
 	nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo);
 	desc->pg_rpc_callops = &nfs_pgio_common_ops;
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_generic_pgio);
@@ -1360,6 +1369,9 @@  int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
 
 	desc->pg_io_completion = hdr->io_completion;
 	desc->pg_dreq = hdr->dreq;
+#ifdef CONFIG_NFS_FSCACHE
+	desc->pg_netfs = hdr->netfs;
+#endif
 	list_splice_init(&hdr->pages, &pages);
 	while (!list_empty(&pages)) {
 		struct nfs_page *req = nfs_list_entry(pages.next);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 525e82ea9a9e..fdfebca017fc 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -30,7 +30,7 @@ 
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
-static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
 static const struct nfs_rw_ops nfs_rw_read_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
@@ -74,7 +74,7 @@  void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
 
-static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
+void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio)
 {
 	struct nfs_pgio_mirror *pgm;
 	unsigned long npages;
@@ -110,20 +110,13 @@  EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
 static void nfs_readpage_release(struct nfs_page *req, int error)
 {
-	struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
 	struct page *page = req->wb_page;
 
-	dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
-		(unsigned long long)NFS_FILEID(inode), req->wb_bytes,
-		(long long)req_offset(req));
-
 	if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
 		SetPageError(page);
-	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
-		if (PageUptodate(page))
-			nfs_fscache_write_page(inode, page);
-		unlock_page(page);
-	}
+	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
+		nfs_netfs_readpage_release(req);
+
 	nfs_release_request(req);
 }
 
@@ -177,6 +170,8 @@  static void nfs_read_completion(struct nfs_pgio_header *hdr)
 		nfs_list_remove_request(req);
 		nfs_readpage_release(req, error);
 	}
+	nfs_netfs_read_completion(hdr);
+
 out:
 	hdr->release(hdr);
 }
@@ -187,6 +182,7 @@  static void nfs_initiate_read(struct nfs_pgio_header *hdr,
 			      struct rpc_task_setup *task_setup_data, int how)
 {
 	rpc_ops->read_setup(hdr, msg);
+	nfs_netfs_initiate_read(hdr);
 	trace_nfs_initiate_read(hdr);
 }
 
@@ -202,7 +198,7 @@  nfs_async_read_error(struct list_head *head, int error)
 	}
 }
 
-static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
+const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
 	.error_cleanup = nfs_async_read_error,
 	.completion = nfs_read_completion,
 };
@@ -294,12 +290,6 @@  nfs_pageio_add_page(struct nfs_pageio_descriptor *pgio,
 
 	aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE);
 
-	if (!IS_SYNC(page->mapping->host)) {
-		error = nfs_fscache_read_page(page->mapping->host, page);
-		if (error == 0)
-			goto out_unlock;
-	}
-
 	new = nfs_create_request(ctx, page, 0, aligned_len);
 	if (IS_ERR(new))
 		goto out_error;
@@ -315,8 +305,6 @@  nfs_pageio_add_page(struct nfs_pageio_descriptor *pgio,
 	return 0;
 out_error:
 	error = PTR_ERR(new);
-out_unlock:
-	unlock_page(page);
 out:
 	return error;
 }
@@ -355,6 +343,10 @@  int nfs_read_folio(struct file *file, struct folio *folio)
 	if (NFS_STALE(inode))
 		goto out_unlock;
 
+	ret = nfs_netfs_read_folio(file, folio);
+	if (!ret)
+		goto out;
+
 	if (file == NULL) {
 		ret = -EBADF;
 		ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
@@ -368,8 +360,10 @@  int nfs_read_folio(struct file *file, struct folio *folio)
 			     &nfs_async_read_completion_ops);
 
 	ret = nfs_pageio_add_page(&pgio, ctx, page);
-	if (ret)
-		goto out;
+	if (ret) {
+		put_nfs_open_context(ctx);
+		goto out_unlock;
+	}
 
 	nfs_pageio_complete_read(&pgio);
 	ret = pgio.pg_error < 0 ? pgio.pg_error : 0;
@@ -378,12 +372,12 @@  int nfs_read_folio(struct file *file, struct folio *folio)
 		if (!PageUptodate(page) && !ret)
 			ret = xchg(&ctx->error, 0);
 	}
-out:
 	put_nfs_open_context(ctx);
-	trace_nfs_aop_readpage_done(inode, page, ret);
-	return ret;
+	goto out;
+
 out_unlock:
 	unlock_page(page);
+out:
 	trace_nfs_aop_readpage_done(inode, page, ret);
 	return ret;
 }
@@ -405,6 +399,10 @@  void nfs_readahead(struct readahead_control *ractl)
 	if (NFS_STALE(inode))
 		goto out;
 
+	ret = nfs_netfs_readahead(ractl);
+	if (!ret)
+		goto out;
+
 	if (file == NULL) {
 		ret = -EBADF;
 		ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index ba7e2e4b0926..8eeb16d9bacd 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -101,6 +101,9 @@  struct nfs_pageio_descriptor {
 	struct pnfs_layout_segment *pg_lseg;
 	struct nfs_io_completion *pg_io_completion;
 	struct nfs_direct_req	*pg_dreq;
+#ifdef CONFIG_NFS_FSCACHE
+	void			*pg_netfs;
+#endif
 	unsigned int		pg_bsize;	/* default bsize for mirrors */
 
 	u32			pg_mirror_count;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e86cf6642d21..e196ef595908 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1619,6 +1619,9 @@  struct nfs_pgio_header {
 	const struct nfs_rw_ops	*rw_ops;
 	struct nfs_io_completion *io_completion;
 	struct nfs_direct_req	*dreq;
+#ifdef CONFIG_NFS_FSCACHE
+	void			*netfs;
+#endif
 
 	int			pnfs_error;
 	int			error;		/* merge with pnfs_error */