diff mbox series

[rfc,2/2] NFSD: allow client to use write delegation stateid for READ

Message ID 20240724170138.1942307-2-sagi@grimberg.me (mailing list archive)
State New
Headers show
Series [rfc,1/2] nfsd: don't assume copy notify when preprocessing the stateid | expand

Commit Message

Sagi Grimberg July 24, 2024, 5:01 p.m. UTC
Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
Stateid and Locking.

In addition, for anonymous stateids, check for pending delegations by
the filehandle and client_id, and if a conflict found, recall the delegation
before allowing the read to take place.

Suggested-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
 fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4xdr.c   |  9 +++++++++
 fs/nfsd/state.h     |  2 ++
 fs/nfsd/xdr4.h      |  2 ++
 5 files changed, 80 insertions(+), 2 deletions(-)

Comments

Olga Kornievskaia July 24, 2024, 7:29 p.m. UTC | #1
On Wed, Jul 24, 2024 at 1:01 PM Sagi Grimberg <sagi@grimberg.me> wrote:
>
> Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
> stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
> Stateid and Locking.
>
> In addition, for anonymous stateids, check for pending delegations by
> the filehandle and client_id, and if a conflict found, recall the delegation
> before allowing the read to take place.
>
> Suggested-by: Dai Ngo <dai.ngo@oracle.com>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>  fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
>  fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs4xdr.c   |  9 +++++++++
>  fs/nfsd/state.h     |  2 ++
>  fs/nfsd/xdr4.h      |  2 ++
>  5 files changed, 80 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 7b70309ad8fb..324984ec70c6 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>         /* check stateid */
>         status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
>                                         &read->rd_stateid, RD_STATE,
> -                                       &read->rd_nf, NULL);
> -
> +                                       &read->rd_nf, &read->rd_wd_stid);

Now I can see that this patch wants to leverage the "returned stateid
of the nfs4_preprocess_stateid_op() but the logic in the previous
patch was in the way because it distinguished the copy_notify by the
non-null passed in stateid. So I would suggest that in order to not
break the copy_notify and help with this functionality something else
is sent into nfs4_proprocess_staetid_op() to allow for the stateid to
be passed and then distinguish between copy_notify and read.

> +       /*
> +        * rd_wd_stid is needed for nfsd4_encode_read to allow write
> +        * delegation stateid used for read. Its refcount is decremented
> +        * by nfsd4_read_release when read is done.
> +        */
> +       if (!status) {
> +               if (!read->rd_wd_stid) {
> +                       /* special stateid? */
> +                       status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
> +                               &cstate->current_fh);
> +               } else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
> +                          delegstateid(read->rd_wd_stid)->dl_type !=
> +                                               NFS4_OPEN_DELEGATE_WRITE) {
> +                       nfs4_put_stid(read->rd_wd_stid);
> +                       read->rd_wd_stid = NULL;
> +               }
> +       }
>         read->rd_rqstp = rqstp;
>         read->rd_fhp = &cstate->current_fh;
>         return status;
> @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  static void
>  nfsd4_read_release(union nfsd4_op_u *u)
>  {
> +       if (u->read.rd_wd_stid)
> +               nfs4_put_stid(u->read.rd_wd_stid);
>         if (u->read.rd_nf)
>                 nfsd_file_put(u->read.rd_nf);
>         trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index dc61a8adfcd4..7e6b9fb31a4c 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>         get_stateid(cstate, &u->write.wr_stateid);
>  }
>
> +/**
> + * nfsd4_deleg_read_conflict - Recall if read causes conflict
> + * @rqstp: RPC transaction context
> + * @clp: nfs client
> + * @fhp: nfs file handle
> + * @inode: file to be checked for a conflict
> + * @modified: return true if file was modified
> + * @size: new size of file if modified is true
> + *
> + * This function is called when there is a conflict between a write
> + * delegation and a read that is using a special stateid where the
> + * we cannot derive the client stateid exsistence. The server
> + * must recall a conflicting delegation before allowing the read
> + * to continue.
> + *
> + * Returns 0 if there is no conflict; otherwise an nfs_stat
> + * code is returned.
> + */
> +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> +               struct nfs4_client *clp, struct svc_fh *fhp)
> +{
> +       struct nfs4_file *fp;
> +       __be32 status = 0;
> +
> +       fp = nfsd4_file_hash_lookup(fhp);
> +       if (!fp)
> +               return nfs_ok;
> +
> +       spin_lock(&state_lock);
> +       spin_lock(&fp->fi_lock);
> +       if (!list_empty(&fp->fi_delegations) &&
> +           !nfs4_delegation_exists(clp, fp)) {
> +               /* conflict, recall deleg */
> +               status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
> +                                       NFSD_MAY_READ));
> +               if (status)
> +                       goto out;
> +               if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
> +                       status = nfserr_jukebox;
> +       }
> +out:
> +       spin_unlock(&fp->fi_lock);
> +       spin_unlock(&state_lock);
> +       return status;
> +}
> +
> +
>  /**
>   * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
>   * @rqstp: RPC transaction context
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index c7bfd2180e3f..f0fe526fac3c 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>         unsigned long maxcount;
>         struct file *file;
>         __be32 *p;
> +       fmode_t o_fmode = 0;
>
>         if (nfserr)
>                 return nfserr;
> @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>         maxcount = min_t(unsigned long, read->rd_length,
>                          (xdr->buf->buflen - xdr->buf->len));
>
> +       if (read->rd_wd_stid) {
> +               /* allow READ using write delegation stateid */
> +               o_fmode = file->f_mode;
> +               file->f_mode |= FMODE_READ;
> +       }
>         if (file->f_op->splice_read && splice_ok)
>                 nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>         else
>                 nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
> +       if (o_fmode)
> +               file->f_mode = o_fmode;
> +
>         if (nfserr) {
>                 xdr_truncate_encode(xdr, starting_len);
>                 return nfserr;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index ffc217099d19..c1f13b5877c6 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -780,6 +780,8 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
>         return clp->cl_state == NFSD4_EXPIRABLE;
>  }
>
> +extern __be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> +               struct nfs4_client *clp, struct svc_fh *fhp);
>  extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
>                 struct inode *inode, bool *file_modified, u64 *size);
>  #endif   /* NFSD4_STATE_H */
> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> index fbdd42cde1fa..434973a6a8b1 100644
> --- a/fs/nfsd/xdr4.h
> +++ b/fs/nfsd/xdr4.h
> @@ -426,6 +426,8 @@ struct nfsd4_read {
>         struct svc_rqst         *rd_rqstp;          /* response */
>         struct svc_fh           *rd_fhp;            /* response */
>         u32                     rd_eof;             /* response */
> +
> +       struct nfs4_stid        *rd_wd_stid;            /* internal */
>  };
>
>  struct nfsd4_readdir {
> --
> 2.43.0
>
>
Sagi Grimberg July 24, 2024, 9:47 p.m. UTC | #2
On 24/07/2024 22:29, Olga Kornievskaia wrote:
> On Wed, Jul 24, 2024 at 1:01 PM Sagi Grimberg <sagi@grimberg.me> wrote:
>> Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
>> stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
>> Stateid and Locking.
>>
>> In addition, for anonymous stateids, check for pending delegations by
>> the filehandle and client_id, and if a conflict found, recall the delegation
>> before allowing the read to take place.
>>
>> Suggested-by: Dai Ngo <dai.ngo@oracle.com>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> ---
>>   fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
>>   fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
>>   fs/nfsd/nfs4xdr.c   |  9 +++++++++
>>   fs/nfsd/state.h     |  2 ++
>>   fs/nfsd/xdr4.h      |  2 ++
>>   5 files changed, 80 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>> index 7b70309ad8fb..324984ec70c6 100644
>> --- a/fs/nfsd/nfs4proc.c
>> +++ b/fs/nfsd/nfs4proc.c
>> @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>          /* check stateid */
>>          status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
>>                                          &read->rd_stateid, RD_STATE,
>> -                                       &read->rd_nf, NULL);
>> -
>> +                                       &read->rd_nf, &read->rd_wd_stid);
> Now I can see that this patch wants to leverage the "returned stateid
> of the nfs4_preprocess_stateid_op() but the logic in the previous
> patch was in the way because it distinguished the copy_notify by the
> non-null passed in stateid. So I would suggest that in order to not
> break the copy_notify and help with this functionality something else
> is sent into nfs4_proprocess_staetid_op() to allow for the stateid to
> be passed and then distinguish between copy_notify and read.

My thinking is that instead having the generic stateid pre-processing
be aware which proc may accept a special stateid and which may not,
we'd want to have the call-sites enforce that...

But I am not the expert here, and will be happy to change based on
your preference...
Jeff Layton July 26, 2024, 1:54 p.m. UTC | #3
On Wed, 2024-07-24 at 10:01 -0700, Sagi Grimberg wrote:
> Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
> stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
> Stateid and Locking.
> 
> In addition, for anonymous stateids, check for pending delegations by
> the filehandle and client_id, and if a conflict found, recall the delegation
> before allowing the read to take place.
> 
> Suggested-by: Dai Ngo <dai.ngo@oracle.com>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>  fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
>  fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs4xdr.c   |  9 +++++++++
>  fs/nfsd/state.h     |  2 ++
>  fs/nfsd/xdr4.h      |  2 ++
>  5 files changed, 80 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 7b70309ad8fb..324984ec70c6 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	/* check stateid */
>  	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
>  					&read->rd_stateid, RD_STATE,
> -					&read->rd_nf, NULL);
> -
> +					&read->rd_nf, &read->rd_wd_stid);

In the case where we have multiple stateids for this client, are we
guaranteed to get back the delegation stateid here? What if we get back
the open stateid for the O_WRONLY open instead?


> +	/*
> +	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
> +	 * delegation stateid used for read. Its refcount is decremented
> +	 * by nfsd4_read_release when read is done.
> +	 */
> +	if (!status) {
> +		if (!read->rd_wd_stid) {
> +			/* special stateid? */
> +			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
> +				&cstate->current_fh);
> +		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
> +			   delegstateid(read->rd_wd_stid)->dl_type !=
> +						NFS4_OPEN_DELEGATE_WRITE) {
> +			nfs4_put_stid(read->rd_wd_stid);
> +			read->rd_wd_stid = NULL;
> +		}
> +	}
>  	read->rd_rqstp = rqstp;
>  	read->rd_fhp = &cstate->current_fh;
>  	return status;
> @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  static void
>  nfsd4_read_release(union nfsd4_op_u *u)
>  {
> +	if (u->read.rd_wd_stid)
> +		nfs4_put_stid(u->read.rd_wd_stid);
>  	if (u->read.rd_nf)
>  		nfsd_file_put(u->read.rd_nf);
>  	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index dc61a8adfcd4..7e6b9fb31a4c 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>  	get_stateid(cstate, &u->write.wr_stateid);
>  }
>  
> +/**
> + * nfsd4_deleg_read_conflict - Recall if read causes conflict
> + * @rqstp: RPC transaction context
> + * @clp: nfs client
> + * @fhp: nfs file handle
> + * @inode: file to be checked for a conflict
> + * @modified: return true if file was modified
> + * @size: new size of file if modified is true
> + *
> + * This function is called when there is a conflict between a write
> + * delegation and a read that is using a special stateid where the
> + * we cannot derive the client stateid exsistence. The server
> + * must recall a conflicting delegation before allowing the read
> + * to continue.
> + *
> + * Returns 0 if there is no conflict; otherwise an nfs_stat
> + * code is returned.
> + */
> +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> +		struct nfs4_client *clp, struct svc_fh *fhp)
> +{
> +	struct nfs4_file *fp;
> +	__be32 status = 0;
> +
> +	fp = nfsd4_file_hash_lookup(fhp);
> +	if (!fp)
> +		return nfs_ok;
> +
> +	spin_lock(&state_lock);
> +	spin_lock(&fp->fi_lock);
> +	if (!list_empty(&fp->fi_delegations) &&
> +	    !nfs4_delegation_exists(clp, fp)) {
> +		/* conflict, recall deleg */
> +		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
> +					NFSD_MAY_READ));
> +		if (status)
> +			goto out;
> +		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
> +			status = nfserr_jukebox;
> +	}
> +out:
> +	spin_unlock(&fp->fi_lock);
> +	spin_unlock(&state_lock);
> +	return status;
> +}
> +
> +
>  /**
>   * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
>   * @rqstp: RPC transaction context
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index c7bfd2180e3f..f0fe526fac3c 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>  	unsigned long maxcount;
>  	struct file *file;
>  	__be32 *p;
> +	fmode_t o_fmode = 0;
>  
>  	if (nfserr)
>  		return nfserr;
> @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>  	maxcount = min_t(unsigned long, read->rd_length,
>  			 (xdr->buf->buflen - xdr->buf->len));
>  
> +	if (read->rd_wd_stid) {
> +		/* allow READ using write delegation stateid */
> +		o_fmode = file->f_mode;
> +		file->f_mode |= FMODE_READ;
> +	}
>  	if (file->f_op->splice_read && splice_ok)
>  		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>  	else
>  		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
> +	if (o_fmode)
> +		file->f_mode = o_fmode;
> +
>  	if (nfserr) {
>  		xdr_truncate_encode(xdr, starting_len);
>  		return nfserr;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index ffc217099d19..c1f13b5877c6 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -780,6 +780,8 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
>  	return clp->cl_state == NFSD4_EXPIRABLE;
>  }
>  
> +extern __be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> +		struct nfs4_client *clp, struct svc_fh *fhp);
>  extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
>  		struct inode *inode, bool *file_modified, u64 *size);
>  #endif   /* NFSD4_STATE_H */
> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> index fbdd42cde1fa..434973a6a8b1 100644
> --- a/fs/nfsd/xdr4.h
> +++ b/fs/nfsd/xdr4.h
> @@ -426,6 +426,8 @@ struct nfsd4_read {
>  	struct svc_rqst		*rd_rqstp;          /* response */
>  	struct svc_fh		*rd_fhp;            /* response */
>  	u32			rd_eof;             /* response */
> +
> +	struct nfs4_stid	*rd_wd_stid;		/* internal */
>  };
>  
>  struct nfsd4_readdir {
Jeff Layton July 26, 2024, 2 p.m. UTC | #4
On Wed, 2024-07-24 at 14:47 -0700, Sagi Grimberg wrote:
> 
> 
> 
> On 24/07/2024 22:29, Olga Kornievskaia wrote:
> > On Wed, Jul 24, 2024 at 1:01 PM Sagi Grimberg <sagi@grimberg.me>
> > wrote:
> > > Based on a patch fom Dai Ngo, allow NFSv4 client to use write
> > > delegation
> > > stateid for READ operation. Per RFC 8881 section 9.1.2. Use of
> > > the
> > > Stateid and Locking.
> > > 
> > > In addition, for anonymous stateids, check for pending
> > > delegations by
> > > the filehandle and client_id, and if a conflict found, recall the
> > > delegation
> > > before allowing the read to take place.
> > > 
> > > Suggested-by: Dai Ngo <dai.ngo@oracle.com>
> > > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > > ---
> > >   fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
> > >   fs/nfsd/nfs4state.c | 47
> > > +++++++++++++++++++++++++++++++++++++++++++++
> > >   fs/nfsd/nfs4xdr.c   |  9 +++++++++
> > >   fs/nfsd/state.h     |  2 ++
> > >   fs/nfsd/xdr4.h      |  2 ++
> > >   5 files changed, 80 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> > > index 7b70309ad8fb..324984ec70c6 100644
> > > --- a/fs/nfsd/nfs4proc.c
> > > +++ b/fs/nfsd/nfs4proc.c
> > > @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct
> > > nfsd4_compound_state *cstate,
> > >          /* check stateid */
> > >          status = nfs4_preprocess_stateid_op(rqstp, cstate,
> > > &cstate->current_fh,
> > >                                          &read->rd_stateid,
> > > RD_STATE,
> > > -                                       &read->rd_nf, NULL);
> > > -
> > > +                                       &read->rd_nf, &read-
> > > >rd_wd_stid);
> > Now I can see that this patch wants to leverage the "returned
> > stateid
> > of the nfs4_preprocess_stateid_op() but the logic in the previous
> > patch was in the way because it distinguished the copy_notify by
> > the
> > non-null passed in stateid. So I would suggest that in order to not
> > break the copy_notify and help with this functionality something
> > else
> > is sent into nfs4_proprocess_staetid_op() to allow for the stateid
> > to
> > be passed and then distinguish between copy_notify and read.
> 
> My thinking is that instead having the generic stateid pre-processing
> be aware which proc may accept a special stateid and which may not,
> we'd want to have the call-sites enforce that...
> 
> But I am not the expert here, and will be happy to change based on
> your preference...

I tend to agree with Sagi here. Keeping this sort of operation-specific
check in the operation-specific function makes more sense to me too.
Chuck Lever III July 26, 2024, 2:06 p.m. UTC | #5
On Wed, Jul 24, 2024 at 10:01:38AM -0700, Sagi Grimberg wrote:
> Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
> stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
> Stateid and Locking.
> 
> In addition, for anonymous stateids, check for pending delegations by
> the filehandle and client_id, and if a conflict found, recall the delegation
> before allowing the read to take place.
> 
> Suggested-by: Dai Ngo <dai.ngo@oracle.com>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>  fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
>  fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs4xdr.c   |  9 +++++++++
>  fs/nfsd/state.h     |  2 ++
>  fs/nfsd/xdr4.h      |  2 ++
>  5 files changed, 80 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 7b70309ad8fb..324984ec70c6 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	/* check stateid */
>  	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
>  					&read->rd_stateid, RD_STATE,
> -					&read->rd_nf, NULL);
> -
> +					&read->rd_nf, &read->rd_wd_stid);
> +	/*
> +	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
> +	 * delegation stateid used for read. Its refcount is decremented
> +	 * by nfsd4_read_release when read is done.
> +	 */
> +	if (!status) {
> +		if (!read->rd_wd_stid) {
> +			/* special stateid? */
> +			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
> +				&cstate->current_fh);
> +		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
> +			   delegstateid(read->rd_wd_stid)->dl_type !=
> +						NFS4_OPEN_DELEGATE_WRITE) {
> +			nfs4_put_stid(read->rd_wd_stid);
> +			read->rd_wd_stid = NULL;
> +		}
> +	}
>  	read->rd_rqstp = rqstp;
>  	read->rd_fhp = &cstate->current_fh;
>  	return status;
> @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  static void
>  nfsd4_read_release(union nfsd4_op_u *u)
>  {
> +	if (u->read.rd_wd_stid)
> +		nfs4_put_stid(u->read.rd_wd_stid);
>  	if (u->read.rd_nf)
>  		nfsd_file_put(u->read.rd_nf);
>  	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index dc61a8adfcd4..7e6b9fb31a4c 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>  	get_stateid(cstate, &u->write.wr_stateid);
>  }
>  
> +/**
> + * nfsd4_deleg_read_conflict - Recall if read causes conflict
> + * @rqstp: RPC transaction context
> + * @clp: nfs client
> + * @fhp: nfs file handle
> + * @inode: file to be checked for a conflict
> + * @modified: return true if file was modified
> + * @size: new size of file if modified is true
> + *
> + * This function is called when there is a conflict between a write
> + * delegation and a read that is using a special stateid where the
> + * we cannot derive the client stateid exsistence. The server
> + * must recall a conflicting delegation before allowing the read
> + * to continue.
> + *
> + * Returns 0 if there is no conflict; otherwise an nfs_stat
> + * code is returned.
> + */
> +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> +		struct nfs4_client *clp, struct svc_fh *fhp)
> +{
> +	struct nfs4_file *fp;
> +	__be32 status = 0;
> +
> +	fp = nfsd4_file_hash_lookup(fhp);
> +	if (!fp)
> +		return nfs_ok;
> +
> +	spin_lock(&state_lock);
> +	spin_lock(&fp->fi_lock);
> +	if (!list_empty(&fp->fi_delegations) &&
> +	    !nfs4_delegation_exists(clp, fp)) {
> +		/* conflict, recall deleg */
> +		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
> +					NFSD_MAY_READ));
> +		if (status)
> +			goto out;
> +		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
> +			status = nfserr_jukebox;
> +	}
> +out:
> +	spin_unlock(&fp->fi_lock);
> +	spin_unlock(&state_lock);
> +	return status;
> +}
> +
> +
>  /**
>   * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
>   * @rqstp: RPC transaction context
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index c7bfd2180e3f..f0fe526fac3c 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>  	unsigned long maxcount;
>  	struct file *file;
>  	__be32 *p;
> +	fmode_t o_fmode = 0;
>  
>  	if (nfserr)
>  		return nfserr;
> @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>  	maxcount = min_t(unsigned long, read->rd_length,
>  			 (xdr->buf->buflen - xdr->buf->len));
>  
> +	if (read->rd_wd_stid) {
> +		/* allow READ using write delegation stateid */
> +		o_fmode = file->f_mode;
> +		file->f_mode |= FMODE_READ;
> +	}

nfsd4_encode_read_plus() needs to handle write delegation stateids
as well.

I'm not too sure about modifying the f_mode on an nfsd_file you
just got from a cache of shared nfsd_files.

I think I'd prefer if preprocess_stateid returned an nfsd_file that
was already open for read. IIUC that would mean that no changes
would be needed here or in nfsd4_encode_read_plus().

Would that be difficult?


>  	if (file->f_op->splice_read && splice_ok)
>  		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>  	else
>  		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
> +	if (o_fmode)
> +		file->f_mode = o_fmode;
> +
>  	if (nfserr) {
>  		xdr_truncate_encode(xdr, starting_len);
>  		return nfserr;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index ffc217099d19..c1f13b5877c6 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -780,6 +780,8 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
>  	return clp->cl_state == NFSD4_EXPIRABLE;
>  }
>  
> +extern __be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> +		struct nfs4_client *clp, struct svc_fh *fhp);
>  extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
>  		struct inode *inode, bool *file_modified, u64 *size);
>  #endif   /* NFSD4_STATE_H */
> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> index fbdd42cde1fa..434973a6a8b1 100644
> --- a/fs/nfsd/xdr4.h
> +++ b/fs/nfsd/xdr4.h
> @@ -426,6 +426,8 @@ struct nfsd4_read {
>  	struct svc_rqst		*rd_rqstp;          /* response */
>  	struct svc_fh		*rd_fhp;            /* response */
>  	u32			rd_eof;             /* response */
> +
> +	struct nfs4_stid	*rd_wd_stid;		/* internal */
>  };
>  
>  struct nfsd4_readdir {
> -- 
> 2.43.0
>
Jeff Layton July 26, 2024, 2:49 p.m. UTC | #6
On Fri, 2024-07-26 at 09:54 -0400, Jeff Layton wrote:
> On Wed, 2024-07-24 at 10:01 -0700, Sagi Grimberg wrote:
> > Based on a patch fom Dai Ngo, allow NFSv4 client to use write
> > delegation
> > stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
> > Stateid and Locking.
> > 
> > In addition, for anonymous stateids, check for pending delegations
> > by
> > the filehandle and client_id, and if a conflict found, recall the
> > delegation
> > before allowing the read to take place.
> > 
> > Suggested-by: Dai Ngo <dai.ngo@oracle.com>
> > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > ---
> >  fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
> >  fs/nfsd/nfs4state.c | 47
> > +++++++++++++++++++++++++++++++++++++++++++++
> >  fs/nfsd/nfs4xdr.c   |  9 +++++++++
> >  fs/nfsd/state.h     |  2 ++
> >  fs/nfsd/xdr4.h      |  2 ++
> >  5 files changed, 80 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> > index 7b70309ad8fb..324984ec70c6 100644
> > --- a/fs/nfsd/nfs4proc.c
> > +++ b/fs/nfsd/nfs4proc.c
> > @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct
> > nfsd4_compound_state *cstate,
> >  	/* check stateid */
> >  	status = nfs4_preprocess_stateid_op(rqstp, cstate,
> > &cstate->current_fh,
> >  					&read->rd_stateid,
> > RD_STATE,
> > -					&read->rd_nf, NULL);
> > -
> > +					&read->rd_nf, &read-
> > >rd_wd_stid);
> 
> In the case where we have multiple stateids for this client, are we
> guaranteed to get back the delegation stateid here? What if we get
> back
> the open stateid for the O_WRONLY open instead?
> 

Oh, duh -- disregard this. We're looking for a specific stateid here,
so if the client sends a delegation stateid, that's what we'll find.
 
> 
> > +	/*
> > +	 * rd_wd_stid is needed for nfsd4_encode_read to allow
> > write
> > +	 * delegation stateid used for read. Its refcount is
> > decremented
> > +	 * by nfsd4_read_release when read is done.
> > +	 */
> > +	if (!status) {
> > +		if (!read->rd_wd_stid) {
> > +			/* special stateid? */
> > +			status = nfsd4_deleg_read_conflict(rqstp,
> > cstate->clp,
> > +				&cstate->current_fh);
> > +		} else if (read->rd_wd_stid->sc_type !=
> > SC_TYPE_DELEG ||
> > +			   delegstateid(read->rd_wd_stid)->dl_type
> > !=
> > +						NFS4_OPEN_DELEGATE
> > _WRITE) {
> > +			nfs4_put_stid(read->rd_wd_stid);
> > +			read->rd_wd_stid = NULL;
> > +		}
> > +	}
> >  	read->rd_rqstp = rqstp;
> >  	read->rd_fhp = &cstate->current_fh;
> >  	return status;
> > @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct
> > nfsd4_compound_state *cstate,
> >  static void
> >  nfsd4_read_release(union nfsd4_op_u *u)
> >  {
> > +	if (u->read.rd_wd_stid)
> > +		nfs4_put_stid(u->read.rd_wd_stid);
> >  	if (u->read.rd_nf)
> >  		nfsd_file_put(u->read.rd_nf);
> >  	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index dc61a8adfcd4..7e6b9fb31a4c 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct
> > nfsd4_compound_state *cstate,
> >  	get_stateid(cstate, &u->write.wr_stateid);
> >  }
> >  
> > +/**
> > + * nfsd4_deleg_read_conflict - Recall if read causes conflict
> > + * @rqstp: RPC transaction context
> > + * @clp: nfs client
> > + * @fhp: nfs file handle
> > + * @inode: file to be checked for a conflict
> > + * @modified: return true if file was modified
> > + * @size: new size of file if modified is true
> > + *
> > + * This function is called when there is a conflict between a
> > write
> > + * delegation and a read that is using a special stateid where the
> > + * we cannot derive the client stateid exsistence. The server
> > + * must recall a conflicting delegation before allowing the read
> > + * to continue.
> > + *
> > + * Returns 0 if there is no conflict; otherwise an nfs_stat
> > + * code is returned.
> > + */
> > +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> > +		struct nfs4_client *clp, struct svc_fh *fhp)
> > +{
> > +	struct nfs4_file *fp;
> > +	__be32 status = 0;
> > +
> > +	fp = nfsd4_file_hash_lookup(fhp);
> > +	if (!fp)
> > +		return nfs_ok;
> > +
> > +	spin_lock(&state_lock);
> > +	spin_lock(&fp->fi_lock);
> > +	if (!list_empty(&fp->fi_delegations) &&
> > +	    !nfs4_delegation_exists(clp, fp)) {
> > +		/* conflict, recall deleg */
> > +		status = nfserrno(nfsd_open_break_lease(fp-
> > >fi_inode,
> > +					NFSD_MAY_READ));
> > +		if (status)
> > +			goto out;
> > +		if (!nfsd_wait_for_delegreturn(rqstp, fp-
> > >fi_inode))
> > +			status = nfserr_jukebox;
> > +	}
> > +out:
> > +	spin_unlock(&fp->fi_lock);
> > +	spin_unlock(&state_lock);
> > +	return status;
> > +}
> > +
> > +
> >  /**
> >   * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes
> > conflict
> >   * @rqstp: RPC transaction context
> > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > index c7bfd2180e3f..f0fe526fac3c 100644
> > --- a/fs/nfsd/nfs4xdr.c
> > +++ b/fs/nfsd/nfs4xdr.c
> > @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres
> > *resp, __be32 nfserr,
> >  	unsigned long maxcount;
> >  	struct file *file;
> >  	__be32 *p;
> > +	fmode_t o_fmode = 0;
> >  
> >  	if (nfserr)
> >  		return nfserr;
> > @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres
> > *resp, __be32 nfserr,
> >  	maxcount = min_t(unsigned long, read->rd_length,
> >  			 (xdr->buf->buflen - xdr->buf->len));
> >  
> > +	if (read->rd_wd_stid) {
> > +		/* allow READ using write delegation stateid */
> > +		o_fmode = file->f_mode;
> > +		file->f_mode |= FMODE_READ;
> > +	}

^^^
I do agree with Chuck that this looks sketchy.

> >  	if (file->f_op->splice_read && splice_ok)
> >  		nfserr = nfsd4_encode_splice_read(resp, read,
> > file, maxcount);
> >  	else
> >  		nfserr = nfsd4_encode_readv(resp, read, file,
> > maxcount);
> > +	if (o_fmode)
> > +		file->f_mode = o_fmode;
> > +
> >  	if (nfserr) {
> >  		xdr_truncate_encode(xdr, starting_len);
> >  		return nfserr;
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index ffc217099d19..c1f13b5877c6 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -780,6 +780,8 @@ static inline bool try_to_expire_client(struct
> > nfs4_client *clp)
> >  	return clp->cl_state == NFSD4_EXPIRABLE;
> >  }
> >  
> > +extern __be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> > +		struct nfs4_client *clp, struct svc_fh *fhp);
> >  extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
> >  		struct inode *inode, bool *file_modified, u64
> > *size);
> >  #endif   /* NFSD4_STATE_H */
> > diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> > index fbdd42cde1fa..434973a6a8b1 100644
> > --- a/fs/nfsd/xdr4.h
> > +++ b/fs/nfsd/xdr4.h
> > @@ -426,6 +426,8 @@ struct nfsd4_read {
> >  	struct svc_rqst		*rd_rqstp;          /*
> > response */
> >  	struct svc_fh		*rd_fhp;            /* response */
> >  	u32			rd_eof;             /* response */
> > +
> > +	struct nfs4_stid	*rd_wd_stid;		/*
> > internal */
> >  };
> >  
> >  struct nfsd4_readdir {
>
Dai Ngo July 27, 2024, 4:46 p.m. UTC | #7
On 7/24/24 10:01 AM, Sagi Grimberg wrote:
> Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
> stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
> Stateid and Locking.
>
> In addition, for anonymous stateids, check for pending delegations by
> the filehandle and client_id, and if a conflict found, recall the delegation
> before allowing the read to take place.
>
> Suggested-by: Dai Ngo <dai.ngo@oracle.com>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>   fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
>   fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
>   fs/nfsd/nfs4xdr.c   |  9 +++++++++
>   fs/nfsd/state.h     |  2 ++
>   fs/nfsd/xdr4.h      |  2 ++
>   5 files changed, 80 insertions(+), 2 deletions(-)
>
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 7b70309ad8fb..324984ec70c6 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>   	/* check stateid */
>   	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
>   					&read->rd_stateid, RD_STATE,
> -					&read->rd_nf, NULL);
> -
> +					&read->rd_nf, &read->rd_wd_stid);
> +	/*
> +	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
> +	 * delegation stateid used for read. Its refcount is decremented
> +	 * by nfsd4_read_release when read is done.
> +	 */
> +	if (!status) {
> +		if (!read->rd_wd_stid) {
> +			/* special stateid? */
> +			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
> +				&cstate->current_fh);
> +		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
> +			   delegstateid(read->rd_wd_stid)->dl_type !=
> +						NFS4_OPEN_DELEGATE_WRITE) {
> +			nfs4_put_stid(read->rd_wd_stid);
> +			read->rd_wd_stid = NULL;
> +		}
> +	}
>   	read->rd_rqstp = rqstp;
>   	read->rd_fhp = &cstate->current_fh;
>   	return status;
> @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>   static void
>   nfsd4_read_release(union nfsd4_op_u *u)
>   {
> +	if (u->read.rd_wd_stid)
> +		nfs4_put_stid(u->read.rd_wd_stid);
>   	if (u->read.rd_nf)
>   		nfsd_file_put(u->read.rd_nf);
>   	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index dc61a8adfcd4..7e6b9fb31a4c 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>   	get_stateid(cstate, &u->write.wr_stateid);
>   }
>   
> +/**
> + * nfsd4_deleg_read_conflict - Recall if read causes conflict
> + * @rqstp: RPC transaction context
> + * @clp: nfs client
> + * @fhp: nfs file handle
> + * @inode: file to be checked for a conflict
> + * @modified: return true if file was modified
> + * @size: new size of file if modified is true
> + *
> + * This function is called when there is a conflict between a write
> + * delegation and a read that is using a special stateid where the
> + * we cannot derive the client stateid exsistence. The server
> + * must recall a conflicting delegation before allowing the read
> + * to continue.
> + *
> + * Returns 0 if there is no conflict; otherwise an nfs_stat
> + * code is returned.
> + */
> +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> +		struct nfs4_client *clp, struct svc_fh *fhp)
> +{
> +	struct nfs4_file *fp;
> +	__be32 status = 0;
> +
> +	fp = nfsd4_file_hash_lookup(fhp);
> +	if (!fp)
> +		return nfs_ok;
> +
> +	spin_lock(&state_lock);
> +	spin_lock(&fp->fi_lock);
> +	if (!list_empty(&fp->fi_delegations) &&
> +	    !nfs4_delegation_exists(clp, fp)) {
> +		/* conflict, recall deleg */

I don't see how we can have a delegation conflict here. If a client
has a write delegation then there should not be any delegations from
other clients.

-Dai

> +		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
> +					NFSD_MAY_READ));
> +		if (status)
> +			goto out;
> +		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
> +			status = nfserr_jukebox;
> +	}
> +out:
> +	spin_unlock(&fp->fi_lock);
> +	spin_unlock(&state_lock);
> +	return status;
> +}
> +
> +
>   /**
>    * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
>    * @rqstp: RPC transaction context
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index c7bfd2180e3f..f0fe526fac3c 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>   	unsigned long maxcount;
>   	struct file *file;
>   	__be32 *p;
> +	fmode_t o_fmode = 0;
>   
>   	if (nfserr)
>   		return nfserr;
> @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>   	maxcount = min_t(unsigned long, read->rd_length,
>   			 (xdr->buf->buflen - xdr->buf->len));
>   
> +	if (read->rd_wd_stid) {
> +		/* allow READ using write delegation stateid */
> +		o_fmode = file->f_mode;
> +		file->f_mode |= FMODE_READ;
> +	}
>   	if (file->f_op->splice_read && splice_ok)
>   		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>   	else
>   		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
> +	if (o_fmode)
> +		file->f_mode = o_fmode;
> +
>   	if (nfserr) {
>   		xdr_truncate_encode(xdr, starting_len);
>   		return nfserr;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index ffc217099d19..c1f13b5877c6 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -780,6 +780,8 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
>   	return clp->cl_state == NFSD4_EXPIRABLE;
>   }
>   
> +extern __be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> +		struct nfs4_client *clp, struct svc_fh *fhp);
>   extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
>   		struct inode *inode, bool *file_modified, u64 *size);
>   #endif   /* NFSD4_STATE_H */
> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> index fbdd42cde1fa..434973a6a8b1 100644
> --- a/fs/nfsd/xdr4.h
> +++ b/fs/nfsd/xdr4.h
> @@ -426,6 +426,8 @@ struct nfsd4_read {
>   	struct svc_rqst		*rd_rqstp;          /* response */
>   	struct svc_fh		*rd_fhp;            /* response */
>   	u32			rd_eof;             /* response */
> +
> +	struct nfs4_stid	*rd_wd_stid;		/* internal */
>   };
>   
>   struct nfsd4_readdir {
Chuck Lever III July 27, 2024, 4:52 p.m. UTC | #8
On Sat, Jul 27, 2024 at 09:46:34AM -0700, Dai Ngo wrote:
> 
> On 7/24/24 10:01 AM, Sagi Grimberg wrote:
> > Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
> > stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
> > Stateid and Locking.
> > 
> > In addition, for anonymous stateids, check for pending delegations by
> > the filehandle and client_id, and if a conflict found, recall the delegation
> > before allowing the read to take place.
> > 
> > Suggested-by: Dai Ngo <dai.ngo@oracle.com>
> > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > ---
> >   fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
> >   fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
> >   fs/nfsd/nfs4xdr.c   |  9 +++++++++
> >   fs/nfsd/state.h     |  2 ++
> >   fs/nfsd/xdr4.h      |  2 ++
> >   5 files changed, 80 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> > index 7b70309ad8fb..324984ec70c6 100644
> > --- a/fs/nfsd/nfs4proc.c
> > +++ b/fs/nfsd/nfs4proc.c
> > @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >   	/* check stateid */
> >   	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
> >   					&read->rd_stateid, RD_STATE,
> > -					&read->rd_nf, NULL);
> > -
> > +					&read->rd_nf, &read->rd_wd_stid);
> > +	/*
> > +	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
> > +	 * delegation stateid used for read. Its refcount is decremented
> > +	 * by nfsd4_read_release when read is done.
> > +	 */
> > +	if (!status) {
> > +		if (!read->rd_wd_stid) {
> > +			/* special stateid? */
> > +			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
> > +				&cstate->current_fh);
> > +		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
> > +			   delegstateid(read->rd_wd_stid)->dl_type !=
> > +						NFS4_OPEN_DELEGATE_WRITE) {
> > +			nfs4_put_stid(read->rd_wd_stid);
> > +			read->rd_wd_stid = NULL;
> > +		}
> > +	}
> >   	read->rd_rqstp = rqstp;
> >   	read->rd_fhp = &cstate->current_fh;
> >   	return status;
> > @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >   static void
> >   nfsd4_read_release(union nfsd4_op_u *u)
> >   {
> > +	if (u->read.rd_wd_stid)
> > +		nfs4_put_stid(u->read.rd_wd_stid);
> >   	if (u->read.rd_nf)
> >   		nfsd_file_put(u->read.rd_nf);
> >   	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index dc61a8adfcd4..7e6b9fb31a4c 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
> >   	get_stateid(cstate, &u->write.wr_stateid);
> >   }
> > +/**
> > + * nfsd4_deleg_read_conflict - Recall if read causes conflict
> > + * @rqstp: RPC transaction context
> > + * @clp: nfs client
> > + * @fhp: nfs file handle
> > + * @inode: file to be checked for a conflict
> > + * @modified: return true if file was modified
> > + * @size: new size of file if modified is true
> > + *
> > + * This function is called when there is a conflict between a write
> > + * delegation and a read that is using a special stateid where the
> > + * we cannot derive the client stateid exsistence. The server
> > + * must recall a conflicting delegation before allowing the read
> > + * to continue.
> > + *
> > + * Returns 0 if there is no conflict; otherwise an nfs_stat
> > + * code is returned.
> > + */
> > +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> > +		struct nfs4_client *clp, struct svc_fh *fhp)
> > +{
> > +	struct nfs4_file *fp;
> > +	__be32 status = 0;
> > +
> > +	fp = nfsd4_file_hash_lookup(fhp);
> > +	if (!fp)
> > +		return nfs_ok;
> > +
> > +	spin_lock(&state_lock);
> > +	spin_lock(&fp->fi_lock);
> > +	if (!list_empty(&fp->fi_delegations) &&
> > +	    !nfs4_delegation_exists(clp, fp)) {
> > +		/* conflict, recall deleg */
> 
> I don't see how we can have a delegation conflict here. If a client
> has a write delegation then there should not be any delegations from
> other clients.

A delegation conflict is possible if the client is using an
anonymous stateid to perform the READ.

One thing we could perhaps do is add support for the use of
anonymous stateids as a separate patch. Sagi, how necessary is
support for "READ with anonymous stateid" for supporting WR_ONLY
write delegation?


> > +		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
> > +					NFSD_MAY_READ));
> > +		if (status)
> > +			goto out;
> > +		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
> > +			status = nfserr_jukebox;
> > +	}
> > +out:
> > +	spin_unlock(&fp->fi_lock);
> > +	spin_unlock(&state_lock);
> > +	return status;
> > +}
> > +
> > +
> >   /**
> >    * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
> >    * @rqstp: RPC transaction context
> > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > index c7bfd2180e3f..f0fe526fac3c 100644
> > --- a/fs/nfsd/nfs4xdr.c
> > +++ b/fs/nfsd/nfs4xdr.c
> > @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
> >   	unsigned long maxcount;
> >   	struct file *file;
> >   	__be32 *p;
> > +	fmode_t o_fmode = 0;
> >   	if (nfserr)
> >   		return nfserr;
> > @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
> >   	maxcount = min_t(unsigned long, read->rd_length,
> >   			 (xdr->buf->buflen - xdr->buf->len));
> > +	if (read->rd_wd_stid) {
> > +		/* allow READ using write delegation stateid */
> > +		o_fmode = file->f_mode;
> > +		file->f_mode |= FMODE_READ;
> > +	}
> >   	if (file->f_op->splice_read && splice_ok)
> >   		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
> >   	else
> >   		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
> > +	if (o_fmode)
> > +		file->f_mode = o_fmode;
> > +
> >   	if (nfserr) {
> >   		xdr_truncate_encode(xdr, starting_len);
> >   		return nfserr;
> > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> > index ffc217099d19..c1f13b5877c6 100644
> > --- a/fs/nfsd/state.h
> > +++ b/fs/nfsd/state.h
> > @@ -780,6 +780,8 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
> >   	return clp->cl_state == NFSD4_EXPIRABLE;
> >   }
> > +extern __be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> > +		struct nfs4_client *clp, struct svc_fh *fhp);
> >   extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
> >   		struct inode *inode, bool *file_modified, u64 *size);
> >   #endif   /* NFSD4_STATE_H */
> > diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> > index fbdd42cde1fa..434973a6a8b1 100644
> > --- a/fs/nfsd/xdr4.h
> > +++ b/fs/nfsd/xdr4.h
> > @@ -426,6 +426,8 @@ struct nfsd4_read {
> >   	struct svc_rqst		*rd_rqstp;          /* response */
> >   	struct svc_fh		*rd_fhp;            /* response */
> >   	u32			rd_eof;             /* response */
> > +
> > +	struct nfs4_stid	*rd_wd_stid;		/* internal */
> >   };
> >   struct nfsd4_readdir {
Dai Ngo July 27, 2024, 5 p.m. UTC | #9
On 7/26/24 7:06 AM, Chuck Lever wrote:
> On Wed, Jul 24, 2024 at 10:01:38AM -0700, Sagi Grimberg wrote:
>> Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
>> stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
>> Stateid and Locking.
>>
>> In addition, for anonymous stateids, check for pending delegations by
>> the filehandle and client_id, and if a conflict found, recall the delegation
>> before allowing the read to take place.
>>
>> Suggested-by: Dai Ngo <dai.ngo@oracle.com>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> ---
>>   fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
>>   fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
>>   fs/nfsd/nfs4xdr.c   |  9 +++++++++
>>   fs/nfsd/state.h     |  2 ++
>>   fs/nfsd/xdr4.h      |  2 ++
>>   5 files changed, 80 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>> index 7b70309ad8fb..324984ec70c6 100644
>> --- a/fs/nfsd/nfs4proc.c
>> +++ b/fs/nfsd/nfs4proc.c
>> @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>   	/* check stateid */
>>   	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
>>   					&read->rd_stateid, RD_STATE,
>> -					&read->rd_nf, NULL);
>> -
>> +					&read->rd_nf, &read->rd_wd_stid);
>> +	/*
>> +	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
>> +	 * delegation stateid used for read. Its refcount is decremented
>> +	 * by nfsd4_read_release when read is done.
>> +	 */
>> +	if (!status) {
>> +		if (!read->rd_wd_stid) {
>> +			/* special stateid? */
>> +			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
>> +				&cstate->current_fh);
>> +		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
>> +			   delegstateid(read->rd_wd_stid)->dl_type !=
>> +						NFS4_OPEN_DELEGATE_WRITE) {
>> +			nfs4_put_stid(read->rd_wd_stid);
>> +			read->rd_wd_stid = NULL;
>> +		}
>> +	}
>>   	read->rd_rqstp = rqstp;
>>   	read->rd_fhp = &cstate->current_fh;
>>   	return status;
>> @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>   static void
>>   nfsd4_read_release(union nfsd4_op_u *u)
>>   {
>> +	if (u->read.rd_wd_stid)
>> +		nfs4_put_stid(u->read.rd_wd_stid);
>>   	if (u->read.rd_nf)
>>   		nfsd_file_put(u->read.rd_nf);
>>   	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index dc61a8adfcd4..7e6b9fb31a4c 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>>   	get_stateid(cstate, &u->write.wr_stateid);
>>   }
>>   
>> +/**
>> + * nfsd4_deleg_read_conflict - Recall if read causes conflict
>> + * @rqstp: RPC transaction context
>> + * @clp: nfs client
>> + * @fhp: nfs file handle
>> + * @inode: file to be checked for a conflict
>> + * @modified: return true if file was modified
>> + * @size: new size of file if modified is true
>> + *
>> + * This function is called when there is a conflict between a write
>> + * delegation and a read that is using a special stateid where the
>> + * we cannot derive the client stateid exsistence. The server
>> + * must recall a conflicting delegation before allowing the read
>> + * to continue.
>> + *
>> + * Returns 0 if there is no conflict; otherwise an nfs_stat
>> + * code is returned.
>> + */
>> +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
>> +		struct nfs4_client *clp, struct svc_fh *fhp)
>> +{
>> +	struct nfs4_file *fp;
>> +	__be32 status = 0;
>> +
>> +	fp = nfsd4_file_hash_lookup(fhp);
>> +	if (!fp)
>> +		return nfs_ok;
>> +
>> +	spin_lock(&state_lock);
>> +	spin_lock(&fp->fi_lock);
>> +	if (!list_empty(&fp->fi_delegations) &&
>> +	    !nfs4_delegation_exists(clp, fp)) {
>> +		/* conflict, recall deleg */
>> +		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
>> +					NFSD_MAY_READ));
>> +		if (status)
>> +			goto out;
>> +		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
>> +			status = nfserr_jukebox;
>> +	}
>> +out:
>> +	spin_unlock(&fp->fi_lock);
>> +	spin_unlock(&state_lock);
>> +	return status;
>> +}
>> +
>> +
>>   /**
>>    * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
>>    * @rqstp: RPC transaction context
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index c7bfd2180e3f..f0fe526fac3c 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>>   	unsigned long maxcount;
>>   	struct file *file;
>>   	__be32 *p;
>> +	fmode_t o_fmode = 0;
>>   
>>   	if (nfserr)
>>   		return nfserr;
>> @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>>   	maxcount = min_t(unsigned long, read->rd_length,
>>   			 (xdr->buf->buflen - xdr->buf->len));
>>   
>> +	if (read->rd_wd_stid) {
>> +		/* allow READ using write delegation stateid */
>> +		o_fmode = file->f_mode;
>> +		file->f_mode |= FMODE_READ;
>> +	}
> nfsd4_encode_read_plus() needs to handle write delegation stateids
> as well.
>
> I'm not too sure about modifying the f_mode on an nfsd_file you
> just got from a cache of shared nfsd_files.
>
> I think I'd prefer if preprocess_stateid returned an nfsd_file that
> was already open for read.

There would not be any nfsd_file that was already open for read since
the file still has a write delegation on it, unless the open for read
was from the same client.

I'm also not sure if a client would send an open for read to the server
when it already owned the write delegation of the file.

-Dai


>   IIUC that would mean that no changes
> would be needed here or in nfsd4_encode_read_plus().
>
> Would that be difficult?
>
>
>>   	if (file->f_op->splice_read && splice_ok)
>>   		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>>   	else
>>   		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
>> +	if (o_fmode)
>> +		file->f_mode = o_fmode;
>> +
>>   	if (nfserr) {
>>   		xdr_truncate_encode(xdr, starting_len);
>>   		return nfserr;
>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>> index ffc217099d19..c1f13b5877c6 100644
>> --- a/fs/nfsd/state.h
>> +++ b/fs/nfsd/state.h
>> @@ -780,6 +780,8 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
>>   	return clp->cl_state == NFSD4_EXPIRABLE;
>>   }
>>   
>> +extern __be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
>> +		struct nfs4_client *clp, struct svc_fh *fhp);
>>   extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
>>   		struct inode *inode, bool *file_modified, u64 *size);
>>   #endif   /* NFSD4_STATE_H */
>> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
>> index fbdd42cde1fa..434973a6a8b1 100644
>> --- a/fs/nfsd/xdr4.h
>> +++ b/fs/nfsd/xdr4.h
>> @@ -426,6 +426,8 @@ struct nfsd4_read {
>>   	struct svc_rqst		*rd_rqstp;          /* response */
>>   	struct svc_fh		*rd_fhp;            /* response */
>>   	u32			rd_eof;             /* response */
>> +
>> +	struct nfs4_stid	*rd_wd_stid;		/* internal */
>>   };
>>   
>>   struct nfsd4_readdir {
>> -- 
>> 2.43.0
>>
Dai Ngo July 27, 2024, 6:55 p.m. UTC | #10
On 7/27/24 9:52 AM, Chuck Lever wrote:
> On Sat, Jul 27, 2024 at 09:46:34AM -0700, Dai Ngo wrote:
>> On 7/24/24 10:01 AM, Sagi Grimberg wrote:
>>> Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
>>> stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
>>> Stateid and Locking.
>>>
>>> In addition, for anonymous stateids, check for pending delegations by
>>> the filehandle and client_id, and if a conflict found, recall the delegation
>>> before allowing the read to take place.
>>>
>>> Suggested-by: Dai Ngo <dai.ngo@oracle.com>
>>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>>> ---
>>>    fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
>>>    fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
>>>    fs/nfsd/nfs4xdr.c   |  9 +++++++++
>>>    fs/nfsd/state.h     |  2 ++
>>>    fs/nfsd/xdr4.h      |  2 ++
>>>    5 files changed, 80 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>>> index 7b70309ad8fb..324984ec70c6 100644
>>> --- a/fs/nfsd/nfs4proc.c
>>> +++ b/fs/nfsd/nfs4proc.c
>>> @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>>    	/* check stateid */
>>>    	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
>>>    					&read->rd_stateid, RD_STATE,
>>> -					&read->rd_nf, NULL);
>>> -
>>> +					&read->rd_nf, &read->rd_wd_stid);
>>> +	/*
>>> +	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
>>> +	 * delegation stateid used for read. Its refcount is decremented
>>> +	 * by nfsd4_read_release when read is done.
>>> +	 */
>>> +	if (!status) {
>>> +		if (!read->rd_wd_stid) {
>>> +			/* special stateid? */
>>> +			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
>>> +				&cstate->current_fh);
>>> +		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
>>> +			   delegstateid(read->rd_wd_stid)->dl_type !=
>>> +						NFS4_OPEN_DELEGATE_WRITE) {
>>> +			nfs4_put_stid(read->rd_wd_stid);
>>> +			read->rd_wd_stid = NULL;
>>> +		}
>>> +	}
>>>    	read->rd_rqstp = rqstp;
>>>    	read->rd_fhp = &cstate->current_fh;
>>>    	return status;
>>> @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>>    static void
>>>    nfsd4_read_release(union nfsd4_op_u *u)
>>>    {
>>> +	if (u->read.rd_wd_stid)
>>> +		nfs4_put_stid(u->read.rd_wd_stid);
>>>    	if (u->read.rd_nf)
>>>    		nfsd_file_put(u->read.rd_nf);
>>>    	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>> index dc61a8adfcd4..7e6b9fb31a4c 100644
>>> --- a/fs/nfsd/nfs4state.c
>>> +++ b/fs/nfsd/nfs4state.c
>>> @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>>>    	get_stateid(cstate, &u->write.wr_stateid);
>>>    }
>>> +/**
>>> + * nfsd4_deleg_read_conflict - Recall if read causes conflict
>>> + * @rqstp: RPC transaction context
>>> + * @clp: nfs client
>>> + * @fhp: nfs file handle
>>> + * @inode: file to be checked for a conflict
>>> + * @modified: return true if file was modified
>>> + * @size: new size of file if modified is true
>>> + *
>>> + * This function is called when there is a conflict between a write
>>> + * delegation and a read that is using a special stateid where the
>>> + * we cannot derive the client stateid exsistence. The server
>>> + * must recall a conflicting delegation before allowing the read
>>> + * to continue.
>>> + *
>>> + * Returns 0 if there is no conflict; otherwise an nfs_stat
>>> + * code is returned.
>>> + */
>>> +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
>>> +		struct nfs4_client *clp, struct svc_fh *fhp)
>>> +{
>>> +	struct nfs4_file *fp;
>>> +	__be32 status = 0;
>>> +
>>> +	fp = nfsd4_file_hash_lookup(fhp);
>>> +	if (!fp)
>>> +		return nfs_ok;
>>> +
>>> +	spin_lock(&state_lock);
>>> +	spin_lock(&fp->fi_lock);
>>> +	if (!list_empty(&fp->fi_delegations) &&
>>> +	    !nfs4_delegation_exists(clp, fp)) {
>>> +		/* conflict, recall deleg */
>> I don't see how we can have a delegation conflict here. If a client
>> has a write delegation then there should not be any delegations from
>> other clients.
> A delegation conflict is possible if the client is using an
> anonymous stateid to perform the READ.

Then we should not detect any delegation conflict from this
function.

>
> One thing we could perhaps do is add support for the use of
> anonymous stateids as a separate patch.

This would be a separate issue from allowing write delegation
stateid to read. This would be to detect the scenario where a
client using special stateid to read while another client has
a write delegation on the file.

-Dai

>   Sagi, how necessary is
> support for "READ with anonymous stateid" for supporting WR_ONLY
> write delegation?
>
>
>>> +		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
>>> +					NFSD_MAY_READ));
>>> +		if (status)
>>> +			goto out;
>>> +		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
>>> +			status = nfserr_jukebox;
>>> +	}
>>> +out:
>>> +	spin_unlock(&fp->fi_lock);
>>> +	spin_unlock(&state_lock);
>>> +	return status;
>>> +}
>>> +
>>> +
>>>    /**
>>>     * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
>>>     * @rqstp: RPC transaction context
>>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>>> index c7bfd2180e3f..f0fe526fac3c 100644
>>> --- a/fs/nfsd/nfs4xdr.c
>>> +++ b/fs/nfsd/nfs4xdr.c
>>> @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>>>    	unsigned long maxcount;
>>>    	struct file *file;
>>>    	__be32 *p;
>>> +	fmode_t o_fmode = 0;
>>>    	if (nfserr)
>>>    		return nfserr;
>>> @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>>>    	maxcount = min_t(unsigned long, read->rd_length,
>>>    			 (xdr->buf->buflen - xdr->buf->len));
>>> +	if (read->rd_wd_stid) {
>>> +		/* allow READ using write delegation stateid */
>>> +		o_fmode = file->f_mode;
>>> +		file->f_mode |= FMODE_READ;
>>> +	}
>>>    	if (file->f_op->splice_read && splice_ok)
>>>    		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
>>>    	else
>>>    		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
>>> +	if (o_fmode)
>>> +		file->f_mode = o_fmode;
>>> +
>>>    	if (nfserr) {
>>>    		xdr_truncate_encode(xdr, starting_len);
>>>    		return nfserr;
>>> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
>>> index ffc217099d19..c1f13b5877c6 100644
>>> --- a/fs/nfsd/state.h
>>> +++ b/fs/nfsd/state.h
>>> @@ -780,6 +780,8 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
>>>    	return clp->cl_state == NFSD4_EXPIRABLE;
>>>    }
>>> +extern __be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
>>> +		struct nfs4_client *clp, struct svc_fh *fhp);
>>>    extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
>>>    		struct inode *inode, bool *file_modified, u64 *size);
>>>    #endif   /* NFSD4_STATE_H */
>>> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
>>> index fbdd42cde1fa..434973a6a8b1 100644
>>> --- a/fs/nfsd/xdr4.h
>>> +++ b/fs/nfsd/xdr4.h
>>> @@ -426,6 +426,8 @@ struct nfsd4_read {
>>>    	struct svc_rqst		*rd_rqstp;          /* response */
>>>    	struct svc_fh		*rd_fhp;            /* response */
>>>    	u32			rd_eof;             /* response */
>>> +
>>> +	struct nfs4_stid	*rd_wd_stid;		/* internal */
>>>    };
>>>    struct nfsd4_readdir {
Sagi Grimberg July 27, 2024, 7:22 p.m. UTC | #11
>> nfsd4_encode_read_plus() needs to handle write delegation stateids
>> as well.
>>
>> I'm not too sure about modifying the f_mode on an nfsd_file you
>> just got from a cache of shared nfsd_files.
>>
>> I think I'd prefer if preprocess_stateid returned an nfsd_file that
>> was already open for read.
>
> There would not be any nfsd_file that was already open for read since
> the file still has a write delegation on it, unless the open for read
> was from the same client.
>
> I'm also not sure if a client would send an open for read to the server
> when it already owned the write delegation of the file.

It doesn't have to, IIUC. So this needs to be addressed.
One thing that we could do is open the file for RW to begin with if we hand
out a write delegation?

BTW, the patch is missing the part that actually offers the write 
delegation,
which was sent initially.
Sagi Grimberg July 27, 2024, 7:26 p.m. UTC | #12
On 26/07/2024 17:06, Chuck Lever wrote:
> On Wed, Jul 24, 2024 at 10:01:38AM -0700, Sagi Grimberg wrote:
>> Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
>> stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
>> Stateid and Locking.
>>
>> In addition, for anonymous stateids, check for pending delegations by
>> the filehandle and client_id, and if a conflict found, recall the delegation
>> before allowing the read to take place.
>>
>> Suggested-by: Dai Ngo <dai.ngo@oracle.com>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> ---
>>   fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
>>   fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
>>   fs/nfsd/nfs4xdr.c   |  9 +++++++++
>>   fs/nfsd/state.h     |  2 ++
>>   fs/nfsd/xdr4.h      |  2 ++
>>   5 files changed, 80 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
>> index 7b70309ad8fb..324984ec70c6 100644
>> --- a/fs/nfsd/nfs4proc.c
>> +++ b/fs/nfsd/nfs4proc.c
>> @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>   	/* check stateid */
>>   	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
>>   					&read->rd_stateid, RD_STATE,
>> -					&read->rd_nf, NULL);
>> -
>> +					&read->rd_nf, &read->rd_wd_stid);
>> +	/*
>> +	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
>> +	 * delegation stateid used for read. Its refcount is decremented
>> +	 * by nfsd4_read_release when read is done.
>> +	 */
>> +	if (!status) {
>> +		if (!read->rd_wd_stid) {
>> +			/* special stateid? */
>> +			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
>> +				&cstate->current_fh);
>> +		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
>> +			   delegstateid(read->rd_wd_stid)->dl_type !=
>> +						NFS4_OPEN_DELEGATE_WRITE) {
>> +			nfs4_put_stid(read->rd_wd_stid);
>> +			read->rd_wd_stid = NULL;
>> +		}
>> +	}
>>   	read->rd_rqstp = rqstp;
>>   	read->rd_fhp = &cstate->current_fh;
>>   	return status;
>> @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>   static void
>>   nfsd4_read_release(union nfsd4_op_u *u)
>>   {
>> +	if (u->read.rd_wd_stid)
>> +		nfs4_put_stid(u->read.rd_wd_stid);
>>   	if (u->read.rd_nf)
>>   		nfsd_file_put(u->read.rd_nf);
>>   	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index dc61a8adfcd4..7e6b9fb31a4c 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
>>   	get_stateid(cstate, &u->write.wr_stateid);
>>   }
>>   
>> +/**
>> + * nfsd4_deleg_read_conflict - Recall if read causes conflict
>> + * @rqstp: RPC transaction context
>> + * @clp: nfs client
>> + * @fhp: nfs file handle
>> + * @inode: file to be checked for a conflict
>> + * @modified: return true if file was modified
>> + * @size: new size of file if modified is true
>> + *
>> + * This function is called when there is a conflict between a write
>> + * delegation and a read that is using a special stateid where the
>> + * we cannot derive the client stateid exsistence. The server
>> + * must recall a conflicting delegation before allowing the read
>> + * to continue.
>> + *
>> + * Returns 0 if there is no conflict; otherwise an nfs_stat
>> + * code is returned.
>> + */
>> +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
>> +		struct nfs4_client *clp, struct svc_fh *fhp)
>> +{
>> +	struct nfs4_file *fp;
>> +	__be32 status = 0;
>> +
>> +	fp = nfsd4_file_hash_lookup(fhp);
>> +	if (!fp)
>> +		return nfs_ok;
>> +
>> +	spin_lock(&state_lock);
>> +	spin_lock(&fp->fi_lock);
>> +	if (!list_empty(&fp->fi_delegations) &&
>> +	    !nfs4_delegation_exists(clp, fp)) {
>> +		/* conflict, recall deleg */
>> +		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
>> +					NFSD_MAY_READ));
>> +		if (status)
>> +			goto out;
>> +		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
>> +			status = nfserr_jukebox;
>> +	}
>> +out:
>> +	spin_unlock(&fp->fi_lock);
>> +	spin_unlock(&state_lock);
>> +	return status;
>> +}
>> +
>> +
>>   /**
>>    * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
>>    * @rqstp: RPC transaction context
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index c7bfd2180e3f..f0fe526fac3c 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>>   	unsigned long maxcount;
>>   	struct file *file;
>>   	__be32 *p;
>> +	fmode_t o_fmode = 0;
>>   
>>   	if (nfserr)
>>   		return nfserr;
>> @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
>>   	maxcount = min_t(unsigned long, read->rd_length,
>>   			 (xdr->buf->buflen - xdr->buf->len));
>>   
>> +	if (read->rd_wd_stid) {
>> +		/* allow READ using write delegation stateid */
>> +		o_fmode = file->f_mode;
>> +		file->f_mode |= FMODE_READ;
>> +	}
> nfsd4_encode_read_plus() needs to handle write delegation stateids
> as well.

Yes, missed that one.

>
> I'm not too sure about modifying the f_mode on an nfsd_file you
> just got from a cache of shared nfsd_files.

If that is a problem, nfsd can open the file with rw access to begin with
if a write deleg was given?

>
> I think I'd prefer if preprocess_stateid returned an nfsd_file that
> was already open for read. IIUC that would mean that no changes
> would be needed here or in nfsd4_encode_read_plus().
>
> Would that be difficult?

preprocess_stateif to return a nfsd_file? not sure. But Nai correctly 
pointed out
that the file may have not been opened for read.
Sagi Grimberg July 27, 2024, 7:28 p.m. UTC | #13
>> I don't see how we can have a delegation conflict here. If a client
>> has a write delegation then there should not be any delegations from
>> other clients.
> A delegation conflict is possible if the client is using an
> anonymous stateid to perform the READ.
>
> One thing we could perhaps do is add support for the use of
> anonymous stateids as a separate patch. Sagi, how necessary is
> support for "READ with anonymous stateid" for supporting WR_ONLY
> write delegation?

Without this in the patch, there is a pynfs tests that breaks (read with 
anon stateid).
I can separate it if you want, but didn't want to introduce a patch that 
causes a regression
on its own.
Sagi Grimberg July 27, 2024, 7:33 p.m. UTC | #14
>>>> id(cstate, &u->write.wr_stateid);
>>>>    }
>>>> +/**
>>>> + * nfsd4_deleg_read_conflict - Recall if read causes conflict
>>>> + * @rqstp: RPC transaction context
>>>> + * @clp: nfs client
>>>> + * @fhp: nfs file handle
>>>> + * @inode: file to be checked for a conflict
>>>> + * @modified: return true if file was modified
>>>> + * @size: new size of file if modified is true
>>>> + *
>>>> + * This function is called when there is a conflict between a write
>>>> + * delegation and a read that is using a special stateid where the
>>>> + * we cannot derive the client stateid exsistence. The server
>>>> + * must recall a conflicting delegation before allowing the read
>>>> + * to continue.
>>>> + *
>>>> + * Returns 0 if there is no conflict; otherwise an nfs_stat
>>>> + * code is returned.
>>>> + */
>>>> +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
>>>> +        struct nfs4_client *clp, struct svc_fh *fhp)
>>>> +{
>>>> +    struct nfs4_file *fp;
>>>> +    __be32 status = 0;
>>>> +
>>>> +    fp = nfsd4_file_hash_lookup(fhp);
>>>> +    if (!fp)
>>>> +        return nfs_ok;
>>>> +
>>>> +    spin_lock(&state_lock);
>>>> +    spin_lock(&fp->fi_lock);
>>>> +    if (!list_empty(&fp->fi_delegations) &&
>>>> +        !nfs4_delegation_exists(clp, fp)) {
>>>> +        /* conflict, recall deleg */
>>> I don't see how we can have a delegation conflict here. If a client
>>> has a write delegation then there should not be any delegations from
>>> other clients.
>> A delegation conflict is possible if the client is using an
>> anonymous stateid to perform the READ.
>
> Then we should not detect any delegation conflict from this
> function.

Can you explain why?

If the client sent a raed with anon stateid, this function checks the 
pending
delegations (fi_delegations), and not from the same client 
(!nfs4_delegation_exists),
then it should detect a conflict.

>
>>
>> One thing we could perhaps do is add support for the use of
>> anonymous stateids as a separate patch.
>
> This would be a separate issue from allowing write delegation
> stateid to read. This would be to detect the scenario where a
> client using special stateid to read while another client has
> a write delegation on the file.

As mentioned, I can separate it, but this would make this patch
break a pynfs test.
Chuck Lever III July 27, 2024, 7:53 p.m. UTC | #15
On Sat, Jul 27, 2024 at 10:26:24PM +0300, Sagi Grimberg wrote:
> 
> 
> 
> On 26/07/2024 17:06, Chuck Lever wrote:
> > On Wed, Jul 24, 2024 at 10:01:38AM -0700, Sagi Grimberg wrote:
> > > Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
> > > stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
> > > Stateid and Locking.
> > > 
> > > In addition, for anonymous stateids, check for pending delegations by
> > > the filehandle and client_id, and if a conflict found, recall the delegation
> > > before allowing the read to take place.
> > > 
> > > Suggested-by: Dai Ngo <dai.ngo@oracle.com>
> > > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > > ---
> > >   fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
> > >   fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
> > >   fs/nfsd/nfs4xdr.c   |  9 +++++++++
> > >   fs/nfsd/state.h     |  2 ++
> > >   fs/nfsd/xdr4.h      |  2 ++
> > >   5 files changed, 80 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> > > index 7b70309ad8fb..324984ec70c6 100644
> > > --- a/fs/nfsd/nfs4proc.c
> > > +++ b/fs/nfsd/nfs4proc.c
> > > @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> > >   	/* check stateid */
> > >   	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
> > >   					&read->rd_stateid, RD_STATE,
> > > -					&read->rd_nf, NULL);
> > > -
> > > +					&read->rd_nf, &read->rd_wd_stid);
> > > +	/*
> > > +	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
> > > +	 * delegation stateid used for read. Its refcount is decremented
> > > +	 * by nfsd4_read_release when read is done.
> > > +	 */
> > > +	if (!status) {
> > > +		if (!read->rd_wd_stid) {
> > > +			/* special stateid? */
> > > +			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
> > > +				&cstate->current_fh);
> > > +		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
> > > +			   delegstateid(read->rd_wd_stid)->dl_type !=
> > > +						NFS4_OPEN_DELEGATE_WRITE) {
> > > +			nfs4_put_stid(read->rd_wd_stid);
> > > +			read->rd_wd_stid = NULL;
> > > +		}
> > > +	}
> > >   	read->rd_rqstp = rqstp;
> > >   	read->rd_fhp = &cstate->current_fh;
> > >   	return status;
> > > @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> > >   static void
> > >   nfsd4_read_release(union nfsd4_op_u *u)
> > >   {
> > > +	if (u->read.rd_wd_stid)
> > > +		nfs4_put_stid(u->read.rd_wd_stid);
> > >   	if (u->read.rd_nf)
> > >   		nfsd_file_put(u->read.rd_nf);
> > >   	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
> > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > index dc61a8adfcd4..7e6b9fb31a4c 100644
> > > --- a/fs/nfsd/nfs4state.c
> > > +++ b/fs/nfsd/nfs4state.c
> > > @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
> > >   	get_stateid(cstate, &u->write.wr_stateid);
> > >   }
> > > +/**
> > > + * nfsd4_deleg_read_conflict - Recall if read causes conflict
> > > + * @rqstp: RPC transaction context
> > > + * @clp: nfs client
> > > + * @fhp: nfs file handle
> > > + * @inode: file to be checked for a conflict
> > > + * @modified: return true if file was modified
> > > + * @size: new size of file if modified is true
> > > + *
> > > + * This function is called when there is a conflict between a write
> > > + * delegation and a read that is using a special stateid where the
> > > + * we cannot derive the client stateid exsistence. The server
> > > + * must recall a conflicting delegation before allowing the read
> > > + * to continue.
> > > + *
> > > + * Returns 0 if there is no conflict; otherwise an nfs_stat
> > > + * code is returned.
> > > + */
> > > +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> > > +		struct nfs4_client *clp, struct svc_fh *fhp)
> > > +{
> > > +	struct nfs4_file *fp;
> > > +	__be32 status = 0;
> > > +
> > > +	fp = nfsd4_file_hash_lookup(fhp);
> > > +	if (!fp)
> > > +		return nfs_ok;
> > > +
> > > +	spin_lock(&state_lock);
> > > +	spin_lock(&fp->fi_lock);
> > > +	if (!list_empty(&fp->fi_delegations) &&
> > > +	    !nfs4_delegation_exists(clp, fp)) {
> > > +		/* conflict, recall deleg */
> > > +		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
> > > +					NFSD_MAY_READ));
> > > +		if (status)
> > > +			goto out;
> > > +		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
> > > +			status = nfserr_jukebox;
> > > +	}
> > > +out:
> > > +	spin_unlock(&fp->fi_lock);
> > > +	spin_unlock(&state_lock);
> > > +	return status;
> > > +}
> > > +
> > > +
> > >   /**
> > >    * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
> > >    * @rqstp: RPC transaction context
> > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > index c7bfd2180e3f..f0fe526fac3c 100644
> > > --- a/fs/nfsd/nfs4xdr.c
> > > +++ b/fs/nfsd/nfs4xdr.c
> > > @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
> > >   	unsigned long maxcount;
> > >   	struct file *file;
> > >   	__be32 *p;
> > > +	fmode_t o_fmode = 0;
> > >   	if (nfserr)
> > >   		return nfserr;
> > > @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
> > >   	maxcount = min_t(unsigned long, read->rd_length,
> > >   			 (xdr->buf->buflen - xdr->buf->len));
> > > +	if (read->rd_wd_stid) {
> > > +		/* allow READ using write delegation stateid */
> > > +		o_fmode = file->f_mode;
> > > +		file->f_mode |= FMODE_READ;
> > > +	}
> > nfsd4_encode_read_plus() needs to handle write delegation stateids
> > as well.
> 
> Yes, missed that one.
> 
> > 
> > I'm not too sure about modifying the f_mode on an nfsd_file you
> > just got from a cache of shared nfsd_files.
> 
> If that is a problem, nfsd can open the file with rw access to begin with
> if a write deleg was given?

IMO, it's not a question of whether there is a write delegation, but
rather what intent the client told the server during the OPEN. The
server would need to open the local file for R/W when it gets a
O_WRONLY open. Not saying this is the right thing to do, just
thinking out loud...


> > I think I'd prefer if preprocess_stateid returned an nfsd_file that
> > was already open for read. IIUC that would mean that no changes
> > would be needed here or in nfsd4_encode_read_plus().
> > 
> > Would that be difficult?
> 
> preprocess_stateif to return a nfsd_file? not sure. But Nai correctly
> pointed out
> that the file may have not been opened for read.

My impression is that nfsd_file_acquire(), which underpins
preprocess_stateid, can open a file itself. I believe that is what
happens in the NFSv3 case, for instance.

So if the caller requests an nfsd_file that is open for read and
there isn't already one on hand, wouldn't preprocess_stateid (and
nfsd_file_acquire) just open the file on demand using the given
mode and credentials, and return the new nfsd_file?

For NFSv4, that would mean the subsequent nfsd_file_put() would
close that file immediately rather than cache it. So maybe not
as efficient as we could be.
Chuck Lever III July 27, 2024, 7:58 p.m. UTC | #16
On Sat, Jul 27, 2024 at 03:53:41PM -0400, Chuck Lever wrote:
> On Sat, Jul 27, 2024 at 10:26:24PM +0300, Sagi Grimberg wrote:
> > 
> > 
> > 
> > On 26/07/2024 17:06, Chuck Lever wrote:
> > > On Wed, Jul 24, 2024 at 10:01:38AM -0700, Sagi Grimberg wrote:
> > > > Based on a patch fom Dai Ngo, allow NFSv4 client to use write delegation
> > > > stateid for READ operation. Per RFC 8881 section 9.1.2. Use of the
> > > > Stateid and Locking.
> > > > 
> > > > In addition, for anonymous stateids, check for pending delegations by
> > > > the filehandle and client_id, and if a conflict found, recall the delegation
> > > > before allowing the read to take place.
> > > > 
> > > > Suggested-by: Dai Ngo <dai.ngo@oracle.com>
> > > > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> > > > ---
> > > >   fs/nfsd/nfs4proc.c  | 22 +++++++++++++++++++--
> > > >   fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++++++++++++++++++++++++
> > > >   fs/nfsd/nfs4xdr.c   |  9 +++++++++
> > > >   fs/nfsd/state.h     |  2 ++
> > > >   fs/nfsd/xdr4.h      |  2 ++
> > > >   5 files changed, 80 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> > > > index 7b70309ad8fb..324984ec70c6 100644
> > > > --- a/fs/nfsd/nfs4proc.c
> > > > +++ b/fs/nfsd/nfs4proc.c
> > > > @@ -979,8 +979,24 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> > > >   	/* check stateid */
> > > >   	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
> > > >   					&read->rd_stateid, RD_STATE,
> > > > -					&read->rd_nf, NULL);
> > > > -
> > > > +					&read->rd_nf, &read->rd_wd_stid);
> > > > +	/*
> > > > +	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
> > > > +	 * delegation stateid used for read. Its refcount is decremented
> > > > +	 * by nfsd4_read_release when read is done.
> > > > +	 */
> > > > +	if (!status) {
> > > > +		if (!read->rd_wd_stid) {
> > > > +			/* special stateid? */
> > > > +			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
> > > > +				&cstate->current_fh);
> > > > +		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
> > > > +			   delegstateid(read->rd_wd_stid)->dl_type !=
> > > > +						NFS4_OPEN_DELEGATE_WRITE) {
> > > > +			nfs4_put_stid(read->rd_wd_stid);
> > > > +			read->rd_wd_stid = NULL;
> > > > +		}
> > > > +	}
> > > >   	read->rd_rqstp = rqstp;
> > > >   	read->rd_fhp = &cstate->current_fh;
> > > >   	return status;
> > > > @@ -990,6 +1006,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> > > >   static void
> > > >   nfsd4_read_release(union nfsd4_op_u *u)
> > > >   {
> > > > +	if (u->read.rd_wd_stid)
> > > > +		nfs4_put_stid(u->read.rd_wd_stid);
> > > >   	if (u->read.rd_nf)
> > > >   		nfsd_file_put(u->read.rd_nf);
> > > >   	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
> > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > > > index dc61a8adfcd4..7e6b9fb31a4c 100644
> > > > --- a/fs/nfsd/nfs4state.c
> > > > +++ b/fs/nfsd/nfs4state.c
> > > > @@ -8805,6 +8805,53 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
> > > >   	get_stateid(cstate, &u->write.wr_stateid);
> > > >   }
> > > > +/**
> > > > + * nfsd4_deleg_read_conflict - Recall if read causes conflict
> > > > + * @rqstp: RPC transaction context
> > > > + * @clp: nfs client
> > > > + * @fhp: nfs file handle
> > > > + * @inode: file to be checked for a conflict
> > > > + * @modified: return true if file was modified
> > > > + * @size: new size of file if modified is true
> > > > + *
> > > > + * This function is called when there is a conflict between a write
> > > > + * delegation and a read that is using a special stateid where the
> > > > + * we cannot derive the client stateid exsistence. The server
> > > > + * must recall a conflicting delegation before allowing the read
> > > > + * to continue.
> > > > + *
> > > > + * Returns 0 if there is no conflict; otherwise an nfs_stat
> > > > + * code is returned.
> > > > + */
> > > > +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
> > > > +		struct nfs4_client *clp, struct svc_fh *fhp)
> > > > +{
> > > > +	struct nfs4_file *fp;
> > > > +	__be32 status = 0;
> > > > +
> > > > +	fp = nfsd4_file_hash_lookup(fhp);
> > > > +	if (!fp)
> > > > +		return nfs_ok;
> > > > +
> > > > +	spin_lock(&state_lock);
> > > > +	spin_lock(&fp->fi_lock);
> > > > +	if (!list_empty(&fp->fi_delegations) &&
> > > > +	    !nfs4_delegation_exists(clp, fp)) {
> > > > +		/* conflict, recall deleg */
> > > > +		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
> > > > +					NFSD_MAY_READ));
> > > > +		if (status)
> > > > +			goto out;
> > > > +		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
> > > > +			status = nfserr_jukebox;
> > > > +	}
> > > > +out:
> > > > +	spin_unlock(&fp->fi_lock);
> > > > +	spin_unlock(&state_lock);
> > > > +	return status;
> > > > +}
> > > > +
> > > > +
> > > >   /**
> > > >    * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
> > > >    * @rqstp: RPC transaction context
> > > > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> > > > index c7bfd2180e3f..f0fe526fac3c 100644
> > > > --- a/fs/nfsd/nfs4xdr.c
> > > > +++ b/fs/nfsd/nfs4xdr.c
> > > > @@ -4418,6 +4418,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
> > > >   	unsigned long maxcount;
> > > >   	struct file *file;
> > > >   	__be32 *p;
> > > > +	fmode_t o_fmode = 0;
> > > >   	if (nfserr)
> > > >   		return nfserr;
> > > > @@ -4437,10 +4438,18 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
> > > >   	maxcount = min_t(unsigned long, read->rd_length,
> > > >   			 (xdr->buf->buflen - xdr->buf->len));
> > > > +	if (read->rd_wd_stid) {
> > > > +		/* allow READ using write delegation stateid */
> > > > +		o_fmode = file->f_mode;
> > > > +		file->f_mode |= FMODE_READ;
> > > > +	}
> > > nfsd4_encode_read_plus() needs to handle write delegation stateids
> > > as well.
> > 
> > Yes, missed that one.
> > 
> > > 
> > > I'm not too sure about modifying the f_mode on an nfsd_file you
> > > just got from a cache of shared nfsd_files.
> > 
> > If that is a problem, nfsd can open the file with rw access to begin with
> > if a write deleg was given?
> 
> IMO, it's not a question of whether there is a write delegation, but
> rather what intent the client told the server during the OPEN. The
> server would need to open the local file for R/W when it gets a
> O_WRONLY open. Not saying this is the right thing to do, just
> thinking out loud...

OK, thinking about this again... The nfsd_file that is associated
with a write delegation is probably going to need to be a local
O_RDWR open, if a write delegation state ID can be used for READ or
WRITE.
Dai Ngo July 27, 2024, 10:13 p.m. UTC | #17
On 7/27/24 12:33 PM, Sagi Grimberg wrote:
>>>>> id(cstate, &u->write.wr_stateid);
>>>>>    }
>>>>> +/**
>>>>> + * nfsd4_deleg_read_conflict - Recall if read causes conflict
>>>>> + * @rqstp: RPC transaction context
>>>>> + * @clp: nfs client
>>>>> + * @fhp: nfs file handle
>>>>> + * @inode: file to be checked for a conflict
>>>>> + * @modified: return true if file was modified
>>>>> + * @size: new size of file if modified is true
>>>>> + *
>>>>> + * This function is called when there is a conflict between a write
>>>>> + * delegation and a read that is using a special stateid where the
>>>>> + * we cannot derive the client stateid exsistence. The server
>>>>> + * must recall a conflicting delegation before allowing the read
>>>>> + * to continue.
>>>>> + *
>>>>> + * Returns 0 if there is no conflict; otherwise an nfs_stat
>>>>> + * code is returned.
>>>>> + */
>>>>> +__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
>>>>> +        struct nfs4_client *clp, struct svc_fh *fhp)
>>>>> +{
>>>>> +    struct nfs4_file *fp;
>>>>> +    __be32 status = 0;
>>>>> +
>>>>> +    fp = nfsd4_file_hash_lookup(fhp);
>>>>> +    if (!fp)
>>>>> +        return nfs_ok;
>>>>> +
>>>>> +    spin_lock(&state_lock);
>>>>> +    spin_lock(&fp->fi_lock);
>>>>> +    if (!list_empty(&fp->fi_delegations) &&
>>>>> +        !nfs4_delegation_exists(clp, fp)) {
>>>>> +        /* conflict, recall deleg */
>>>> I don't see how we can have a delegation conflict here. If a client
>>>> has a write delegation then there should not be any delegations from
>>>> other clients.
>>> A delegation conflict is possible if the client is using an
>>> anonymous stateid to perform the READ.
>>
>> Then we should not detect any delegation conflict from this
>> function.
>
> Can you explain why?

It was my mistake. I missed the part that the nfsd4_deleg_read_conflict
was called only from the context of the client sending the read with the
special stateid.

It's probably clearer to separate this patch into 2 patches: one to allow
a write delegation (opened with wr-only) to read the file and the other
is to detect write delegation conflict, regardless of whether the delegation
was from an open with rw or wr-only, when a special stateid is used to
do the read.

-Dai

>
> If the client sent a raed with anon stateid, this function checks the 
> pending
> delegations (fi_delegations), and not from the same client 
> (!nfs4_delegation_exists),
> then it should detect a conflict.
>
>>
>>>
>>> One thing we could perhaps do is add support for the use of
>>> anonymous stateids as a separate patch.
>>
>> This would be a separate issue from allowing write delegation
>> stateid to read. This would be to detect the scenario where a
>> client using special stateid to read while another client has
>> a write delegation on the file.
>
> As mentioned, I can separate it, but this would make this patch
> break a pynfs test.
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7b70309ad8fb..324984ec70c6 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -979,8 +979,24 @@  nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	/* check stateid */
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 					&read->rd_stateid, RD_STATE,
-					&read->rd_nf, NULL);
-
+					&read->rd_nf, &read->rd_wd_stid);
+	/*
+	 * rd_wd_stid is needed for nfsd4_encode_read to allow write
+	 * delegation stateid used for read. Its refcount is decremented
+	 * by nfsd4_read_release when read is done.
+	 */
+	if (!status) {
+		if (!read->rd_wd_stid) {
+			/* special stateid? */
+			status = nfsd4_deleg_read_conflict(rqstp, cstate->clp,
+				&cstate->current_fh);
+		} else if (read->rd_wd_stid->sc_type != SC_TYPE_DELEG ||
+			   delegstateid(read->rd_wd_stid)->dl_type !=
+						NFS4_OPEN_DELEGATE_WRITE) {
+			nfs4_put_stid(read->rd_wd_stid);
+			read->rd_wd_stid = NULL;
+		}
+	}
 	read->rd_rqstp = rqstp;
 	read->rd_fhp = &cstate->current_fh;
 	return status;
@@ -990,6 +1006,8 @@  nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 static void
 nfsd4_read_release(union nfsd4_op_u *u)
 {
+	if (u->read.rd_wd_stid)
+		nfs4_put_stid(u->read.rd_wd_stid);
 	if (u->read.rd_nf)
 		nfsd_file_put(u->read.rd_nf);
 	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index dc61a8adfcd4..7e6b9fb31a4c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -8805,6 +8805,53 @@  nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
 	get_stateid(cstate, &u->write.wr_stateid);
 }
 
+/**
+ * nfsd4_deleg_read_conflict - Recall if read causes conflict
+ * @rqstp: RPC transaction context
+ * @clp: nfs client
+ * @fhp: nfs file handle
+ * @inode: file to be checked for a conflict
+ * @modified: return true if file was modified
+ * @size: new size of file if modified is true
+ *
+ * This function is called when there is a conflict between a write
+ * delegation and a read that is using a special stateid where the
+ * we cannot derive the client stateid exsistence. The server
+ * must recall a conflicting delegation before allowing the read
+ * to continue.
+ *
+ * Returns 0 if there is no conflict; otherwise an nfs_stat
+ * code is returned.
+ */
+__be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
+		struct nfs4_client *clp, struct svc_fh *fhp)
+{
+	struct nfs4_file *fp;
+	__be32 status = 0;
+
+	fp = nfsd4_file_hash_lookup(fhp);
+	if (!fp)
+		return nfs_ok;
+
+	spin_lock(&state_lock);
+	spin_lock(&fp->fi_lock);
+	if (!list_empty(&fp->fi_delegations) &&
+	    !nfs4_delegation_exists(clp, fp)) {
+		/* conflict, recall deleg */
+		status = nfserrno(nfsd_open_break_lease(fp->fi_inode,
+					NFSD_MAY_READ));
+		if (status)
+			goto out;
+		if (!nfsd_wait_for_delegreturn(rqstp, fp->fi_inode))
+			status = nfserr_jukebox;
+	}
+out:
+	spin_unlock(&fp->fi_lock);
+	spin_unlock(&state_lock);
+	return status;
+}
+
+
 /**
  * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
  * @rqstp: RPC transaction context
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c7bfd2180e3f..f0fe526fac3c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4418,6 +4418,7 @@  nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 	unsigned long maxcount;
 	struct file *file;
 	__be32 *p;
+	fmode_t o_fmode = 0;
 
 	if (nfserr)
 		return nfserr;
@@ -4437,10 +4438,18 @@  nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 	maxcount = min_t(unsigned long, read->rd_length,
 			 (xdr->buf->buflen - xdr->buf->len));
 
+	if (read->rd_wd_stid) {
+		/* allow READ using write delegation stateid */
+		o_fmode = file->f_mode;
+		file->f_mode |= FMODE_READ;
+	}
 	if (file->f_op->splice_read && splice_ok)
 		nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
 	else
 		nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
+	if (o_fmode)
+		file->f_mode = o_fmode;
+
 	if (nfserr) {
 		xdr_truncate_encode(xdr, starting_len);
 		return nfserr;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index ffc217099d19..c1f13b5877c6 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -780,6 +780,8 @@  static inline bool try_to_expire_client(struct nfs4_client *clp)
 	return clp->cl_state == NFSD4_EXPIRABLE;
 }
 
+extern __be32 nfsd4_deleg_read_conflict(struct svc_rqst *rqstp,
+		struct nfs4_client *clp, struct svc_fh *fhp);
 extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
 		struct inode *inode, bool *file_modified, u64 *size);
 #endif   /* NFSD4_STATE_H */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index fbdd42cde1fa..434973a6a8b1 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -426,6 +426,8 @@  struct nfsd4_read {
 	struct svc_rqst		*rd_rqstp;          /* response */
 	struct svc_fh		*rd_fhp;            /* response */
 	u32			rd_eof;             /* response */
+
+	struct nfs4_stid	*rd_wd_stid;		/* internal */
 };
 
 struct nfsd4_readdir {