diff mbox series

[RFC,5/6] NFS: Use NFSv4.2's OFFLOAD_STATUS operation

Message ID 20240702151947.549814-13-cel@kernel.org (mailing list archive)
State New
Headers show
Series Add a client-side OFFLOAD_STATUS implementation | expand

Commit Message

Chuck Lever July 2, 2024, 3:19 p.m. UTC
From: Chuck Lever <chuck.lever@oracle.com>

We've found that there are cases where a transport disconnection
results in the loss of callback RPCs. NFS servers typically do not
retransmit callback operations after a disconnect.

This can be a problem for the Linux NFS client's current
implementation of asynchronous COPY, which waits indefinitely for a
CB_OFFLOAD callback. If a transport disconnect occurs while an async
COPY is running, there's a good chance the client will never get the
completing CB_OFFLOAD.

Fix this by implementing the OFFLOAD_STATUS operation so that the
Linux NFS client can probe the NFS server if it doesn't see a
CB_OFFLOAD in a reasonable amount of time.

This patch implements a simplistic check. As future work, the client
might also be able to detect whether there is no forward progress on
the request asynchronous COPY operation, and CANCEL it.

Suggested-by: Olga Kornievskaia <kolga@netapp.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218735
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/nfs42proc.c | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

Comments

Olga Kornievskaia July 2, 2024, 6:30 p.m. UTC | #1
On Tue, Jul 2, 2024 at 11:21 AM <cel@kernel.org> wrote:
>
> From: Chuck Lever <chuck.lever@oracle.com>
>
> We've found that there are cases where a transport disconnection
> results in the loss of callback RPCs. NFS servers typically do not
> retransmit callback operations after a disconnect.
>
> This can be a problem for the Linux NFS client's current
> implementation of asynchronous COPY, which waits indefinitely for a
> CB_OFFLOAD callback. If a transport disconnect occurs while an async
> COPY is running, there's a good chance the client will never get the
> completing CB_OFFLOAD.
>
> Fix this by implementing the OFFLOAD_STATUS operation so that the
> Linux NFS client can probe the NFS server if it doesn't see a
> CB_OFFLOAD in a reasonable amount of time.
>
> This patch implements a simplistic check. As future work, the client
> might also be able to detect whether there is no forward progress on
> the request asynchronous COPY operation, and CANCEL it.
>
> Suggested-by: Olga Kornievskaia <kolga@netapp.com>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=218735
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  fs/nfs/nfs42proc.c | 40 +++++++++++++++++++++++++++++++++-------
>  1 file changed, 33 insertions(+), 7 deletions(-)
>
> diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
> index c55247da8e49..246534bfc946 100644
> --- a/fs/nfs/nfs42proc.c
> +++ b/fs/nfs/nfs42proc.c
> @@ -175,6 +175,11 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
>         return err;
>  }
>
> +/* Wait this long before checking progress on a COPY operation */
> +enum {
> +       NFS42_COPY_TIMEOUT      = 5 * HZ,
> +};
> +
>  static int handle_async_copy(struct nfs42_copy_res *res,
>                              struct nfs_server *dst_server,
>                              struct nfs_server *src_server,
> @@ -184,9 +189,10 @@ static int handle_async_copy(struct nfs42_copy_res *res,
>                              bool *restart)
>  {
>         struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
> -       int status = NFS4_OK;
>         struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
>         struct nfs_open_context *src_ctx = nfs_file_open_context(src);
> +       int status = NFS4_OK;
> +       u64 copied;
>
>         copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
>         if (!copy)
> @@ -224,7 +230,9 @@ static int handle_async_copy(struct nfs42_copy_res *res,
>                 spin_unlock(&src_server->nfs_client->cl_lock);
>         }
>
> -       status = wait_for_completion_interruptible(&copy->completion);
> +wait:
> +       status = wait_for_completion_interruptible_timeout(&copy->completion,
> +                                                          NFS42_COPY_TIMEOUT);
>         spin_lock(&dst_server->nfs_client->cl_lock);
>         list_del_init(&copy->copies);
>         spin_unlock(&dst_server->nfs_client->cl_lock);
> @@ -233,12 +241,17 @@ static int handle_async_copy(struct nfs42_copy_res *res,
>                 list_del_init(&copy->src_copies);
>                 spin_unlock(&src_server->nfs_client->cl_lock);
>         }
> -       if (status == -ERESTARTSYS) {
> -               goto out_cancel;
> -       } else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
> -               status = -EAGAIN;
> -               *restart = true;
> +       switch (status) {
> +       case 0:
> +               goto timeout;
> +       case -ERESTARTSYS:
>                 goto out_cancel;
> +       default:
> +               if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
> +                       status = -EAGAIN;
> +                       *restart = true;
> +                       goto out_cancel;
> +               }
>         }
>  out:
>         res->write_res.count = copy->count;
> @@ -253,6 +266,19 @@ static int handle_async_copy(struct nfs42_copy_res *res,
>         if (!nfs42_files_from_same_server(src, dst))
>                 nfs42_do_offload_cancel_async(src, src_stateid);
>         goto out_free;
> +timeout:
> +       status = nfs42_proc_offload_status(src, src_stateid, &copied);
> +       switch (status) {
> +       case 0:
> +       case -EREMOTEIO:
> +               res->write_res.count = copied;
> +               memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf));
> +               goto out_free;

Setting aside the grouping these 2cases together, I don't understand
why the assumption that if we received a reply from OFFLOAD_STATUS
with some value back means that we should consider copy done? Say the
copy was for 1M, client queried and got back that 500M done, I don't
think the server by replying to the OFFLOAD_STATUS says it's done with
the copy? I think it replies with how much was done but it might still
be inprogress? So shouldn't we check that everything was done and if
not done go back to waiting again?

Again I don't think this approach is going to work because of how
callback and the copy thread are handled (as of now). In
handle_async_copy() when we come out of
wait_for_completion_interruptible() we know the callback has arrived
and it has signalled the copy thread and thus we remove the copy
request from the list. However, on the timeout, we didn't receive the
wake up and thus if we remove the copy from the list then, when the
callback thread asynchronously receives the callback it won't have the
request to match it too. And in case the server does support an
offload_status query I guess that's ok, but imagine it didn't. So now,
we'd send the offload_status and get not supported and we'd go back to
waiting but we'd already missed the callback because it came and
didn't find the matching request is now just dropped on the floor.

When we wake up from wait_for_completion_interruptible() we need to
know if we timed out or got woken. If we timed out, I think we need to
keep the request in.


> +       case -EINPROGRESS:
> +       case -EOPNOTSUPP:
> +               goto wait;
> +       }
> +       goto out;
>  }
>
>  static int process_copy_commit(struct file *dst, loff_t pos_dst,
> --
> 2.45.2
>
>
Chuck Lever July 2, 2024, 6:43 p.m. UTC | #2
> On Jul 2, 2024, at 2:30 PM, Olga Kornievskaia <aglo@umich.edu> wrote:
> 
> On Tue, Jul 2, 2024 at 11:21 AM <cel@kernel.org> wrote:
>> 
>> From: Chuck Lever <chuck.lever@oracle.com>
>> 
>> We've found that there are cases where a transport disconnection
>> results in the loss of callback RPCs. NFS servers typically do not
>> retransmit callback operations after a disconnect.
>> 
>> This can be a problem for the Linux NFS client's current
>> implementation of asynchronous COPY, which waits indefinitely for a
>> CB_OFFLOAD callback. If a transport disconnect occurs while an async
>> COPY is running, there's a good chance the client will never get the
>> completing CB_OFFLOAD.
>> 
>> Fix this by implementing the OFFLOAD_STATUS operation so that the
>> Linux NFS client can probe the NFS server if it doesn't see a
>> CB_OFFLOAD in a reasonable amount of time.
>> 
>> This patch implements a simplistic check. As future work, the client
>> might also be able to detect whether there is no forward progress on
>> the request asynchronous COPY operation, and CANCEL it.
>> 
>> Suggested-by: Olga Kornievskaia <kolga@netapp.com>
>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=218735
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> fs/nfs/nfs42proc.c | 40 +++++++++++++++++++++++++++++++++-------
>> 1 file changed, 33 insertions(+), 7 deletions(-)
>> 
>> diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
>> index c55247da8e49..246534bfc946 100644
>> --- a/fs/nfs/nfs42proc.c
>> +++ b/fs/nfs/nfs42proc.c
>> @@ -175,6 +175,11 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
>>        return err;
>> }
>> 
>> +/* Wait this long before checking progress on a COPY operation */
>> +enum {
>> +       NFS42_COPY_TIMEOUT      = 5 * HZ,
>> +};
>> +
>> static int handle_async_copy(struct nfs42_copy_res *res,
>>                             struct nfs_server *dst_server,
>>                             struct nfs_server *src_server,
>> @@ -184,9 +189,10 @@ static int handle_async_copy(struct nfs42_copy_res *res,
>>                             bool *restart)
>> {
>>        struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
>> -       int status = NFS4_OK;
>>        struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
>>        struct nfs_open_context *src_ctx = nfs_file_open_context(src);
>> +       int status = NFS4_OK;
>> +       u64 copied;
>> 
>>        copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
>>        if (!copy)
>> @@ -224,7 +230,9 @@ static int handle_async_copy(struct nfs42_copy_res *res,
>>                spin_unlock(&src_server->nfs_client->cl_lock);
>>        }
>> 
>> -       status = wait_for_completion_interruptible(&copy->completion);
>> +wait:
>> +       status = wait_for_completion_interruptible_timeout(&copy->completion,
>> +                                                          NFS42_COPY_TIMEOUT);
>>        spin_lock(&dst_server->nfs_client->cl_lock);
>>        list_del_init(&copy->copies);
>>        spin_unlock(&dst_server->nfs_client->cl_lock);
>> @@ -233,12 +241,17 @@ static int handle_async_copy(struct nfs42_copy_res *res,
>>                list_del_init(&copy->src_copies);
>>                spin_unlock(&src_server->nfs_client->cl_lock);
>>        }
>> -       if (status == -ERESTARTSYS) {
>> -               goto out_cancel;
>> -       } else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
>> -               status = -EAGAIN;
>> -               *restart = true;
>> +       switch (status) {
>> +       case 0:
>> +               goto timeout;
>> +       case -ERESTARTSYS:
>>                goto out_cancel;
>> +       default:
>> +               if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
>> +                       status = -EAGAIN;
>> +                       *restart = true;
>> +                       goto out_cancel;
>> +               }
>>        }
>> out:
>>        res->write_res.count = copy->count;
>> @@ -253,6 +266,19 @@ static int handle_async_copy(struct nfs42_copy_res *res,
>>        if (!nfs42_files_from_same_server(src, dst))
>>                nfs42_do_offload_cancel_async(src, src_stateid);
>>        goto out_free;
>> +timeout:
>> +       status = nfs42_proc_offload_status(src, src_stateid, &copied);
>> +       switch (status) {
>> +       case 0:
>> +       case -EREMOTEIO:
>> +               res->write_res.count = copied;
>> +               memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf));
>> +               goto out_free;
> 
> Setting aside the grouping these 2cases together, I don't understand
> why the assumption that if we received a reply from OFFLOAD_STATUS
> with some value back means that we should consider copy done? Say the
> copy was for 1M, client queried and got back that 500M done, I don't
> think the server by replying to the OFFLOAD_STATUS says it's done with
> the copy? I think it replies with how much was done but it might still
> be inprogress? So shouldn't we check that everything was done and if
> not done go back to waiting again?

The server responds to OFFLOAD_STATUS in one of the following ways:

1. nfsstat = NFS4_OK

1a. osr_complete has size zero. This means the copy is still in progress.
    osr_count contains the number of bytes copied so far.

1b. osr_complete has size one, and contains NFS4_OK. This means the
    copy operation has completed. osr_count contains the total
    number of bytes copied.

1c. osr_complete has size one, and contains an error status. This means
    the copy failed. osr_count contains the total number of byte
    that were copied before the failure occurred.

2. nfsstat != NFS4_OK generally means that the server does not
   recognize the copy stateid.

So, yes, the client can check, based on this information, whether
it needs to wait, retry, or abort. The code I added is not tested
so the new logic is not totally correct. And we can refine this
logic to match our preferred responses, as you listed above.


> Again I don't think this approach is going to work because of how
> callback and the copy thread are handled (as of now). In
> handle_async_copy() when we come out of
> wait_for_completion_interruptible() we know the callback has arrived
> and it has signalled the copy thread and thus we remove the copy
> request from the list. However, on the timeout, we didn't receive the
> wake up and thus if we remove the copy from the list then, when the
> callback thread asynchronously receives the callback it won't have the
> request to match it too. And in case the server does support an
> offload_status query I guess that's ok, but imagine it didn't. So now,
> we'd send the offload_status and get not supported and we'd go back to
> waiting but we'd already missed the callback because it came and
> didn't find the matching request is now just dropped on the floor.

If the client receives "OFFLOAD_STATUS is not supported" I would say
it should try the COPY again with the synchronous bit set. That's a
broken server, IMO.


> When we wake up from wait_for_completion_interruptible() we need to
> know if we timed out or got woken. If we timed out, I think we need to
> keep the request in.

Fair enough, I will try to remember to fix the list handling. The
wait_for_completion_interruptible does provide a return code that
enables the caller to distinguish between these cases.


>> +       case -EINPROGRESS:
>> +       case -EOPNOTSUPP:
>> +               goto wait;
>> +       }
>> +       goto out;
>> }
>> 
>> static int process_copy_commit(struct file *dst, loff_t pos_dst,
>> --
>> 2.45.2


--
Chuck Lever
Chuck Lever July 3, 2024, 6:52 p.m. UTC | #3
> On Jul 2, 2024, at 2:30 PM, Olga Kornievskaia <aglo@umich.edu> wrote:
> 
> Again I don't think this approach is going to work because of how
> callback and the copy thread are handled (as of now). In
> handle_async_copy() when we come out of
> wait_for_completion_interruptible() we know the callback has arrived
> and it has signalled the copy thread and thus we remove the copy
> request from the list. However, on the timeout, we didn't receive the
> wake up and thus if we remove the copy from the list then, when the
> callback thread asynchronously receives the callback it won't have the
> request to match it too. And in case the server does support an
> offload_status query I guess that's ok, but imagine it didn't. So now,
> we'd send the offload_status and get not supported and we'd go back to
> waiting but we'd already missed the callback because it came and
> didn't find the matching request is now just dropped on the floor.

If the client reports that it can't find a matching request,
then the server will keep the copy state ID (it's allowed to
delete the copy stateid /only if/ it gets an NFS4_OK in the
CB_OFFLOAD reply, as I read the spec).

The client will wait again, then send another OFFLOAD_STATUS
in a few seconds, and will see that the COPY completed. The
server is then allowed to delete the copy stateid.

---

Again, if a server doesn't support OFFLOAD_STATUS, the only
reliable recourse is for the client to stop using async COPY.
My patch series doesn't implement that, currently. It could,
say, send a dummy OFFLOAD_STATUS before its first COPY
operation to determine whether to use sync or async COPY
going forward.

The block layer folks I talked to at LSF unanimously stated
that there is no way to implement COPY offload reliably
without the ability for the client/initiator to probe copy
status.

IMO the spec should say (but probably does not) that server
MUST implement OFFLOAD_STATUS if it supports async COPY.
Likewise for the client.

--
Chuck Lever
diff mbox series

Patch

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index c55247da8e49..246534bfc946 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -175,6 +175,11 @@  int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
 	return err;
 }
 
+/* Wait this long before checking progress on a COPY operation */
+enum {
+	NFS42_COPY_TIMEOUT	= 5 * HZ,
+};
+
 static int handle_async_copy(struct nfs42_copy_res *res,
 			     struct nfs_server *dst_server,
 			     struct nfs_server *src_server,
@@ -184,9 +189,10 @@  static int handle_async_copy(struct nfs42_copy_res *res,
 			     bool *restart)
 {
 	struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
-	int status = NFS4_OK;
 	struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
 	struct nfs_open_context *src_ctx = nfs_file_open_context(src);
+	int status = NFS4_OK;
+	u64 copied;
 
 	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
 	if (!copy)
@@ -224,7 +230,9 @@  static int handle_async_copy(struct nfs42_copy_res *res,
 		spin_unlock(&src_server->nfs_client->cl_lock);
 	}
 
-	status = wait_for_completion_interruptible(&copy->completion);
+wait:
+	status = wait_for_completion_interruptible_timeout(&copy->completion,
+							   NFS42_COPY_TIMEOUT);
 	spin_lock(&dst_server->nfs_client->cl_lock);
 	list_del_init(&copy->copies);
 	spin_unlock(&dst_server->nfs_client->cl_lock);
@@ -233,12 +241,17 @@  static int handle_async_copy(struct nfs42_copy_res *res,
 		list_del_init(&copy->src_copies);
 		spin_unlock(&src_server->nfs_client->cl_lock);
 	}
-	if (status == -ERESTARTSYS) {
-		goto out_cancel;
-	} else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
-		status = -EAGAIN;
-		*restart = true;
+	switch (status) {
+	case 0:
+		goto timeout;
+	case -ERESTARTSYS:
 		goto out_cancel;
+	default:
+		if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
+			status = -EAGAIN;
+			*restart = true;
+			goto out_cancel;
+		}
 	}
 out:
 	res->write_res.count = copy->count;
@@ -253,6 +266,19 @@  static int handle_async_copy(struct nfs42_copy_res *res,
 	if (!nfs42_files_from_same_server(src, dst))
 		nfs42_do_offload_cancel_async(src, src_stateid);
 	goto out_free;
+timeout:
+	status = nfs42_proc_offload_status(src, src_stateid, &copied);
+	switch (status) {
+	case 0:
+	case -EREMOTEIO:
+		res->write_res.count = copied;
+		memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf));
+		goto out_free;
+	case -EINPROGRESS:
+	case -EOPNOTSUPP:
+		goto wait;
+	}
+	goto out;
 }
 
 static int process_copy_commit(struct file *dst, loff_t pos_dst,