
[v2,1/5] NFS: Fix error handling for O_DIRECT write scheduling

Message ID 20230904163441.11950-2-trondmy@kernel.org (mailing list archive)
State New, archived
Series Fix O_DIRECT writeback error paths

Commit Message

Trond Myklebust Sept. 4, 2023, 4:34 p.m. UTC
From: Trond Myklebust <trond.myklebust@hammerspace.com>

If we fail to schedule a request for transmission, there are 2
possibilities:
1) Either we hit a fatal error, and we just want to drop the remaining
   requests on the floor.
2) We were asked to try again, in which case we should allow the
   outstanding RPC calls to complete, so that we can recoalesce requests
   and try again.

Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to application")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/direct.c | 62 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 16 deletions(-)
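
In rough outline, the reworked nfs_direct_write_reschedule() loop handles
the two cases like this (a simplified sketch of the control flow, not the
literal diff; the full patch appears under "Patch" below):

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		req->wb_nio++;	/* bump the transmission count */
		if (!nfs_pageio_add_request(&desc, req)) {
			if (desc.pg_error == -EAGAIN)
				/* case 2: transient failure - let the
				 * outstanding RPCs complete, then
				 * recoalesce and try again */
				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			else
				/* case 1: fatal error - record it and drop
				 * the remaining requests */
				dreq->error = desc.pg_error ? desc.pg_error : -EIO;
			break;	/* leftover requests are cleaned up below */
		}
		nfs_release_request(req);
	}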

Comments

Benjamin Coddington Nov. 9, 2023, 3:05 p.m. UTC | #1
Hi Trond,

On 4 Sep 2023, at 12:34, trondmy@kernel.org wrote:

> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>
> If we fail to schedule a request for transmission, there are 2
> possibilities:
> 1) Either we hit a fatal error, and we just want to drop the remaining
>    requests on the floor.
> 2) We were asked to try again, in which case we should allow the
>    outstanding RPC calls to complete, so that we can recoalesce requests
>    and try again.
>
> Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to application")
> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> ---
>  fs/nfs/direct.c | 62 ++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 46 insertions(+), 16 deletions(-)
>
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index 47d892a1d363..ee88f0a6e7b8 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -528,10 +528,9 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
>  static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
>  {
>  	struct nfs_pageio_descriptor desc;
> -	struct nfs_page *req, *tmp;
> +	struct nfs_page *req;
>  	LIST_HEAD(reqs);
>  	struct nfs_commit_info cinfo;
> -	LIST_HEAD(failed);
>
>  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
>  	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
> @@ -549,27 +548,36 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
>  			      &nfs_direct_write_completion_ops);
>  	desc.pg_dreq = dreq;
>
> -	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
> +	while (!list_empty(&reqs)) {
> +		req = nfs_list_entry(reqs.next);
>  		/* Bump the transmission count */
>  		req->wb_nio++;
>  		if (!nfs_pageio_add_request(&desc, req)) {
> -			nfs_list_move_request(req, &failed);
>  			spin_lock(&cinfo.inode->i_lock);
> -			dreq->flags = 0;
> -			if (desc.pg_error < 0)
> +			if (dreq->error < 0) {
> +				desc.pg_error = dreq->error;
> +			} else if (desc.pg_error != -EAGAIN) {
> +				dreq->flags = 0;
> +				if (!desc.pg_error)
> +					desc.pg_error = -EIO;
>  				dreq->error = desc.pg_error;
> -			else
> -				dreq->error = -EIO;
> +			} else
> +				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
>  			spin_unlock(&cinfo.inode->i_lock);
> +			break;
>  		}
>  		nfs_release_request(req);
>  	}
>  	nfs_pageio_complete(&desc);
>
> -	while (!list_empty(&failed)) {
> -		req = nfs_list_entry(failed.next);
> +	while (!list_empty(&reqs)) {
> +		req = nfs_list_entry(reqs.next);
>  		nfs_list_remove_request(req);
>  		nfs_unlock_and_release_request(req);
> +		if (desc.pg_error == -EAGAIN)
> +			nfs_mark_request_commit(req, NULL, &cinfo, 0);
> +		else
> +			nfs_release_request(req);
>  	}
>
>  	if (put_dreq(dreq))
> @@ -794,9 +802,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
>  {
>  	struct nfs_pageio_descriptor desc;
>  	struct inode *inode = dreq->inode;
> +	struct nfs_commit_info cinfo;
>  	ssize_t result = 0;
>  	size_t requested_bytes = 0;
>  	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
> +	bool defer = false;
>
>  	trace_nfs_direct_write_schedule_iovec(dreq);
>
> @@ -837,17 +847,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
>  				break;
>  			}
>
> -			nfs_lock_request(req);
> -			if (!nfs_pageio_add_request(&desc, req)) {
> -				result = desc.pg_error;
> -				nfs_unlock_and_release_request(req);
> -				break;
> -			}
>  			pgbase = 0;
>  			bytes -= req_len;
>  			requested_bytes += req_len;
>  			pos += req_len;
>  			dreq->bytes_left -= req_len;
> +
> +			if (defer) {
> +				nfs_mark_request_commit(req, NULL, &cinfo, 0);
> +				continue;
> +			}
> +
> +			nfs_lock_request(req);
> +			if (nfs_pageio_add_request(&desc, req))
> +				continue;

Just a note, we've hit some trouble with this one on pNFS SCSI.

When we re-order the update of dreq->bytes_left and
nfs_pageio_add_request(), the blocklayout driver machinery ends up with bad
calculations for LAYOUTGET on the first req. What I see is a short
LAYOUTGET, and then a 2nd LAYOUTGET for the last req with loga_length 0,
causing some havoc.

Eventually there's a corruption somewhere - I think because
nfs_pageio_add_request() ends up doing nfs_pageio_doio() in the middle of
this and then we race with something in completion - I haven't quite been
able to figure that part out yet.

Ben
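
To make the ordering issue concrete, here is a simplified before/after
sketch of the request loop in nfs_direct_write_schedule_iovec() (an
illustration only, not code from the series; the real hunks are quoted
above):

	/* Before the patch: bytes_left is decremented after the request has
	 * been added, so the blocklayout driver's ->pg_init still sees the
	 * full remaining byte count when it sizes its LAYOUTGET. */
	nfs_lock_request(req);
	nfs_pageio_add_request(&desc, req);	/* may call nfs_dreq_bytes_left() */
	dreq->bytes_left -= req_len;

	/* After the patch: the decrement happens first, so for the first
	 * request the LAYOUTGET is sized req_len too short, and for the
	 * last request bytes_left is already 0 (hence loga_length 0). */
	dreq->bytes_left -= req_len;
	nfs_lock_request(req);
	nfs_pageio_add_request(&desc, req);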
Trond Myklebust Nov. 9, 2023, 4:53 p.m. UTC | #2
On Thu, 2023-11-09 at 10:05 -0500, Benjamin Coddington wrote:
> Hi Trond,
> 
> On 4 Sep 2023, at 12:34, trondmy@kernel.org wrote:
> 
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > If we fail to schedule a request for transmission, there are 2
> > possibilities:
> > 1) Either we hit a fatal error, and we just want to drop the remaining
> >    requests on the floor.
> > 2) We were asked to try again, in which case we should allow the
> >    outstanding RPC calls to complete, so that we can recoalesce requests
> >    and try again.
> > 
> > Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to application")
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > ---
> >  fs/nfs/direct.c | 62 ++++++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 46 insertions(+), 16 deletions(-)
> > 
> > diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> > index 47d892a1d363..ee88f0a6e7b8 100644
> > --- a/fs/nfs/direct.c
> > +++ b/fs/nfs/direct.c
> > @@ -528,10 +528,9 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
> >  static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
> >  {
> >  	struct nfs_pageio_descriptor desc;
> > -	struct nfs_page *req, *tmp;
> > +	struct nfs_page *req;
> >  	LIST_HEAD(reqs);
> >  	struct nfs_commit_info cinfo;
> > -	LIST_HEAD(failed);
> > 
> >  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
> >  	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
> > @@ -549,27 +548,36 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
> >  			      &nfs_direct_write_completion_ops);
> >  	desc.pg_dreq = dreq;
> > 
> > -	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
> > +	while (!list_empty(&reqs)) {
> > +		req = nfs_list_entry(reqs.next);
> >  		/* Bump the transmission count */
> >  		req->wb_nio++;
> >  		if (!nfs_pageio_add_request(&desc, req)) {
> > -			nfs_list_move_request(req, &failed);
> >  			spin_lock(&cinfo.inode->i_lock);
> > -			dreq->flags = 0;
> > -			if (desc.pg_error < 0)
> > +			if (dreq->error < 0) {
> > +				desc.pg_error = dreq->error;
> > +			} else if (desc.pg_error != -EAGAIN) {
> > +				dreq->flags = 0;
> > +				if (!desc.pg_error)
> > +					desc.pg_error = -EIO;
> >  				dreq->error = desc.pg_error;
> > -			else
> > -				dreq->error = -EIO;
> > +			} else
> > +				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
> >  			spin_unlock(&cinfo.inode->i_lock);
> > +			break;
> >  		}
> >  		nfs_release_request(req);
> >  	}
> >  	nfs_pageio_complete(&desc);
> > 
> > -	while (!list_empty(&failed)) {
> > -		req = nfs_list_entry(failed.next);
> > +	while (!list_empty(&reqs)) {
> > +		req = nfs_list_entry(reqs.next);
> >  		nfs_list_remove_request(req);
> >  		nfs_unlock_and_release_request(req);
> > +		if (desc.pg_error == -EAGAIN)
> > +			nfs_mark_request_commit(req, NULL, &cinfo, 0);
> > +		else
> > +			nfs_release_request(req);
> >  	}
> > 
> >  	if (put_dreq(dreq))
> > @@ -794,9 +802,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
> >  {
> >  	struct nfs_pageio_descriptor desc;
> >  	struct inode *inode = dreq->inode;
> > +	struct nfs_commit_info cinfo;
> >  	ssize_t result = 0;
> >  	size_t requested_bytes = 0;
> >  	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
> > +	bool defer = false;
> > 
> >  	trace_nfs_direct_write_schedule_iovec(dreq);
> > 
> > @@ -837,17 +847,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
> >  				break;
> >  			}
> > 
> > -			nfs_lock_request(req);
> > -			if (!nfs_pageio_add_request(&desc, req)) {
> > -				result = desc.pg_error;
> > -				nfs_unlock_and_release_request(req);
> > -				break;
> > -			}
> >  			pgbase = 0;
> >  			bytes -= req_len;
> >  			requested_bytes += req_len;
> >  			pos += req_len;
> >  			dreq->bytes_left -= req_len;
> > +
> > +			if (defer) {
> > +				nfs_mark_request_commit(req, NULL, &cinfo, 0);
> > +				continue;
> > +			}
> > +
> > +			nfs_lock_request(req);
> > +			if (nfs_pageio_add_request(&desc, req))
> > +				continue;
> 
> Just a note, we've hit some trouble with this one on pNFS SCSI.
> 
> When we re-order the update of dreq->bytes_left and
> nfs_pageio_add_request(), the blocklayout driver machinery ends up with bad
> calculations for LAYOUTGET on the first req. What I see is a short
> LAYOUTGET, and then a 2nd LAYOUTGET for the last req with loga_length 0,
> causing some havoc.
> 
> Eventually there's a corruption somewhere - I think because
> nfs_pageio_add_request() ends up doing nfs_pageio_doio() in the middle of
> this and then we race with something in completion - I haven't quite been
> able to figure that part out yet.
> 

Hi Ben,

Relying on the value of dreq->bytes_left is just not a good idea, given
that the layoutget request could end up returning NFS4ERR_DELAY.

How about something like the following patch?

8<-----------------------------------------------------
From e68e9c928c9d843c482ec08e1d26350b7999cafa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 9 Nov 2023 11:46:28 -0500
Subject: [PATCH] pNFS: Fix the pnfs block driver's calculation of layoutget
 size

Instead of relying on the value of the 'bytes_left' field, we should
calculate the layout size based on the offset of the request that is
being written out.

Reported-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/blocklayout/blocklayout.c | 5 ++---
 fs/nfs/direct.c                  | 5 +++--
 fs/nfs/internal.h                | 2 +-
 fs/nfs/pnfs.c                    | 3 ++-
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 943aeea1eb16..c1cc9fe93dd4 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -893,10 +893,9 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 	}
 
 	if (pgio->pg_dreq == NULL)
-		wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
-					      req->wb_index);
+		wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index);
 	else
-		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req));
 
 	pnfs_generic_pg_init_write(pgio, req, wb_size);
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f6c74f424691..5918c67dae0d 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -205,9 +205,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
 	kref_put(&dreq->kref, nfs_direct_req_free);
 }
 
-ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
+ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
 {
-	return dreq->bytes_left;
+	loff_t start = offset - dreq->io_start;
+	return dreq->max_count - start;
 }
 EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9c9cf764f600..b1fa81c9dff6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -655,7 +655,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
 /* direct.c */
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 			      struct nfs_direct_req *dreq);
-extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset);
 
 /* nfs4proc.c */
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 21a365357629..0c0fed1ecd0b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2733,7 +2733,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
 		if (pgio->pg_dreq == NULL)
 			rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
 		else
-			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq,
+						      req_offset(req));
 
 		pgio->pg_lseg =
 			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
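
The effect of the change is easiest to see with a small worked example
(the numbers are illustrative, not taken from the thread):

	/* A 1 MiB O_DIRECT write starting at file offset 4 MiB:
	 *   dreq->io_start  = 4 MiB
	 *   dreq->max_count = 1 MiB
	 * For the request at offset 4 MiB + 256 KiB:
	 *   nfs_dreq_bytes_left(dreq, req_offset(req))
	 *     = dreq->max_count - (req_offset(req) - dreq->io_start)
	 *     = 1 MiB - 256 KiB
	 *     = 768 KiB
	 * The result depends only on the request's offset, not on how far
	 * dreq->bytes_left has already been decremented, so a layout that
	 * has to be re-requested after NFS4ERR_DELAY is still sized for
	 * the full remaining range. */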
Benjamin Coddington Nov. 9, 2023, 5:25 p.m. UTC | #3
On 9 Nov 2023, at 11:53, Trond Myklebust wrote:
>
> Hi Ben,
>
> Relying on the value of dreq->bytes_left is just not a good idea, given
> that the layoutget request could end up returning NFS4ERR_DELAY.
>
> How about something like the following patch?

That looks promising!  I think ->bytes_left could get dropped after this.

I'll send it through some testing and report back, thanks!

Ben
Benjamin Coddington Nov. 15, 2023, 1:04 p.m. UTC | #4
On 9 Nov 2023, at 12:25, Benjamin Coddington wrote:

> On 9 Nov 2023, at 11:53, Trond Myklebust wrote:
>>
>> Hi Ben,
>>
>> Relying on the value of dreq->bytes_left is just not a good idea, given
>> that the layoutget request could end up returning NFS4ERR_DELAY.
>>
>> How about something like the following patch?
>
> That looks promising!  I think ->bytes_left could get dropped after this.
>
> I'll send it through some testing and report back, thanks!

This definitely fixes it, sorry for the delay getting back.

Fixes: 954998b60caa ("NFS: Fix error handling for O_DIRECT write scheduling")
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>

It creates some clear additional work to remove nfs_direct_req->bytes_left
(I don't think it's needed anymore) and fix up the nfs_direct_req_class
tracepoint, which could be a follow-up patch or get folded in.

Ben
Trond Myklebust Nov. 15, 2023, 5:16 p.m. UTC | #5
On Wed, 2023-11-15 at 08:04 -0500, Benjamin Coddington wrote:
> On 9 Nov 2023, at 12:25, Benjamin Coddington wrote:
> 
> > On 9 Nov 2023, at 11:53, Trond Myklebust wrote:
> > > 
> > > Hi Ben,
> > > 
> > > Relying on the value of dreq->bytes_left is just not a good idea, given
> > > that the layoutget request could end up returning NFS4ERR_DELAY.
> > > 
> > > How about something like the following patch?
> > 
> > That looks promising!  I think ->bytes_left could get dropped after
> > this.
> > 
> > I'll send it through some testing and report back, thanks!
> 
> This definitely fixes it, sorry for the delay getting back.
> 
> Fixes: 954998b60caa ("NFS: Fix error handling for O_DIRECT write scheduling")
> Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
> Tested-by: Benjamin Coddington <bcodding@redhat.com>
> 
> It creates some clear additional work to remove nfs_direct_req->bytes_left
> (I don't think it's needed anymore) and fix up the nfs_direct_req_class
> tracepoint, which could be a follow-up patch or get folded in.
> 

Thank you! I'll queue that patch up so it gets included in the next
bugfix pull request.

I agree that we should get rid of the bytes_left field. We can queue
something up for that in the next merge window.
Benjamin Coddington Nov. 15, 2023, 9:30 p.m. UTC | #6
On 15 Nov 2023, at 12:16, Trond Myklebust wrote:

> On Wed, 2023-11-15 at 08:04 -0500, Benjamin Coddington wrote:
>> On 9 Nov 2023, at 12:25, Benjamin Coddington wrote:
>>
>>> On 9 Nov 2023, at 11:53, Trond Myklebust wrote:
>>>>
>>>> Hi Ben,
>>>>
>>>> Relying on the value of dreq->bytes_left is just not a good idea, given
>>>> that the layoutget request could end up returning NFS4ERR_DELAY.
>>>>
>>>> How about something like the following patch?
>>>
>>> That looks promising!  I think ->bytes_left could get dropped after
>>> this.
>>>
>>> I'll send it through some testing and report back, thanks!
>>
>> This definitely fixes it, sorry for the delay getting back.
>>
>> Fixes: 954998b60caa ("NFS: Fix error handling for O_DIRECT write scheduling")
>> Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
>> Tested-by: Benjamin Coddington <bcodding@redhat.com>
>>
>> It creates some clear additional work to remove nfs_direct_req->bytes_left
>> (I don't think it's needed anymore) and fix up the nfs_direct_req_class
>> tracepoint, which could be a follow-up patch or get folded in.
>>
>
> Thank you! I'll queue that patch up so it gets included in the next
> bugfix pull request.

Thank you for the fix.

> I agree that we should get rid of the bytes_left field. We can queue
> something up for that in the next merge window.

I have it tested already with yours; I'll send it along.

Ben

Patch

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 47d892a1d363..ee88f0a6e7b8 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -528,10 +528,9 @@  nfs_direct_write_scan_commit_list(struct inode *inode,
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_pageio_descriptor desc;
-	struct nfs_page *req, *tmp;
+	struct nfs_page *req;
 	LIST_HEAD(reqs);
 	struct nfs_commit_info cinfo;
-	LIST_HEAD(failed);
 
 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
 	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
@@ -549,27 +548,36 @@  static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 			      &nfs_direct_write_completion_ops);
 	desc.pg_dreq = dreq;
 
-	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
+	while (!list_empty(&reqs)) {
+		req = nfs_list_entry(reqs.next);
 		/* Bump the transmission count */
 		req->wb_nio++;
 		if (!nfs_pageio_add_request(&desc, req)) {
-			nfs_list_move_request(req, &failed);
 			spin_lock(&cinfo.inode->i_lock);
-			dreq->flags = 0;
-			if (desc.pg_error < 0)
+			if (dreq->error < 0) {
+				desc.pg_error = dreq->error;
+			} else if (desc.pg_error != -EAGAIN) {
+				dreq->flags = 0;
+				if (!desc.pg_error)
+					desc.pg_error = -EIO;
 				dreq->error = desc.pg_error;
-			else
-				dreq->error = -EIO;
+			} else
+				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
 			spin_unlock(&cinfo.inode->i_lock);
+			break;
 		}
 		nfs_release_request(req);
 	}
 	nfs_pageio_complete(&desc);
 
-	while (!list_empty(&failed)) {
-		req = nfs_list_entry(failed.next);
+	while (!list_empty(&reqs)) {
+		req = nfs_list_entry(reqs.next);
 		nfs_list_remove_request(req);
 		nfs_unlock_and_release_request(req);
+		if (desc.pg_error == -EAGAIN)
+			nfs_mark_request_commit(req, NULL, &cinfo, 0);
+		else
+			nfs_release_request(req);
 	}
 
 	if (put_dreq(dreq))
@@ -794,9 +802,11 @@  static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 {
 	struct nfs_pageio_descriptor desc;
 	struct inode *inode = dreq->inode;
+	struct nfs_commit_info cinfo;
 	ssize_t result = 0;
 	size_t requested_bytes = 0;
 	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
+	bool defer = false;
 
 	trace_nfs_direct_write_schedule_iovec(dreq);
 
@@ -837,17 +847,37 @@  static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 				break;
 			}
 
-			nfs_lock_request(req);
-			if (!nfs_pageio_add_request(&desc, req)) {
-				result = desc.pg_error;
-				nfs_unlock_and_release_request(req);
-				break;
-			}
 			pgbase = 0;
 			bytes -= req_len;
 			requested_bytes += req_len;
 			pos += req_len;
 			dreq->bytes_left -= req_len;
+
+			if (defer) {
+				nfs_mark_request_commit(req, NULL, &cinfo, 0);
+				continue;
+			}
+
+			nfs_lock_request(req);
+			if (nfs_pageio_add_request(&desc, req))
+				continue;
+
+			/* Exit on hard errors */
+			if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
+				result = desc.pg_error;
+				nfs_unlock_and_release_request(req);
+				break;
+			}
+
+			/* If the error is soft, defer remaining requests */
+			nfs_init_cinfo_from_dreq(&cinfo, dreq);
+			spin_lock(&cinfo.inode->i_lock);
+			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+			spin_unlock(&cinfo.inode->i_lock);
+			nfs_unlock_request(req);
+			nfs_mark_request_commit(req, NULL, &cinfo, 0);
+			desc.pg_error = 0;
+			defer = true;
 		}
 		nfs_direct_release_pages(pagevec, npages);
 		kvfree(pagevec);