Message ID | 20230904163441.11950-2-trondmy@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Fix O_DIRECT writeback error paths | expand |
Hi Trond, On 4 Sep 2023, at 12:34, trondmy@kernel.org wrote: > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > If we fail to schedule a request for transmission, there are 2 > possibilities: > 1) Either we hit a fatal error, and we just want to drop the remaining > requests on the floor. > 2) We were asked to try again, in which case we should allow the > outstanding RPC calls to complete, so that we can recoalesce requests > and try again. > > Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to application") > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > --- > fs/nfs/direct.c | 62 ++++++++++++++++++++++++++++++++++++------------- > 1 file changed, 46 insertions(+), 16 deletions(-) > > diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c > index 47d892a1d363..ee88f0a6e7b8 100644 > --- a/fs/nfs/direct.c > +++ b/fs/nfs/direct.c > @@ -528,10 +528,9 @@ nfs_direct_write_scan_commit_list(struct inode *inode, > static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) > { > struct nfs_pageio_descriptor desc; > - struct nfs_page *req, *tmp; > + struct nfs_page *req; > LIST_HEAD(reqs); > struct nfs_commit_info cinfo; > - LIST_HEAD(failed); > > nfs_init_cinfo_from_dreq(&cinfo, dreq); > nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); > @@ -549,27 +548,36 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) > &nfs_direct_write_completion_ops); > desc.pg_dreq = dreq; > > - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { > + while (!list_empty(&reqs)) { > + req = nfs_list_entry(reqs.next); > /* Bump the transmission count */ > req->wb_nio++; > if (!nfs_pageio_add_request(&desc, req)) { > - nfs_list_move_request(req, &failed); > spin_lock(&cinfo.inode->i_lock); > - dreq->flags = 0; > - if (desc.pg_error < 0) > + if (dreq->error < 0) { > + desc.pg_error = dreq->error; > + } else if (desc.pg_error != -EAGAIN) { > + dreq->flags = 0; > + if (!desc.pg_error) > + desc.pg_error = -EIO; > dreq->error = 
desc.pg_error; > - else > - dreq->error = -EIO; > + } else > + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; > spin_unlock(&cinfo.inode->i_lock); > + break; > } > nfs_release_request(req); > } > nfs_pageio_complete(&desc); > > - while (!list_empty(&failed)) { > - req = nfs_list_entry(failed.next); > + while (!list_empty(&reqs)) { > + req = nfs_list_entry(reqs.next); > nfs_list_remove_request(req); > nfs_unlock_and_release_request(req); > + if (desc.pg_error == -EAGAIN) > + nfs_mark_request_commit(req, NULL, &cinfo, 0); > + else > + nfs_release_request(req); > } > > if (put_dreq(dreq)) > @@ -794,9 +802,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, > { > struct nfs_pageio_descriptor desc; > struct inode *inode = dreq->inode; > + struct nfs_commit_info cinfo; > ssize_t result = 0; > size_t requested_bytes = 0; > size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); > + bool defer = false; > > trace_nfs_direct_write_schedule_iovec(dreq); > > @@ -837,17 +847,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, > break; > } > > - nfs_lock_request(req); > - if (!nfs_pageio_add_request(&desc, req)) { > - result = desc.pg_error; > - nfs_unlock_and_release_request(req); > - break; > - } > pgbase = 0; > bytes -= req_len; > requested_bytes += req_len; > pos += req_len; > dreq->bytes_left -= req_len; > + > + if (defer) { > + nfs_mark_request_commit(req, NULL, &cinfo, 0); > + continue; > + } > + > + nfs_lock_request(req); > + if (nfs_pageio_add_request(&desc, req)) > + continue; Just a note, we've hit some trouble with this one on pNFS SCSI. When we re-order the update of dreq->bytes_left and nfs_pageio_add_request(), the blocklayout driver machinery ends up with bad calculations for LAYOUTGET on the first req. What I see is a short LAYOUTGET, and then a 2nd LAYOUGET for the last req with loga_length 0, causing some havoc. 
Eventually, there's a corruption somewhere - I think because nfs_pageio_add_request() ends up doing nfs_pageio_doio() in the middle of this thing and then we race to something in completion - I haven't quite been able to figure that part out yet. Ben
On Thu, 2023-11-09 at 10:05 -0500, Benjamin Coddington wrote: > Hi Trond, > > On 4 Sep 2023, at 12:34, trondmy@kernel.org wrote: > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > If we fail to schedule a request for transmission, there are 2 > > possibilities: > > 1) Either we hit a fatal error, and we just want to drop the > > remaining > > requests on the floor. > > 2) We were asked to try again, in which case we should allow the > > outstanding RPC calls to complete, so that we can recoalesce > > requests > > and try again. > > > > Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to > > application") > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > --- > > fs/nfs/direct.c | 62 ++++++++++++++++++++++++++++++++++++--------- > > ---- > > 1 file changed, 46 insertions(+), 16 deletions(-) > > > > diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c > > index 47d892a1d363..ee88f0a6e7b8 100644 > > --- a/fs/nfs/direct.c > > +++ b/fs/nfs/direct.c > > @@ -528,10 +528,9 @@ nfs_direct_write_scan_commit_list(struct inode > > *inode, > > static void nfs_direct_write_reschedule(struct nfs_direct_req > > *dreq) > > { > > struct nfs_pageio_descriptor desc; > > - struct nfs_page *req, *tmp; > > + struct nfs_page *req; > > LIST_HEAD(reqs); > > struct nfs_commit_info cinfo; > > - LIST_HEAD(failed); > > > > nfs_init_cinfo_from_dreq(&cinfo, dreq); > > nfs_direct_write_scan_commit_list(dreq->inode, &reqs, > > &cinfo); > > @@ -549,27 +548,36 @@ static void > > nfs_direct_write_reschedule(struct nfs_direct_req *dreq) > > &nfs_direct_write_completion_ops); > > desc.pg_dreq = dreq; > > > > - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { > > + while (!list_empty(&reqs)) { > > + req = nfs_list_entry(reqs.next); > > /* Bump the transmission count */ > > req->wb_nio++; > > if (!nfs_pageio_add_request(&desc, req)) { > > - nfs_list_move_request(req, &failed); > > spin_lock(&cinfo.inode->i_lock); > > - dreq->flags = 0; > > - if (desc.pg_error < 
0) > > + if (dreq->error < 0) { > > + desc.pg_error = dreq->error; > > + } else if (desc.pg_error != -EAGAIN) { > > + dreq->flags = 0; > > + if (!desc.pg_error) > > + desc.pg_error = -EIO; > > dreq->error = desc.pg_error; > > - else > > - dreq->error = -EIO; > > + } else > > + dreq->flags = > > NFS_ODIRECT_RESCHED_WRITES; > > spin_unlock(&cinfo.inode->i_lock); > > + break; > > } > > nfs_release_request(req); > > } > > nfs_pageio_complete(&desc); > > > > - while (!list_empty(&failed)) { > > - req = nfs_list_entry(failed.next); > > + while (!list_empty(&reqs)) { > > + req = nfs_list_entry(reqs.next); > > nfs_list_remove_request(req); > > nfs_unlock_and_release_request(req); > > + if (desc.pg_error == -EAGAIN) > > + nfs_mark_request_commit(req, NULL, &cinfo, > > 0); > > + else > > + nfs_release_request(req); > > } > > > > if (put_dreq(dreq)) > > @@ -794,9 +802,11 @@ static ssize_t > > nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, > > { > > struct nfs_pageio_descriptor desc; > > struct inode *inode = dreq->inode; > > + struct nfs_commit_info cinfo; > > ssize_t result = 0; > > size_t requested_bytes = 0; > > size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, > > PAGE_SIZE); > > + bool defer = false; > > > > trace_nfs_direct_write_schedule_iovec(dreq); > > > > @@ -837,17 +847,37 @@ static ssize_t > > nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, > > break; > > } > > > > - nfs_lock_request(req); > > - if (!nfs_pageio_add_request(&desc, req)) { > > - result = desc.pg_error; > > - > > nfs_unlock_and_release_request(req); > > - break; > > - } > > pgbase = 0; > > bytes -= req_len; > > requested_bytes += req_len; > > pos += req_len; > > dreq->bytes_left -= req_len; > > + > > + if (defer) { > > + nfs_mark_request_commit(req, NULL, > > &cinfo, 0); > > + continue; > > + } > > + > > + nfs_lock_request(req); > > + if (nfs_pageio_add_request(&desc, req)) > > + continue; > > Just a note, we've hit some trouble with this one on pNFS SCSI. 
> > When we re-order the update of dreq->bytes_left and > nfs_pageio_add_request(), the blocklayout driver machinery ends up > with bad > calculations for LAYOUTGET on the first req. What I see is a short > LAYOUTGET, and then a 2nd LAYOUGET for the last req with loga_length > 0, > causing some havoc. > > Eventually, there's a corruption somewhere - I think because > nfs_pageio_add_request() ends up doing nfs_pageio_do() in the middle > of this > thing and then we race to something in completion - I haven't quite > been > able to figure that part out yet. > Hi Ben, Relying on the value of dreq->bytes_left is just not a good idea, given that the layoutget request could end up returning NFS4ERR_DELAY. How about something like the following patch? 8<----------------------------------------------------- From e68e9c928c9d843c482ec08e1d26350b7999cafa Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@hammerspace.com> Date: Thu, 9 Nov 2023 11:46:28 -0500 Subject: [PATCH] pNFS: Fix the pnfs block driver's calculation of layoutget size Instead of relying on the value of the 'bytes_left' field, we should calculate the layout size based on the offset of the request that is being written out. 
Reported-by: Benjamin Coddington <bcodding@redhat.com> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> --- fs/nfs/blocklayout/blocklayout.c | 5 ++--- fs/nfs/direct.c | 5 +++-- fs/nfs/internal.h | 2 +- fs/nfs/pnfs.c | 3 ++- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 943aeea1eb16..c1cc9fe93dd4 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -893,10 +893,9 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } if (pgio->pg_dreq == NULL) - wb_size = pnfs_num_cont_bytes(pgio->pg_inode, - req->wb_index); + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index); else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req)); pnfs_generic_pg_init_write(pgio, req, wb_size); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f6c74f424691..5918c67dae0d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -205,9 +205,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq) kref_put(&dreq->kref, nfs_direct_req_free); } -ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset) { - return dreq->bytes_left; + loff_t start = offset - dreq->io_start; + return dreq->max_count - start; } EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9c9cf764f600..b1fa81c9dff6 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -655,7 +655,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); -extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct 
nfs_client *clp, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 21a365357629..0c0fed1ecd0b 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2733,7 +2733,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else - rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq, + req_offset(req)); pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
On 9 Nov 2023, at 11:53, Trond Myklebust wrote: > > Hi Ben, > > Relying on the value of dreq->bytes_left is just not a good idea, given > that the layoutget request could end up returning NFS4ERR_DELAY. > > How about something like the following patch? That looks promising! I think ->bytes_left could get dropped after this. I'll send it through some testing and report back, thanks! Ben
On 9 Nov 2023, at 12:25, Benjamin Coddington wrote: > On 9 Nov 2023, at 11:53, Trond Myklebust wrote: >> >> Hi Ben, >> >> Relying on the value of dreq->bytes_left is just not a good idea, given >> that the layoutget request could end up returning NFS4ERR_DELAY. >> >> How about something like the following patch? > > That looks promising! I think ->bytes_left could get dropped after this. > > I'll send it through some testing and report back, thanks! This definitely fixes it, sorry for the delay getting back. Fixes: 954998b60caa ("NFS: Fix error handling for O_DIRECT write scheduling") Reviewed-by: Benjamin Coddington <bcodding@redhat.com> Tested-by: Benjamin Coddington <bcodding@redhat.com> It creates some clear additional work to remove nfs_direct_req->bytes_left (I don't think it's needed anymore) and fix up the nfs_direct_req_class tracepoint, which could be a follow-up patch or get folded in. Ben
On Wed, 2023-11-15 at 08:04 -0500, Benjamin Coddington wrote: > On 9 Nov 2023, at 12:25, Benjamin Coddington wrote: > > > On 9 Nov 2023, at 11:53, Trond Myklebust wrote: > > > > > > Hi Ben, > > > > > > Relying on the value of dreq->bytes_left is just not a good idea, > > > given > > > that the layoutget request could end up returning NFS4ERR_DELAY. > > > > > > How about something like the following patch? > > > > That looks promising! I think ->bytes_left could get dropped after > > this. > > > > I'll send it through some testing and report back, thanks! > > This definitely fixes it, sorry for the delay getting back. > > Fixes: 954998b60caa ("NFS: Fix error handling for O_DIRECT write > scheduling") > Reviewed-by: Benjamin Coddington <bcodding@redhat.com> > Tested-by: Benjamin Coddington <bcodding@redhat.com> > > It creates some clear additional work to remove nfs_direct_req- > >bytes_left > (I don't think its needed anymore) and fixup the nfs_direct_req_class > tracepoint, which could be a follow-up patch or get folded in. > Thank you! I'll queue that patch up so it gets included in the next bugfix pull request. I agree that we should get rid of the bytes_left field. We can queue something up for that in the next merge window.
On 15 Nov 2023, at 12:16, Trond Myklebust wrote: > On Wed, 2023-11-15 at 08:04 -0500, Benjamin Coddington wrote: >> On 9 Nov 2023, at 12:25, Benjamin Coddington wrote: >> >>> On 9 Nov 2023, at 11:53, Trond Myklebust wrote: >>>> >>>> Hi Ben, >>>> >>>> Relying on the value of dreq->bytes_left is just not a good idea, >>>> given >>>> that the layoutget request could end up returning NFS4ERR_DELAY. >>>> >>>> How about something like the following patch? >>> >>> That looks promising! I think ->bytes_left could get dropped after >>> this. >>> >>> I'll send it through some testing and report back, thanks! >> >> This definitely fixes it, sorry for the delay getting back. >> >> Fixes: 954998b60caa ("NFS: Fix error handling for O_DIRECT write >> scheduling") >> Reviewed-by: Benjamin Coddington <bcodding@redhat.com> >> Tested-by: Benjamin Coddington <bcodding@redhat.com> >> >> It creates some clear additional work to remove nfs_direct_req- >>> bytes_left >> (I don't think its needed anymore) and fixup the nfs_direct_req_class >> tracepoint, which could be a follow-up patch or get folded in. >> > > Thank you! I'll queue that patch up so it gets included in the next > bugfix pull request. Thank you for the fix. > I agree that we should get rid of the bytes_left field. We can queue > something up for that in the next merge window. I have it tested already with yours, I'll send it along. Ben
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 47d892a1d363..ee88f0a6e7b8 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -528,10 +528,9 @@ nfs_direct_write_scan_commit_list(struct inode *inode, static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; - struct nfs_page *req, *tmp; + struct nfs_page *req; LIST_HEAD(reqs); struct nfs_commit_info cinfo; - LIST_HEAD(failed); nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); @@ -549,27 +548,36 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); /* Bump the transmission count */ req->wb_nio++; if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_move_request(req, &failed); spin_lock(&cinfo.inode->i_lock); - dreq->flags = 0; - if (desc.pg_error < 0) + if (dreq->error < 0) { + desc.pg_error = dreq->error; + } else if (desc.pg_error != -EAGAIN) { + dreq->flags = 0; + if (!desc.pg_error) + desc.pg_error = -EIO; dreq->error = desc.pg_error; - else - dreq->error = -EIO; + } else + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; spin_unlock(&cinfo.inode->i_lock); + break; } nfs_release_request(req); } nfs_pageio_complete(&desc); - while (!list_empty(&failed)) { - req = nfs_list_entry(failed.next); + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); nfs_unlock_and_release_request(req); + if (desc.pg_error == -EAGAIN) + nfs_mark_request_commit(req, NULL, &cinfo, 0); + else + nfs_release_request(req); } if (put_dreq(dreq)) @@ -794,9 +802,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; + struct nfs_commit_info cinfo; ssize_t result = 0; size_t requested_bytes = 0; size_t wsize = 
max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + bool defer = false; trace_nfs_direct_write_schedule_iovec(dreq); @@ -837,17 +847,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } - nfs_lock_request(req); - if (!nfs_pageio_add_request(&desc, req)) { - result = desc.pg_error; - nfs_unlock_and_release_request(req); - break; - } pgbase = 0; bytes -= req_len; requested_bytes += req_len; pos += req_len; dreq->bytes_left -= req_len; + + if (defer) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + continue; + } + + nfs_lock_request(req); + if (nfs_pageio_add_request(&desc, req)) + continue; + + /* Exit on hard errors */ + if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + + /* If the error is soft, defer remaining requests */ + nfs_init_cinfo_from_dreq(&cinfo, dreq); + spin_lock(&cinfo.inode->i_lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&cinfo.inode->i_lock); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + desc.pg_error = 0; + defer = true; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec);