Message ID | 20231024164937.14684-3-mwilck@suse.com (mailing list archive) |
---|---|
State | Not Applicable, archived |
Delegated to: | christophe varoqui |
Headers | show |
Series | multipath: aio and systemd service improvements | expand |
On Tue, Oct 24, 2023 at 06:49:33PM +0200, mwilck@suse.com wrote: > From: Martin Wilck <mwilck@suse.com> > > io_cancel() never succeeds, and even if it did, io_getevents() must > still be called to reclaim the IO resources from the kernel. > Don't pretend the opposite by resetting ct->running, or freeing the > memory, before io_getevents() has indicated that the request is finished. > > Signed-off-by: Martin Wilck <mwilck@suse.com> > --- > libmultipath/checkers/directio.c | 13 ++----------- > tests/directio.c | 10 +--------- > 2 files changed, 3 insertions(+), 20 deletions(-) > > diff --git a/libmultipath/checkers/directio.c b/libmultipath/checkers/directio.c > index 25b2383..71ce4cb 100644 > --- a/libmultipath/checkers/directio.c > +++ b/libmultipath/checkers/directio.c > @@ -216,7 +216,6 @@ out: > void libcheck_free (struct checker * c) > { > struct directio_context * ct = (struct directio_context *)c->context; > - struct io_event event; > long flags; > > if (!ct) > @@ -232,9 +231,7 @@ void libcheck_free (struct checker * c) > } > } > > - if (ct->running && > - (ct->req->state != PATH_PENDING || > - io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event) == 0)) Does io_cancel() do nothing? If does make the IO likely to end sooner, even if it always returns failure, then we should probably still run it when we have an outstanding request that we are going to orphan. We should just move the io_cancel() to the else block, where we add the request to the orphans list. Otherwise, this looks good. -Ben > + if (ct->running && ct->req->state != PATH_PENDING) > ct->running = 0; > if (!ct->running) { > free(ct->req->buf); > @@ -351,13 +348,7 @@ check_state(int fd, struct directio_context *ct, int sync, int timeout_secs) > > LOG(3, "abort check on timeout"); > > - r = io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); > - /* > - * Only reset ct->running if we really > - * could abort the pending I/O > - */ > - if (!r) > - ct->running = 0; > + io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); > rc = PATH_DOWN; > } else { > LOG(4, "async io pending"); > diff --git a/tests/directio.c b/tests/directio.c > index db9643e..b340498 100644 > --- a/tests/directio.c > +++ b/tests/directio.c > @@ -501,13 +501,8 @@ static void test_free_with_pending(void **state) > check_aio_grp(aio_grp, 2, 0); > libcheck_free(&c[0]); > check_aio_grp(aio_grp, 1, 0); > - will_return(__wrap_io_cancel, 0); > libcheck_free(&c[1]); > -#ifdef DIO_TEST_DEV > - check_aio_grp(aio_grp, 1, 1); /* real cancel doesn't remove request */ > -#else > - check_aio_grp(aio_grp, 0, 0); > -#endif > + check_aio_grp(aio_grp, 1, 1); /* cancel doesn't remove request */ > do_libcheck_reset(1); > } > > @@ -533,7 +528,6 @@ static void test_orphaned_aio_group(void **state) > assert_int_equal(i, 1); > for (i = 0; i < AIO_GROUP_SIZE; i++) { > assert_true(is_checker_running(&c[i])); > - will_return(__wrap_io_cancel, -1); > if (i == AIO_GROUP_SIZE - 1) { > /* remove the orphaned group and create a new one */ > will_return(__wrap_io_destroy, 0); > @@ -637,7 +631,6 @@ static void test_orphan_checker_cleanup(void **state) > aio_grp = get_aio_grp(c); > return_io_getevents_none(); > do_check_state(&c[0], 0, 30, PATH_PENDING); > - will_return(__wrap_io_cancel, -1); > check_aio_grp(aio_grp, 2, 0); > libcheck_free(&c[0]); > check_aio_grp(aio_grp, 2, 1); > @@ -662,7 +655,6 @@ static void test_orphan_reset_cleanup(void **state) > orphan_aio_grp = get_aio_grp(&c); > return_io_getevents_none(); > do_check_state(&c, 0, 30, PATH_PENDING); > - will_return(__wrap_io_cancel, -1); > check_aio_grp(orphan_aio_grp, 1, 0); > libcheck_free(&c); > check_aio_grp(orphan_aio_grp, 1, 1); > -- > 2.42.0
On 10/26/23 01:14, Benjamin Marzinski wrote: > On Tue, Oct 24, 2023 at 06:49:33PM +0200, mwilck@suse.com wrote: >> From: Martin Wilck <mwilck@suse.com> >> >> io_cancel() never succeeds, and even if it did, io_getevents() must >> still be called to reclaim the IO resources from the kernel. >> Don't pretend the opposite by resetting ct->running, or freeing the >> memory, before io_getevents() has indicated that the request is finished. >> >> Signed-off-by: Martin Wilck <mwilck@suse.com> >> --- >> libmultipath/checkers/directio.c | 13 ++----------- >> tests/directio.c | 10 +--------- >> 2 files changed, 3 insertions(+), 20 deletions(-) >> >> diff --git a/libmultipath/checkers/directio.c b/libmultipath/checkers/directio.c >> index 25b2383..71ce4cb 100644 >> --- a/libmultipath/checkers/directio.c >> +++ b/libmultipath/checkers/directio.c >> @@ -216,7 +216,6 @@ out: >> void libcheck_free (struct checker * c) >> { >> struct directio_context * ct = (struct directio_context *)c->context; >> - struct io_event event; >> long flags; >> >> if (!ct) >> @@ -232,9 +231,7 @@ void libcheck_free (struct checker * c) >> } >> } >> >> - if (ct->running && >> - (ct->req->state != PATH_PENDING || >> - io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event) == 0)) > > Does io_cancel() do nothing? If does make the IO likely to end sooner, > even if it always returns failure, then we should probably still run it > when we have an outstanding request that we are going to orphan. We > should just move the io_cancel() to the else block, where we add the > request to the orphans list. > Well, it doesn't to _nothing_, but what matters is that it doesn't actually affect the running I/O. It just cancels the element in the libaio infrastructure, not the I/O itself. So Martin is perfectly right here. Reviewed-by: Hannes Reinecke <hare@suse.de> Cheers, Hannes
On Wed, 2023-10-25 at 19:14 -0400, Benjamin Marzinski wrote: > On Tue, Oct 24, 2023 at 06:49:33PM +0200, mwilck@suse.com wrote: > > > > --- a/libmultipath/checkers/directio.c > > +++ b/libmultipath/checkers/directio.c > > @@ -216,7 +216,6 @@ out: > > void libcheck_free (struct checker * c) > > { > > struct directio_context * ct = (struct directio_context > > *)c->context; > > - struct io_event event; > > long flags; > > > > if (!ct) > > @@ -232,9 +231,7 @@ void libcheck_free (struct checker * c) > > } > > } > > > > - if (ct->running && > > - (ct->req->state != PATH_PENDING || > > - io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event) > > == 0)) > > Does io_cancel() do nothing? No it doesn't, as Hannes already said. Currently, at least. See https://elixir.bootlin.com/linux/latest/source/fs/aio.c: ki_cancel() is only set for aio_poll() and some USB gadgets. For regular block devices, io_cancel() is equivalent to "return -EINPROGRESS;". > If does make the IO likely to end sooner, > even if it always returns failure, then we should probably still run > it > when we have an outstanding request that we are going to orphan. > We > should just move the io_cancel() to the else block, where we add the > request to the orphans list. But I suppose it would be possible to hook ki_cancel() into the SCSI layer, translating it into a (series of) command aborts. Perhaps someone will implement that in the future; it seem that just nobody has bothered yet. So yes, I will re-add io_cancel() in the else clause in libcheck_free(). It won't do harm, and we'll be prepared for kernel 8.0 ;-) If we want more predictable timeouts for directio and the io_err_stat code _today_, we would need to move away from aio and use SG_IO instead. That would allow us to set SCSI timeouts explicitly, and (mostly) avoid retries on the SCSI mid layer. Regards Martin
diff --git a/libmultipath/checkers/directio.c b/libmultipath/checkers/directio.c index 25b2383..71ce4cb 100644 --- a/libmultipath/checkers/directio.c +++ b/libmultipath/checkers/directio.c @@ -216,7 +216,6 @@ out: void libcheck_free (struct checker * c) { struct directio_context * ct = (struct directio_context *)c->context; - struct io_event event; long flags; if (!ct) @@ -232,9 +231,7 @@ void libcheck_free (struct checker * c) } } - if (ct->running && - (ct->req->state != PATH_PENDING || - io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event) == 0)) + if (ct->running && ct->req->state != PATH_PENDING) ct->running = 0; if (!ct->running) { free(ct->req->buf); @@ -351,13 +348,7 @@ check_state(int fd, struct directio_context *ct, int sync, int timeout_secs) LOG(3, "abort check on timeout"); - r = io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); - /* - * Only reset ct->running if we really - * could abort the pending I/O - */ - if (!r) - ct->running = 0; + io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); rc = PATH_DOWN; } else { LOG(4, "async io pending"); diff --git a/tests/directio.c b/tests/directio.c index db9643e..b340498 100644 --- a/tests/directio.c +++ b/tests/directio.c @@ -501,13 +501,8 @@ static void test_free_with_pending(void **state) check_aio_grp(aio_grp, 2, 0); libcheck_free(&c[0]); check_aio_grp(aio_grp, 1, 0); - will_return(__wrap_io_cancel, 0); libcheck_free(&c[1]); -#ifdef DIO_TEST_DEV - check_aio_grp(aio_grp, 1, 1); /* real cancel doesn't remove request */ -#else - check_aio_grp(aio_grp, 0, 0); -#endif + check_aio_grp(aio_grp, 1, 1); /* cancel doesn't remove request */ do_libcheck_reset(1); } @@ -533,7 +528,6 @@ static void test_orphaned_aio_group(void **state) assert_int_equal(i, 1); for (i = 0; i < AIO_GROUP_SIZE; i++) { assert_true(is_checker_running(&c[i])); - will_return(__wrap_io_cancel, -1); if (i == AIO_GROUP_SIZE - 1) { /* remove the orphaned group and create a new one */ will_return(__wrap_io_destroy, 0); @@ -637,7 +631,6 @@ static void test_orphan_checker_cleanup(void **state) aio_grp = get_aio_grp(c); return_io_getevents_none(); do_check_state(&c[0], 0, 30, PATH_PENDING); - will_return(__wrap_io_cancel, -1); check_aio_grp(aio_grp, 2, 0); libcheck_free(&c[0]); check_aio_grp(aio_grp, 2, 1); @@ -662,7 +655,6 @@ static void test_orphan_reset_cleanup(void **state) orphan_aio_grp = get_aio_grp(&c); return_io_getevents_none(); do_check_state(&c, 0, 30, PATH_PENDING); - will_return(__wrap_io_cancel, -1); check_aio_grp(orphan_aio_grp, 1, 0); libcheck_free(&c); check_aio_grp(orphan_aio_grp, 1, 1);