Message ID | 20240828221757.4060548-5-bmarzins@redhat.com (mailing list archive) |
---|---|
State | Not Applicable, archived |
Delegated to: | Benjamin Marzinski |
Headers | show |
Series | Yet Another path checker refactor | expand |
On Wed, 2024-08-28 at 18:17 -0400, Benjamin Marzinski wrote: > When the tur and directio checkers start an asynchronous checker, > they > now immediately return with the path in PATH_PENDING, instead of > waiting in checker_check(). Instead the wait now happens in > checker_get_state(). > > Additionally, the directio checker now waits for 1 ms, like the tur > checker does. Also like the tur checker it now only waits once. If it > is still pending after the first call to checker_get_state(). It will > not wait at all on future calls, and will just process the already > completed IOs. > > Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> Now I understand what you're doing, but I find it somewhat confusing. I believe this can be simplified after the split between starting the checkers and looking at their result. Why not just fire off the checkers, wait 1ms for _all_ the just started checkers in checkerloop() [*], and then do the pending test like it is now (but just in polling mode, with no c->timeout)? Regards Martin [*] or maybe even a little more, 5ms, say: we'd still be waiting far less time than we used to, but greatly increase the odds that most checkers actually complete. > --- > libmultipath/checkers.c | 7 +++- > libmultipath/checkers/directio.c | 57 +++++++++++++++++------------- > -- > libmultipath/checkers/tur.c | 13 +++----- > 3 files changed, 41 insertions(+), 36 deletions(-) > > diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c > index 298aec78..b44b856a 100644 > --- a/libmultipath/checkers.c > +++ b/libmultipath/checkers.c > @@ -303,7 +303,12 @@ void checker_put (struct checker * dst) > > int checker_get_state(struct checker *c) > { > - return c ? c->path_state : PATH_UNCHECKED; > + if (!c) > + return PATH_UNCHECKED; > + if (c->path_state != PATH_PENDING || !c->cls->pending) > + return c->path_state; > + c->path_state = c->cls->pending(c); > + return c->path_state; > } > > void checker_check (struct checker * c, int path_state) > diff --git a/libmultipath/checkers/directio.c > b/libmultipath/checkers/directio.c > index 3f88b40d..904e3071 100644 > --- a/libmultipath/checkers/directio.c > +++ b/libmultipath/checkers/directio.c > @@ -60,10 +60,11 @@ const char *libcheck_msgtable[] = { > #define LOG(prio, fmt, args...) condlog(prio, "directio: " fmt, > ##args) > > struct directio_context { > - int running; > + unsigned int running; > int reset_flags; > struct aio_group *aio_grp; > struct async_req *req; > + struct timespec endtime; > }; > > static struct aio_group * > @@ -314,19 +315,16 @@ check_pending(struct directio_context *ct, > struct timespec endtime) > static int > check_state(int fd, struct directio_context *ct, int sync, int > timeout_secs) > { > - struct timespec timeout = { .tv_nsec = 1000 }; > struct stat sb; > int rc; > + struct io_event event; > struct timespec endtime; > > if (fstat(fd, &sb) == 0) { > LOG(4, "called for %x", (unsigned) sb.st_rdev); > } > - if (sync > 0) { > + if (sync > 0) > LOG(4, "called in synchronous mode"); > - timeout.tv_sec = timeout_secs; > - timeout.tv_nsec = 0; > - } > > if (ct->running) { > if (ct->req->state != PATH_PENDING) { > @@ -345,31 +343,26 @@ check_state(int fd, struct directio_context > *ct, int sync, int timeout_secs) > LOG(3, "io_submit error %i", -rc); > return PATH_UNCHECKED; > } > + get_monotonic_time(&ct->endtime); > + ct->endtime.tv_nsec += 1000 * 1000; > + normalize_timespec(&ct->endtime); > } > ct->running++; > + if (!sync) > + return PATH_PENDING; > > get_monotonic_time(&endtime); > - endtime.tv_sec += timeout.tv_sec; > - endtime.tv_nsec += timeout.tv_nsec; > + endtime.tv_sec += timeout_secs; > normalize_timespec(&endtime); > > check_pending(ct, endtime); > if (ct->req->state != PATH_PENDING) > return ct->req->state; > > - if (ct->running > timeout_secs || sync) { > - struct io_event event; > - > - LOG(3, "abort check on timeout"); > - > - io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); > - rc = PATH_DOWN; > - } else { > - LOG(4, "async io pending"); > - rc = PATH_PENDING; > - } > + LOG(3, "abort check on timeout"); > > - return rc; > + io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); > + return PATH_DOWN; > } > > static void set_msgid(struct checker *c, int state) > @@ -395,21 +388,31 @@ static void set_msgid(struct checker *c, int > state) > > int libcheck_pending(struct checker *c) > { > - struct timespec endtime; > + int rc; > + struct io_event event; > struct directio_context *ct = (struct directio_context *)c- > >context; > > /* The if path checker isn't running, just return the > exiting value. */ > if (!ct || !ct->running) > return c->path_state; > > - if (ct->req->state == PATH_PENDING) { > - get_monotonic_time(&endtime); > - check_pending(ct, endtime); > - } else > + if (ct->req->state == PATH_PENDING) > + check_pending(ct, ct->endtime); > + else > ct->running = 0; > - set_msgid(c, ct->req->state); > + rc = ct->req->state; > + if (rc == PATH_PENDING) { > + if (ct->running > c->timeout) { > + LOG(3, "abort check on timeout"); > + io_cancel(ct->aio_grp->ioctx, &ct->req->io, > &event); > + rc = PATH_DOWN; > + } > + else > + LOG(4, "async io pending"); > + } > + set_msgid(c, rc); > > - return ct->req->state; > + return rc; > } > > int libcheck_check (struct checker * c) > diff --git a/libmultipath/checkers/tur.c > b/libmultipath/checkers/tur.c > index 95af5214..81db565b 100644 > --- a/libmultipath/checkers/tur.c > +++ b/libmultipath/checkers/tur.c > @@ -58,6 +58,7 @@ struct tur_checker_context { > int msgid; > struct checker_context ctx; > unsigned int nr_timeouts; > + struct timespec endtime; > }; > > int libcheck_init (struct checker * c) > @@ -323,7 +324,7 @@ static int tur_check_async_timeout(struct checker > *c) > return (now.tv_sec > ct->time); > } > > -int check_pending(struct checker *c, struct timespec endtime) > +int check_pending(struct checker *c) > { > struct tur_checker_context *ct = c->context; > int r, tur_status = PATH_PENDING; > @@ -333,7 +334,7 @@ int check_pending(struct checker *c, struct > timespec endtime) > for (r = 0; > r == 0 && ct->state == PATH_PENDING && > ct->msgid == MSG_TUR_RUNNING; > - r = pthread_cond_timedwait(&ct->active, &ct->lock, > &endtime)); > + r = pthread_cond_timedwait(&ct->active, &ct->lock, &ct- > >endtime)); > > if (!r) { > tur_status = ct->state; > @@ -355,21 +356,18 @@ int check_pending(struct checker *c, struct > timespec endtime) > > int libcheck_pending(struct checker *c) > { > - struct timespec endtime; > struct tur_checker_context *ct = c->context; > > /* The if path checker isn't running, just return the > exiting value. */ > if (!ct || !ct->thread) > return c->path_state; > > - get_monotonic_time(&endtime); > - return check_pending(c, endtime); > + return check_pending(c); > } > > int libcheck_check(struct checker * c) > { > struct tur_checker_context *ct = c->context; > - struct timespec tsp; > pthread_attr_t attr; > int tur_status, r; > > @@ -479,8 +477,7 @@ int libcheck_check(struct checker * c) > " sync mode", major(ct->devt), > minor(ct->devt)); > return tur_check(c->fd, c->timeout, &c- > >msgid); > } > - tur_timeout(&tsp); > - tur_status = check_pending(c, tsp); > + tur_timeout(&ct->endtime); > } > > return tur_status;
On Wed, Sep 04, 2024 at 10:05:30PM +0200, Martin Wilck wrote: > On Wed, 2024-08-28 at 18:17 -0400, Benjamin Marzinski wrote: > > When the tur and directio checkers start an asynchronous checker, > > they > > now immediately return with the path in PATH_PENDING, instead of > > waiting in checker_check(). Instead the wait now happens in > > checker_get_state(). > > > > Additionally, the directio checker now waits for 1 ms, like the tur > > checker does. Also like the tur checker it now only waits once. If it > > is still pending after the first call to checker_get_state(). It will > > not wait at all on future calls, and will just process the already > > completed IOs. > > > > Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> > > Now I understand what you're doing, but I find it somewhat confusing. I > believe this can be simplified after the split between starting the > checkers and looking at their result. > > Why not just fire off the checkers, wait 1ms for _all_ the just started > checkers in checkerloop() [*], and then do the pending test like it is > now (but just in polling mode, with no c->timeout)? That's probably possible. You would also need to put that wait in pathinfo(). The big reason I did it this way was so nothing outside of the checker itself needed to care about things like if the checker was set to async and whether or not the checker was actually async capable and if this was a newly started async check or if the checker had already been pending for a cycle, where waiting another 1ms was pointless. This way, you just call checker_get_state() on all the paths were you ran the path checker, and if at least one of them needs to wait, then the first one that does will wait, and the rest won't. If none of them need to wait, then none of them will. -Ben > > Regards > Martin > > [*] or maybe even a little more, 5ms, say: we'd still be waiting far > less time than we used to, but greatly increase the odds that most > checkers actually complete. > > > > --- > > libmultipath/checkers.c | 7 +++- > > libmultipath/checkers/directio.c | 57 +++++++++++++++++------------- > > -- > > libmultipath/checkers/tur.c | 13 +++----- > > 3 files changed, 41 insertions(+), 36 deletions(-) > > > > diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c > > index 298aec78..b44b856a 100644 > > --- a/libmultipath/checkers.c > > +++ b/libmultipath/checkers.c > > @@ -303,7 +303,12 @@ void checker_put (struct checker * dst) > > > > int checker_get_state(struct checker *c) > > { > > - return c ? c->path_state : PATH_UNCHECKED; > > + if (!c) > > + return PATH_UNCHECKED; > > + if (c->path_state != PATH_PENDING || !c->cls->pending) > > + return c->path_state; > > + c->path_state = c->cls->pending(c); > > + return c->path_state; > > } > > > > void checker_check (struct checker * c, int path_state) > > diff --git a/libmultipath/checkers/directio.c > > b/libmultipath/checkers/directio.c > > index 3f88b40d..904e3071 100644 > > --- a/libmultipath/checkers/directio.c > > +++ b/libmultipath/checkers/directio.c > > @@ -60,10 +60,11 @@ const char *libcheck_msgtable[] = { > > #define LOG(prio, fmt, args...) condlog(prio, "directio: " fmt, > > ##args) > > > > struct directio_context { > > - int running; > > + unsigned int running; > > int reset_flags; > > struct aio_group *aio_grp; > > struct async_req *req; > > + struct timespec endtime; > > }; > > > > static struct aio_group * > > @@ -314,19 +315,16 @@ check_pending(struct directio_context *ct, > > struct timespec endtime) > > static int > > check_state(int fd, struct directio_context *ct, int sync, int > > timeout_secs) > > { > > - struct timespec timeout = { .tv_nsec = 1000 }; > > struct stat sb; > > int rc; > > + struct io_event event; > > struct timespec endtime; > > > > if (fstat(fd, &sb) == 0) { > > LOG(4, "called for %x", (unsigned) sb.st_rdev); > > } > > - if (sync > 0) { > > + if (sync > 0) > > LOG(4, "called in synchronous mode"); > > - timeout.tv_sec = timeout_secs; > > - timeout.tv_nsec = 0; > > - } > > > > if (ct->running) { > > if (ct->req->state != PATH_PENDING) { > > @@ -345,31 +343,26 @@ check_state(int fd, struct directio_context > > *ct, int sync, int timeout_secs) > > LOG(3, "io_submit error %i", -rc); > > return PATH_UNCHECKED; > > } > > + get_monotonic_time(&ct->endtime); > > + ct->endtime.tv_nsec += 1000 * 1000; > > + normalize_timespec(&ct->endtime); > > } > > ct->running++; > > + if (!sync) > > + return PATH_PENDING; > > > > get_monotonic_time(&endtime); > > - endtime.tv_sec += timeout.tv_sec; > > - endtime.tv_nsec += timeout.tv_nsec; > > + endtime.tv_sec += timeout_secs; > > normalize_timespec(&endtime); > > > > check_pending(ct, endtime); > > if (ct->req->state != PATH_PENDING) > > return ct->req->state; > > > > - if (ct->running > timeout_secs || sync) { > > - struct io_event event; > > - > > - LOG(3, "abort check on timeout"); > > - > > - io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); > > - rc = PATH_DOWN; > > - } else { > > - LOG(4, "async io pending"); > > - rc = PATH_PENDING; > > - } > > + LOG(3, "abort check on timeout"); > > > > - return rc; > > + io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); > > + return PATH_DOWN; > > } > > > > static void set_msgid(struct checker *c, int state) > > @@ -395,21 +388,31 @@ static void set_msgid(struct checker *c, int > > state) > > > > int libcheck_pending(struct checker *c) > > { > > - struct timespec endtime; > > + int rc; > > + struct io_event event; > > struct directio_context *ct = (struct directio_context *)c- > > >context; > > > > /* The if path checker isn't running, just return the > > exiting value. */ > > if (!ct || !ct->running) > > return c->path_state; > > > > - if (ct->req->state == PATH_PENDING) { > > - get_monotonic_time(&endtime); > > - check_pending(ct, endtime); > > - } else > > + if (ct->req->state == PATH_PENDING) > > + check_pending(ct, ct->endtime); > > + else > > ct->running = 0; > > - set_msgid(c, ct->req->state); > > + rc = ct->req->state; > > + if (rc == PATH_PENDING) { > > + if (ct->running > c->timeout) { > > + LOG(3, "abort check on timeout"); > > + io_cancel(ct->aio_grp->ioctx, &ct->req->io, > > &event); > > + rc = PATH_DOWN; > > + } > > + else > > + LOG(4, "async io pending"); > > + } > > + set_msgid(c, rc); > > > > - return ct->req->state; > > + return rc; > > } > > > > int libcheck_check (struct checker * c) > > diff --git a/libmultipath/checkers/tur.c > > b/libmultipath/checkers/tur.c > > index 95af5214..81db565b 100644 > > --- a/libmultipath/checkers/tur.c > > +++ b/libmultipath/checkers/tur.c > > @@ -58,6 +58,7 @@ struct tur_checker_context { > > int msgid; > > struct checker_context ctx; > > unsigned int nr_timeouts; > > + struct timespec endtime; > > }; > > > > int libcheck_init (struct checker * c) > > @@ -323,7 +324,7 @@ static int tur_check_async_timeout(struct checker > > *c) > > return (now.tv_sec > ct->time); > > } > > > > -int check_pending(struct checker *c, struct timespec endtime) > > +int check_pending(struct checker *c) > > { > > struct tur_checker_context *ct = c->context; > > int r, tur_status = PATH_PENDING; > > @@ -333,7 +334,7 @@ int check_pending(struct checker *c, struct > > timespec endtime) > > for (r = 0; > > r == 0 && ct->state == PATH_PENDING && > > ct->msgid == MSG_TUR_RUNNING; > > - r = pthread_cond_timedwait(&ct->active, &ct->lock, > > &endtime)); > > + r = pthread_cond_timedwait(&ct->active, &ct->lock, &ct- > > >endtime)); > > > > if (!r) { > > tur_status = ct->state; > > @@ -355,21 +356,18 @@ int check_pending(struct checker *c, struct > > timespec endtime) > > > > int libcheck_pending(struct checker *c) > > { > > - struct timespec endtime; > > struct tur_checker_context *ct = c->context; > > > > /* The if path checker isn't running, just return the > > exiting value. */ > > if (!ct || !ct->thread) > > return c->path_state; > > > > - get_monotonic_time(&endtime); > > - return check_pending(c, endtime); > > + return check_pending(c); > > } > > > > int libcheck_check(struct checker * c) > > { > > struct tur_checker_context *ct = c->context; > > - struct timespec tsp; > > pthread_attr_t attr; > > int tur_status, r; > > > > @@ -479,8 +477,7 @@ int libcheck_check(struct checker * c) > > " sync mode", major(ct->devt), > > minor(ct->devt)); > > return tur_check(c->fd, c->timeout, &c- > > >msgid); > > } > > - tur_timeout(&tsp); > > - tur_status = check_pending(c, tsp); > > + tur_timeout(&ct->endtime); > > } > > > > return tur_status;
On Wed, 2024-09-04 at 17:17 -0400, Benjamin Marzinski wrote: > On Wed, Sep 04, 2024 at 10:05:30PM +0200, Martin Wilck wrote: > > On Wed, 2024-08-28 at 18:17 -0400, Benjamin Marzinski wrote: > > > When the tur and directio checkers start an asynchronous checker, > > > they > > > now immediately return with the path in PATH_PENDING, instead of > > > waiting in checker_check(). Instead the wait now happens in > > > checker_get_state(). > > > > > > Additionally, the directio checker now waits for 1 ms, like the > > > tur > > > checker does. Also like the tur checker it now only waits once. > > > If it > > > is still pending after the first call to checker_get_state(). It > > > will > > > not wait at all on future calls, and will just process the > > > already > > > completed IOs. > > > > > > Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> > > > > Now I understand what you're doing, but I find it somewhat > > confusing. I > > believe this can be simplified after the split between starting the > > checkers and looking at their result. > > > > Why not just fire off the checkers, wait 1ms for _all_ the just > > started > > checkers in checkerloop() [*], and then do the pending test like it > > is > > now (but just in polling mode, with no c->timeout)? > > That's probably possible. You would also need to put that wait in > pathinfo(). pathinfo() is tricky because it's written with the assumption that the result is obtained synchronously. Maybe in pathinfo we should just fire up the checker and set the status to PENDING. But we might as well not run the checker at all, just set the checker interval to minimum, and let the checkers do their work. (I like that idea, actually). Another idea would be to define an exception for pathinfo(), and actually wait for completion there, not until c->timeout expires, but somewhat longer than we currently do. > The big reason I did it this way was so nothing outside of the > checker > itself needed to care about things like if the checker was set to > async > and whether or not the checker was actually async capable and if this > was a newly started async check or if the checker had already been > pending for a cycle, where waiting another 1ms was pointless. Right, the sync property should be hidden. If sync checkers are in use, none of the recent changes will have much of a positive effect. I'd like to convert all checkers to async mode. Just need some time to do it :-/ AFAICS, the only sync checker that still matters is rdac. Below is some brainstorming from my side. Ignore it if it makes no sense. We currently use the path state, in particular PATH_PENDING, to infer the state of the checker (thread, aio request), but that doesn't give us the full picture. We should be able to distinguish the path state and the state of the checker itself, which can be IDLE, RUNNING, or DONE. The checker provides an API checker_state() (I can't think of a better name) to retrieve its state. The path state can (only) be retrieved in DONE checker state. When the path state is retrieved, the checker state switches to IDLE. checker_start() switches from IDLE to RUNNING. checker_wait(timeout) waits until the RUNNING checker reaches DONE, or the passed timeout is reached. A timed-out checker (wrt c->timeout) has state DONE with path state PATH_TIMEOUT. Sync checkers would have a degenerated API where checker_start() and checker_state() would be noops. > This way, you just call checker_get_state() on all the paths were you > ran the path checker, and if at least one of them needs to wait, then > the first one that does will wait, and the rest won't. If none of > them > need to wait, then none of them will. This makes sense. But the "wait" you're talking about is the 1ms delay, after which the paths may well still be in PATH_PENDING. It's actually not much different from not waiting at all, we've just decreased the likelihood of PATH_PENDIN. 1ms is just an estimate, and, in my experience, too short in many environments. The real benefit of your patch set is that we now start all (async) checkers in parallel. Which means that we can wait longer, increasing the likelihood that the checkers have actually finished, while spending less total time waiting. "Waiting longer" is a bit hand-waving; IMO it should be handled further up in the stack rather than down in the checker code. We could achieve the behavior you describe in checkerloop, like this: - record start_timestamp - fire off all handlers - calculate end_time as start_timestamp + some_delay - use that end_time for waiting for results of all fired-off checkers Like in your example, we'd really wait for no more than 1 path. We could do this for the entire set of paths, or for each multipath map separately. Regards Martin
On Thu, Sep 05, 2024 at 09:53:52AM +0200, Martin Wilck wrote: > On Wed, 2024-09-04 at 17:17 -0400, Benjamin Marzinski wrote: > > On Wed, Sep 04, 2024 at 10:05:30PM +0200, Martin Wilck wrote: > > > On Wed, 2024-08-28 at 18:17 -0400, Benjamin Marzinski wrote: <snip> > > This way, you just call checker_get_state() on all the paths were you > > ran the path checker, and if at least one of them needs to wait, then > > the first one that does will wait, and the rest won't. If none of > > them > > need to wait, then none of them will. > > This makes sense. But the "wait" you're talking about is the 1ms delay, > after which the paths may well still be in PATH_PENDING. It's actually > not much different from not waiting at all, we've just decreased the > likelihood of PATH_PENDIN. 1ms is just an estimate, and, in my > experience, too short in many environments. > > The real benefit of your patch set is that we now start all (async) > checkers in parallel. Which means that we can wait longer, increasing > the likelihood that the checkers have actually finished, while spending > less total time waiting. "Waiting longer" is a bit hand-waving; IMO it > should be handled further up in the stack rather than down in the > checker code. > > We could achieve the behavior you describe in checkerloop, like this: > > - record start_timestamp > - fire off all handlers > - calculate end_time as start_timestamp + some_delay > - use that end_time for waiting for results of all fired-off checkers > > Like in your example, we'd really wait for no more than 1 path. > We could do this for the entire set of paths, or for each multipath map > separately. Actually, now that I think about it, the biggest benefit of doing the wait in checkerloop itself is that the code already handles the case where we drop the vecs lock while there are paths that have started the checker, but not yet updated the device based on the results. Given that, we can just do the wait for pending paths without holding the vecs lock. It's sort of embarassing that I did all this work to cut down on the time we're waiting but didn't think about the fact that with just a bit more work, we could avoid sleeping while holding the lock altogether. I already did some testing before to make sure that things like reconfigures, adding/removing paths and manually failing/restoring paths didn't mess things up if they happened in that window where we drop the lock in the middle of checking the paths, but I'll probably need to do some more tests and recheck that code if it's going to be non-rare occurance for other threads to run in the middle of checking paths. But this seems like too big of a benefit to avoid doing. -Ben > Regards > Martin
diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c index 298aec78..b44b856a 100644 --- a/libmultipath/checkers.c +++ b/libmultipath/checkers.c @@ -303,7 +303,12 @@ void checker_put (struct checker * dst) int checker_get_state(struct checker *c) { - return c ? c->path_state : PATH_UNCHECKED; + if (!c) + return PATH_UNCHECKED; + if (c->path_state != PATH_PENDING || !c->cls->pending) + return c->path_state; + c->path_state = c->cls->pending(c); + return c->path_state; } void checker_check (struct checker * c, int path_state) diff --git a/libmultipath/checkers/directio.c b/libmultipath/checkers/directio.c index 3f88b40d..904e3071 100644 --- a/libmultipath/checkers/directio.c +++ b/libmultipath/checkers/directio.c @@ -60,10 +60,11 @@ const char *libcheck_msgtable[] = { #define LOG(prio, fmt, args...) condlog(prio, "directio: " fmt, ##args) struct directio_context { - int running; + unsigned int running; int reset_flags; struct aio_group *aio_grp; struct async_req *req; + struct timespec endtime; }; static struct aio_group * @@ -314,19 +315,16 @@ check_pending(struct directio_context *ct, struct timespec endtime) static int check_state(int fd, struct directio_context *ct, int sync, int timeout_secs) { - struct timespec timeout = { .tv_nsec = 1000 }; struct stat sb; int rc; + struct io_event event; struct timespec endtime; if (fstat(fd, &sb) == 0) { LOG(4, "called for %x", (unsigned) sb.st_rdev); } - if (sync > 0) { + if (sync > 0) LOG(4, "called in synchronous mode"); - timeout.tv_sec = timeout_secs; - timeout.tv_nsec = 0; - } if (ct->running) { if (ct->req->state != PATH_PENDING) { @@ -345,31 +343,26 @@ check_state(int fd, struct directio_context *ct, int sync, int timeout_secs) LOG(3, "io_submit error %i", -rc); return PATH_UNCHECKED; } + get_monotonic_time(&ct->endtime); + ct->endtime.tv_nsec += 1000 * 1000; + normalize_timespec(&ct->endtime); } ct->running++; + if (!sync) + return PATH_PENDING; get_monotonic_time(&endtime); - endtime.tv_sec += timeout.tv_sec; - endtime.tv_nsec += timeout.tv_nsec; + endtime.tv_sec += timeout_secs; normalize_timespec(&endtime); check_pending(ct, endtime); if (ct->req->state != PATH_PENDING) return ct->req->state; - if (ct->running > timeout_secs || sync) { - struct io_event event; - - LOG(3, "abort check on timeout"); - - io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); - rc = PATH_DOWN; - } else { - LOG(4, "async io pending"); - rc = PATH_PENDING; - } + LOG(3, "abort check on timeout"); - return rc; + io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); + return PATH_DOWN; } static void set_msgid(struct checker *c, int state) @@ -395,21 +388,31 @@ static void set_msgid(struct checker *c, int state) int libcheck_pending(struct checker *c) { - struct timespec endtime; + int rc; + struct io_event event; struct directio_context *ct = (struct directio_context *)c->context; /* The if path checker isn't running, just return the exiting value. */ if (!ct || !ct->running) return c->path_state; - if (ct->req->state == PATH_PENDING) { - get_monotonic_time(&endtime); - check_pending(ct, endtime); - } else + if (ct->req->state == PATH_PENDING) + check_pending(ct, ct->endtime); + else ct->running = 0; - set_msgid(c, ct->req->state); + rc = ct->req->state; + if (rc == PATH_PENDING) { + if (ct->running > c->timeout) { + LOG(3, "abort check on timeout"); + io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event); + rc = PATH_DOWN; + } + else + LOG(4, "async io pending"); + } + set_msgid(c, rc); - return ct->req->state; + return rc; } int libcheck_check (struct checker * c) diff --git a/libmultipath/checkers/tur.c b/libmultipath/checkers/tur.c index 95af5214..81db565b 100644 --- a/libmultipath/checkers/tur.c +++ b/libmultipath/checkers/tur.c @@ -58,6 +58,7 @@ struct tur_checker_context { int msgid; struct checker_context ctx; unsigned int nr_timeouts; + struct timespec endtime; }; int libcheck_init (struct checker * c) @@ -323,7 +324,7 @@ static int tur_check_async_timeout(struct checker *c) return (now.tv_sec > ct->time); } -int check_pending(struct checker *c, struct timespec endtime) +int check_pending(struct checker *c) { struct tur_checker_context *ct = c->context; int r, tur_status = PATH_PENDING; @@ -333,7 +334,7 @@ int check_pending(struct checker *c, struct timespec endtime) for (r = 0; r == 0 && ct->state == PATH_PENDING && ct->msgid == MSG_TUR_RUNNING; - r = pthread_cond_timedwait(&ct->active, &ct->lock, &endtime)); + r = pthread_cond_timedwait(&ct->active, &ct->lock, &ct->endtime)); if (!r) { tur_status = ct->state; @@ -355,21 +356,18 @@ int check_pending(struct checker *c, struct timespec endtime) int libcheck_pending(struct checker *c) { - struct timespec endtime; struct tur_checker_context *ct = c->context; /* The if path checker isn't running, just return the exiting value. */ if (!ct || !ct->thread) return c->path_state; - get_monotonic_time(&endtime); - return check_pending(c, endtime); + return check_pending(c); } int libcheck_check(struct checker * c) { struct tur_checker_context *ct = c->context; - struct timespec tsp; pthread_attr_t attr; int tur_status, r; @@ -479,8 +477,7 @@ int libcheck_check(struct checker * c) " sync mode", major(ct->devt), minor(ct->devt)); return tur_check(c->fd, c->timeout, &c->msgid); } - tur_timeout(&tsp); - tur_status = check_pending(c, tsp); + tur_timeout(&ct->endtime); } return tur_status;
When the tur and directio checkers start an asynchronous checker, they now immediately return with the path in PATH_PENDING, instead of waiting in checker_check(). Instead the wait now happens in checker_get_state(). Additionally, the directio checker now waits for 1 ms, like the tur checker does. Also like the tur checker it now only waits once. If it is still pending after the first call to checker_get_state(). It will not wait at all on future calls, and will just process the already completed IOs. Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> --- libmultipath/checkers.c | 7 +++- libmultipath/checkers/directio.c | 57 +++++++++++++++++--------------- libmultipath/checkers/tur.c | 13 +++----- 3 files changed, 41 insertions(+), 36 deletions(-)