diff mbox series

[v4,04/22] libmultipath: remove pending wait code from libcheck_check calls

Message ID 20241009011523.2381575-5-bmarzins@redhat.com (mailing list archive)
State Not Applicable, archived
Delegated to: Benjamin Marzinski
Headers show
Series path checker refactor and misc fixes | expand

Commit Message

Benjamin Marzinski Oct. 9, 2024, 1:15 a.m. UTC
When the tur and directio checkers start an asynchronous checker, they
now immediately return with the path in PATH_PENDING, instead of
waiting in checker_check(). Instead the wait now happens in
checker_get_state().

Additionally, the directio checker now waits for 1 ms, like the tur
checker does. Also like the tur checker it now only waits once. If it
is still pending after the first call to checker_get_state(). It will
not wait at all on future calls, and will just process the already
completed IOs.

Reviewed-by: Martin Wilck <mwilck@suse.com>
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---
 libmultipath/checkers.c          |  7 +++-
 libmultipath/checkers/directio.c | 57 +++++++++++++++++---------------
 libmultipath/checkers/tur.c      | 13 +++-----
 3 files changed, 41 insertions(+), 36 deletions(-)

Comments

Martin Wilck Oct. 9, 2024, 3 p.m. UTC | #1
On Tue, 2024-10-08 at 21:15 -0400, Benjamin Marzinski wrote:
> When the tur and directio checkers start an asynchronous checker,
> they
> now immediately return with the path in PATH_PENDING, instead of
> waiting in checker_check(). Instead the wait now happens in
> checker_get_state().
> 
> Additionally, the directio checker now waits for 1 ms, like the tur
> checker does. Also like the tur checker it now only waits once. If it
> is still pending after the first call to checker_get_state(). It will
> not wait at all on future calls, and will just process the already
> completed IOs.
> 
> Reviewed-by: Martin Wilck <mwilck@suse.com>
> Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
> ---
>  libmultipath/checkers.c          |  7 +++-
>  libmultipath/checkers/directio.c | 57 +++++++++++++++++-------------
> --
>  libmultipath/checkers/tur.c      | 13 +++-----
>  3 files changed, 41 insertions(+), 36 deletions(-)
> 
> diff --git a/libmultipath/checkers/directio.c
> b/libmultipath/checkers/directio.c
> index 3f88b40d..904e3071 100644
> --- a/libmultipath/checkers/directio.c
> +++ b/libmultipath/checkers/directio.c
> @@ -60,10 +60,11 @@ const char *libcheck_msgtable[] = {
>  #define LOG(prio, fmt, args...) condlog(prio, "directio: " fmt,
> ##args)
>  
>  struct directio_context {
> -	int		running;
> +	unsigned int	running;
>  	int		reset_flags;
>  	struct aio_group *aio_grp;
>  	struct async_req *req;
> +	struct timespec endtime;
>  };
>  
>  static struct aio_group *
> @@ -314,19 +315,16 @@ check_pending(struct directio_context *ct,
> struct timespec endtime)
>  static int
>  check_state(int fd, struct directio_context *ct, int sync, int
> timeout_secs)
>  {
> -	struct timespec	timeout = { .tv_nsec = 1000 };
>  	struct stat	sb;
>  	int		rc;
> +	struct io_event event;
>  	struct timespec endtime;
>  
>  	if (fstat(fd, &sb) == 0) {
>  		LOG(4, "called for %x", (unsigned) sb.st_rdev);
>  	}
> -	if (sync > 0) {
> +	if (sync > 0)
>  		LOG(4, "called in synchronous mode");
> -		timeout.tv_sec  = timeout_secs;
> -		timeout.tv_nsec = 0;
> -	}
>  
>  	if (ct->running) {
>  		if (ct->req->state != PATH_PENDING) {
> @@ -345,31 +343,26 @@ check_state(int fd, struct directio_context
> *ct, int sync, int timeout_secs)
>  			LOG(3, "io_submit error %i", -rc);
>  			return PATH_UNCHECKED;
>  		}
> +		get_monotonic_time(&ct->endtime);
> +		ct->endtime.tv_nsec += 1000 * 1000;
> +		normalize_timespec(&ct->endtime);
>  	}
>  	ct->running++;

Sorry, I have another concern here, despite my reviewed-by. This very
old statement, combined with "if (ct->running > c->timeout)" below,
somehow assumes that "running" counts seconds, which means that this
code must be called once a second. I guess this has been approximately
true in the past, when this code was originally written. It has never
been clean, we should have compared time stamps instead. 

But now, it seems to be plain wrong, "running" will never have a higher
value than 1 as it is only called once from libcheck_check(), and the
(ct->running - c->timeout) statement is broken.

Am I misreading this code?

Martin
Benjamin Marzinski Oct. 10, 2024, 7:45 p.m. UTC | #2
On Wed, Oct 09, 2024 at 05:00:55PM +0200, Martin Wilck wrote:
> On Tue, 2024-10-08 at 21:15 -0400, Benjamin Marzinski wrote:
> > When the tur and directio checkers start an asynchronous checker,
> > they
> > now immediately return with the path in PATH_PENDING, instead of
> > waiting in checker_check(). Instead the wait now happens in
> > checker_get_state().
> > 
> > Additionally, the directio checker now waits for 1 ms, like the tur
> > checker does. Also like the tur checker it now only waits once. If it
> > is still pending after the first call to checker_get_state(). It will
> > not wait at all on future calls, and will just process the already
> > completed IOs.
> > 
> > Reviewed-by: Martin Wilck <mwilck@suse.com>
> > Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
> > ---
> >  libmultipath/checkers.c          |  7 +++-
> >  libmultipath/checkers/directio.c | 57 +++++++++++++++++-------------
> > --
> >  libmultipath/checkers/tur.c      | 13 +++-----
> >  3 files changed, 41 insertions(+), 36 deletions(-)
> > 
> > diff --git a/libmultipath/checkers/directio.c
> > b/libmultipath/checkers/directio.c
> > index 3f88b40d..904e3071 100644
> > --- a/libmultipath/checkers/directio.c
> > +++ b/libmultipath/checkers/directio.c
> > @@ -60,10 +60,11 @@ const char *libcheck_msgtable[] = {
> >  #define LOG(prio, fmt, args...) condlog(prio, "directio: " fmt,
> > ##args)
> >  
> >  struct directio_context {
> > -	int		running;
> > +	unsigned int	running;
> >  	int		reset_flags;
> >  	struct aio_group *aio_grp;
> >  	struct async_req *req;
> > +	struct timespec endtime;
> >  };
> >  
> >  static struct aio_group *
> > @@ -314,19 +315,16 @@ check_pending(struct directio_context *ct,
> > struct timespec endtime)
> >  static int
> >  check_state(int fd, struct directio_context *ct, int sync, int
> > timeout_secs)
> >  {
> > -	struct timespec	timeout = { .tv_nsec = 1000 };
> >  	struct stat	sb;
> >  	int		rc;
> > +	struct io_event event;
> >  	struct timespec endtime;
> >  
> >  	if (fstat(fd, &sb) == 0) {
> >  		LOG(4, "called for %x", (unsigned) sb.st_rdev);
> >  	}
> > -	if (sync > 0) {
> > +	if (sync > 0)
> >  		LOG(4, "called in synchronous mode");
> > -		timeout.tv_sec  = timeout_secs;
> > -		timeout.tv_nsec = 0;
> > -	}
> >  
> >  	if (ct->running) {
> >  		if (ct->req->state != PATH_PENDING) {
> > @@ -345,31 +343,26 @@ check_state(int fd, struct directio_context
> > *ct, int sync, int timeout_secs)
> >  			LOG(3, "io_submit error %i", -rc);
> >  			return PATH_UNCHECKED;
> >  		}
> > +		get_monotonic_time(&ct->endtime);
> > +		ct->endtime.tv_nsec += 1000 * 1000;
> > +		normalize_timespec(&ct->endtime);
> >  	}
> >  	ct->running++;
> 
> Sorry, I have another concern here, despite my reviewed-by. This very
> old statement, combined with "if (ct->running > c->timeout)" below,
> somehow assumes that "running" counts seconds, which means that this
> code must be called once a second. I guess this has been approximately
> true in the past, when this code was originally written. It has never
> been clean, we should have compared time stamps instead. 

Yeah. That makes sense.

> 
> But now, it seems to be plain wrong, "running" will never have a higher
> value than 1 as it is only called once from libcheck_check(), and the
> (ct->running - c->timeout) statement is broken.
> 
> Am I misreading this code?

Either you are or I am. libcheck_check() is called by start_checker()
each time a path comes up for checking. libcheck_check() calls
check_state() with the number of seconds that the checker should wait
till it times out. If ct->running is non-zero, but the path is still
pending, check_state() increments ct->running and returns PATH_PENDING.

libcheck_pending() is called by get_state() every time a path gets
checked and its state is PATH_PENDING. If ct->running is greater than
c->timeout then the checker has timed out (it assumes that ct->running is
counting seconds) and gets cancelled.

Also, I only see (ct->running > c->timeout), not
(ct->running - c->timeout) are you talking about something else in the
code that I'm missing?

At any rate, I can change this to use actual timestamps instead of our
ct->running assumptions.

-Ben

> 
> Martin
Martin Wilck Oct. 11, 2024, 7:40 a.m. UTC | #3
On Thu, 2024-10-10 at 15:45 -0400, Benjamin Marzinski wrote:
> On Wed, Oct 09, 2024 at 05:00:55PM +0200, Martin Wilck wrote:
> > On Tue, 2024-10-08 at 21:15 -0400, Benjamin Marzinski wrote:
> > >  
> > >  	if (ct->running) {
> > >  		if (ct->req->state != PATH_PENDING) {
> > > @@ -345,31 +343,26 @@ check_state(int fd, struct directio_context
> > > *ct, int sync, int timeout_secs)
> > >  			LOG(3, "io_submit error %i", -rc);
> > >  			return PATH_UNCHECKED;
> > >  		}
> > > +		get_monotonic_time(&ct->endtime);
> > > +		ct->endtime.tv_nsec += 1000 * 1000;
> > > +		normalize_timespec(&ct->endtime);
> > >  	}
> > >  	ct->running++;
> > 
> > Sorry, I have another concern here, despite my reviewed-by. This
> > very
> > old statement, combined with "if (ct->running > c->timeout)" below,
> > somehow assumes that "running" counts seconds, which means that
> > this
> > code must be called once a second. I guess this has been
> > approximately
> > true in the past, when this code was originally written. It has
> > never
> > been clean, we should have compared time stamps instead. 
> 
> Yeah. That makes sense.
> 
> > 
> > But now, it seems to be plain wrong, "running" will never have a
> > higher
> > value than 1 as it is only called once from libcheck_check(), and
> > the
> > (ct->running - c->timeout) statement is broken.
> > 
> > Am I misreading this code?
> 
> Either you are or I am. 

I am :-)

> libcheck_check() is called by start_checker()
> each time a path comes up for checking. libcheck_check() calls
> check_state() with the number of seconds that the checker should wait
> till it times out. If ct->running is non-zero, but the path is still
> pending, check_state() increments ct->running and returns
> PATH_PENDING.

My misconception was that we call libcheck_check() once, and then call
libcheck_pending() repeatedly until the checker either finishes or
times out. The isn't true even with your patch set, both functions are
called in every checkerloop, just that libcheck_check() doesn't restart
the checker instance if it's already running (it just increments
running, as you mentioned).

(Why I had this "misconception"? I think it's kind of a vague idea that
I had in my mind how it should work; it's not in your code, and I'm not
asking your to change the code in this way).

> libcheck_pending() is called by get_state() every time a path gets
> checked and its state is PATH_PENDING. If ct->running is greater than
> c->timeout then the checker has timed out (it assumes that ct-
> >running is
> counting seconds) and gets cancelled.

> 
> Also, I only see (ct->running > c->timeout), not
> (ct->running - c->timeout) are you talking about something else in
> the
> code that I'm missing?

No, typo.

> At any rate, I can change this to use actual timestamps instead of
> our
> ct->running assumptions.

Yes, please. "counting seconds" this way is wrong and confusing.

Regards,
Martin

PS: I'll be out of office for 2 weeks, so final review of your series
will probably be delayed again. Sorry about that.
diff mbox series

Patch

diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c
index 298aec78..ce3e48bd 100644
--- a/libmultipath/checkers.c
+++ b/libmultipath/checkers.c
@@ -303,7 +303,12 @@  void checker_put (struct checker * dst)
 
 int checker_get_state(struct checker *c)
 {
-	return c ? c->path_state : PATH_UNCHECKED;
+	if (!c || !c->cls)
+		return PATH_UNCHECKED;
+	if (c->path_state != PATH_PENDING || !c->cls->pending)
+		return c->path_state;
+	c->path_state = c->cls->pending(c);
+	return c->path_state;
 }
 
 void checker_check (struct checker * c, int path_state)
diff --git a/libmultipath/checkers/directio.c b/libmultipath/checkers/directio.c
index 3f88b40d..904e3071 100644
--- a/libmultipath/checkers/directio.c
+++ b/libmultipath/checkers/directio.c
@@ -60,10 +60,11 @@  const char *libcheck_msgtable[] = {
 #define LOG(prio, fmt, args...) condlog(prio, "directio: " fmt, ##args)
 
 struct directio_context {
-	int		running;
+	unsigned int	running;
 	int		reset_flags;
 	struct aio_group *aio_grp;
 	struct async_req *req;
+	struct timespec endtime;
 };
 
 static struct aio_group *
@@ -314,19 +315,16 @@  check_pending(struct directio_context *ct, struct timespec endtime)
 static int
 check_state(int fd, struct directio_context *ct, int sync, int timeout_secs)
 {
-	struct timespec	timeout = { .tv_nsec = 1000 };
 	struct stat	sb;
 	int		rc;
+	struct io_event event;
 	struct timespec endtime;
 
 	if (fstat(fd, &sb) == 0) {
 		LOG(4, "called for %x", (unsigned) sb.st_rdev);
 	}
-	if (sync > 0) {
+	if (sync > 0)
 		LOG(4, "called in synchronous mode");
-		timeout.tv_sec  = timeout_secs;
-		timeout.tv_nsec = 0;
-	}
 
 	if (ct->running) {
 		if (ct->req->state != PATH_PENDING) {
@@ -345,31 +343,26 @@  check_state(int fd, struct directio_context *ct, int sync, int timeout_secs)
 			LOG(3, "io_submit error %i", -rc);
 			return PATH_UNCHECKED;
 		}
+		get_monotonic_time(&ct->endtime);
+		ct->endtime.tv_nsec += 1000 * 1000;
+		normalize_timespec(&ct->endtime);
 	}
 	ct->running++;
+	if (!sync)
+		return PATH_PENDING;
 
 	get_monotonic_time(&endtime);
-	endtime.tv_sec += timeout.tv_sec;
-	endtime.tv_nsec += timeout.tv_nsec;
+	endtime.tv_sec += timeout_secs;
 	normalize_timespec(&endtime);
 
 	check_pending(ct, endtime);
 	if (ct->req->state != PATH_PENDING)
 		return ct->req->state;
 
-	if (ct->running > timeout_secs || sync) {
-		struct io_event event;
-
-		LOG(3, "abort check on timeout");
-
-		io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event);
-		rc = PATH_DOWN;
-	} else {
-		LOG(4, "async io pending");
-		rc = PATH_PENDING;
-	}
+	LOG(3, "abort check on timeout");
 
-	return rc;
+	io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event);
+	return PATH_DOWN;
 }
 
 static void set_msgid(struct checker *c, int state)
@@ -395,21 +388,31 @@  static void set_msgid(struct checker *c, int state)
 
 int libcheck_pending(struct checker *c)
 {
-	struct timespec endtime;
+	int rc;
+	struct io_event event;
 	struct directio_context *ct = (struct directio_context *)c->context;
 
 	/* The if path checker isn't running, just return the exiting value. */
 	if (!ct || !ct->running)
 		return c->path_state;
 
-	if (ct->req->state == PATH_PENDING) {
-		get_monotonic_time(&endtime);
-		check_pending(ct, endtime);
-	} else
+	if (ct->req->state == PATH_PENDING)
+		check_pending(ct, ct->endtime);
+	else
 		ct->running = 0;
-	set_msgid(c, ct->req->state);
+	rc = ct->req->state;
+	if (rc == PATH_PENDING) {
+		if (ct->running > c->timeout) {
+			LOG(3, "abort check on timeout");
+			io_cancel(ct->aio_grp->ioctx, &ct->req->io, &event);
+			rc = PATH_DOWN;
+		}
+		else
+			LOG(4, "async io pending");
+	}
+	set_msgid(c, rc);
 
-	return ct->req->state;
+	return rc;
 }
 
 int libcheck_check (struct checker * c)
diff --git a/libmultipath/checkers/tur.c b/libmultipath/checkers/tur.c
index 95af5214..81db565b 100644
--- a/libmultipath/checkers/tur.c
+++ b/libmultipath/checkers/tur.c
@@ -58,6 +58,7 @@  struct tur_checker_context {
 	int msgid;
 	struct checker_context ctx;
 	unsigned int nr_timeouts;
+	struct timespec endtime;
 };
 
 int libcheck_init (struct checker * c)
@@ -323,7 +324,7 @@  static int tur_check_async_timeout(struct checker *c)
 	return (now.tv_sec > ct->time);
 }
 
-int check_pending(struct checker *c, struct timespec endtime)
+int check_pending(struct checker *c)
 {
 	struct tur_checker_context *ct = c->context;
 	int r, tur_status = PATH_PENDING;
@@ -333,7 +334,7 @@  int check_pending(struct checker *c, struct timespec endtime)
 	for (r = 0;
 	     r == 0 && ct->state == PATH_PENDING &&
 	     ct->msgid == MSG_TUR_RUNNING;
-	     r = pthread_cond_timedwait(&ct->active, &ct->lock, &endtime));
+	     r = pthread_cond_timedwait(&ct->active, &ct->lock, &ct->endtime));
 
 	if (!r) {
 		tur_status = ct->state;
@@ -355,21 +356,18 @@  int check_pending(struct checker *c, struct timespec endtime)
 
 int libcheck_pending(struct checker *c)
 {
-	struct timespec endtime;
 	struct tur_checker_context *ct = c->context;
 
 	/* The if path checker isn't running, just return the exiting value. */
 	if (!ct || !ct->thread)
 		return c->path_state;
 
-	get_monotonic_time(&endtime);
-	return check_pending(c, endtime);
+	return check_pending(c);
 }
 
 int libcheck_check(struct checker * c)
 {
 	struct tur_checker_context *ct = c->context;
-	struct timespec tsp;
 	pthread_attr_t attr;
 	int tur_status, r;
 
@@ -479,8 +477,7 @@  int libcheck_check(struct checker * c)
 				" sync mode", major(ct->devt), minor(ct->devt));
 			return tur_check(c->fd, c->timeout, &c->msgid);
 		}
-		tur_timeout(&tsp);
-		tur_status = check_pending(c, tsp);
+		tur_timeout(&ct->endtime);
 	}
 
 	return tur_status;