diff mbox

jobs: Only clear gotsigchld when waiting for everything

Message ID 20180507090847.rb7xdvf6k4yzc22k@gondor.apana.org.au (mailing list archive)
State Superseded
Delegated to: Herbert Xu
Headers show

Commit Message

Herbert Xu May 7, 2018, 9:08 a.m. UTC
On Mon, May 07, 2018 at 12:40:34AM +0800, Herbert Xu wrote:
> 
> In fact there is another regression with the original patch:
> foreground jobs may trigger a showjob() if interrupted by SIGINT.
> 
> So here is a new version.
> 
> ---8<---
> Because of the nature of SIGCHLD, the process may have already been
> waited on and therefore we must be prepared for the case that wait
> may block.  So ensure that it doesn't by using WNOHANG.
> 
> Furthermore, multiple jobs may have exited when gotsigchld is set.
> Therefore we need to wait until there are no zombies left.

This turns out to still be incomplete.  The problem is that we
always clear gotsigchld but not all callers of dowait will wait
for everything which means that some exited jobs may not be cleaned
up until something else terminates down the track, thus triggering
a new SIGCHLD.

Here is patch to fix that completely and it is based on top of the
existing v3 patch.

---8<---
The gotsigchld flag is always cleared in dowait but not all callers
of dowait will wait for everything.  In particular, when jp is set
we only wait until the set job isn't running anymore.

This patch fixes this by only clearing gotsigchld if jp is unset.
It also changes the waitcmd to actually set jp which corresponds
to the behaviour of bash/ksh93/mksh.

The only other caller of dowait that doesn't wait for everything
is the jobless reaper.  This is in fact redundant now that we wait
after every simple command.  This patch removes it.

Finally as every caller of dowait needs to wait until either the
given job is not running, or until all terminated jobs have been
processed, this patch moves the loop into dowait itself.

Fixes: 03876c0743a5 ("eval: Reap zombies after built-in...")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff mbox

Patch

diff --git a/src/jobs.c b/src/jobs.c
index 606d603..79a087e 100644
--- a/src/jobs.c
+++ b/src/jobs.c
@@ -96,8 +96,6 @@  static int ttyfd = -1;
 
 /* current job */
 static struct job *curjob;
-/* number of presumed living untracked jobs */
-static int jobless;
 
 STATIC void set_curjob(struct job *, unsigned);
 STATIC int jobno(const struct job *);
@@ -556,8 +554,7 @@  showjobs(struct output *out, int mode)
 	TRACE(("showjobs(%x) called\n", mode));
 
 	/* If not even one one job changed, there is nothing to do */
-	while (dowait(DOWAIT_NORMAL, NULL) > 0)
-		continue;
+	dowait(DOWAIT_NORMAL, NULL);
 
 	for (jp = curjob; jp; jp = jp->prev_job) {
 		if (!(mode & SHOW_CHANGED) || jp->changed)
@@ -614,7 +611,7 @@  waitcmd(int argc, char **argv)
 				jp->waited = 1;
 				jp = jp->prev_job;
 			}
-			if (dowait(DOWAIT_WAITCMD, 0) <= 0)
+			if (!dowait(DOWAIT_WAITCMD, 0))
 				goto sigout;
 		}
 	}
@@ -636,9 +633,8 @@  start:
 		} else
 			job = getjob(*argv, 0);
 		/* loop until process terminated or stopped */
-		while (job->state == JOBRUNNING)
-			if (dowait(DOWAIT_WAITCMD, 0) <= 0)
-				goto sigout;
+		if (!dowait(DOWAIT_WAITCMD, job))
+			goto sigout;
 		job->waited = 1;
 		retval = getstatus(job);
 repeat:
@@ -890,18 +886,14 @@  forkchild(struct job *jp, union node *n, int mode)
 	}
 	for (jp = curjob; jp; jp = jp->prev_job)
 		freejob(jp);
-	jobless = 0;
 }
 
 STATIC inline void
 forkparent(struct job *jp, union node *n, int mode, pid_t pid)
 {
 	TRACE(("In parent shell:  child = %d\n", pid));
-	if (!jp) {
-		while (jobless && dowait(DOWAIT_NORMAL, 0) > 0);
-		jobless++;
+	if (!jp)
 		return;
-	}
 #if JOBS
 	if (mode != FORK_NOJOB && jp->jobctl) {
 		int pgrp;
@@ -975,17 +967,10 @@  waitforjob(struct job *jp)
 	int st;
 
 	TRACE(("waitforjob(%%%d) called\n", jp ? jobno(jp) : 0));
-	if (!jp) {
-		int pid = gotsigchld;
-
-		while (pid > 0)
-			pid = dowait(DOWAIT_NORMAL, NULL);
-
+	dowait(jp ? DOWAIT_BLOCK : DOWAIT_NORMAL, jp);
+	if (!jp)
 		return exitstatus;
-	}
 
-	while (jp->state == JOBRUNNING)
-		dowait(DOWAIT_BLOCK, jp);
 	st = getstatus(jp);
 #if JOBS
 	if (jp->jobctl) {
@@ -1013,8 +998,7 @@  waitforjob(struct job *jp)
  * Wait for a process to terminate.
  */
 
-STATIC int
-dowait(int block, struct job *job)
+static int waitone(int block, struct job *job)
 {
 	int pid;
 	int status;
@@ -1057,8 +1041,6 @@  dowait(int block, struct job *job)
 		if (thisjob)
 			goto gotjob;
 	}
-	if (!JOBS || !WIFSTOPPED(status))
-		jobless--;
 	goto out;
 
 gotjob:
@@ -1093,45 +1075,34 @@  out:
 	return pid;
 }
 
+static int dowait(int block, struct job *jp)
+{
+	int pid = block == DOWAIT_NORMAL ? gotsigchld : 1;
+
+	while (jp ? jp->state == JOBRUNNING : pid > 0) {
+		if (!jp)
+			gotsigchld = 0;
+		pid = waitone(block, jp);
+	}
 
+	return pid;
+}
 
 /*
- * Do a wait system call.  If job control is compiled in, we accept
- * stopped processes.  If block is zero, we return a value of zero
- * rather than blocking.
+ * Do a wait system call.  If block is zero, we return -1 rather than
+ * blocking.  If block is DOWAIT_WAITCMD, we return 0 when a signal
+ * other than SIGCHLD interrupted the wait.
  *
- * System V doesn't have a non-blocking wait system call.  It does
- * have a SIGCLD signal that is sent to a process when one of it's
- * children dies.  The obvious way to use SIGCLD would be to install
- * a handler for SIGCLD which simply bumped a counter when a SIGCLD
- * was received, and have waitproc bump another counter when it got
- * the status of a process.  Waitproc would then know that a wait
- * system call would not block if the two counters were different.
- * This approach doesn't work because if a process has children that
- * have not been waited for, System V will send it a SIGCLD when it
- * installs a signal handler for SIGCLD.  What this means is that when
- * a child exits, the shell will be sent SIGCLD signals continuously
- * until is runs out of stack space, unless it does a wait call before
- * restoring the signal handler.  The code below takes advantage of
- * this (mis)feature by installing a signal handler for SIGCLD and
- * then checking to see whether it was called.  If there are any
- * children to be waited for, it will be.
+ * We use sigsuspend in conjunction with a non-blocking wait3 in
+ * order to ensure that waitcmd exits promptly upon the reception
+ * of a signal.
  *
- * If neither SYSV nor BSD is defined, we don't implement nonblocking
- * waits at all.  In this case, the user will not be informed when
- * a background process until the next time she runs a real program
- * (as opposed to running a builtin command or just typing return),
- * and the jobs command may give out of date information.
+ * For code paths other than waitcmd we either use a blocking wait3
+ * or a non-blocking wait3.  For the latter case the caller of dowait
+ * must ensure that it is called over and over again until all dead
+ * children have been reaped.  Otherwise zombies may linger.
  */
 
-#ifdef SYSV
-STATIC int gotsigchild;
-
-STATIC int onsigchild() {
-	gotsigchild = 1;
-}
-#endif
-
 
 STATIC int
 waitproc(int block, int *status)
@@ -1146,13 +1117,10 @@  waitproc(int block, int *status)
 #endif
 
 	do {
-		gotsigchld = 0;
 		err = wait3(status, flags, NULL);
-		if (err || !block)
+		if (err || (err = -!block))
 			break;
 
-		block = 0;
-
 		sigfillset(&mask);
 		sigprocmask(SIG_SETMASK, &mask, &oldmask);
 
@@ -1160,6 +1128,8 @@  waitproc(int block, int *status)
 			sigsuspend(&oldmask);
 
 		sigclearmask();
+
+		err = 0;
 	} while (gotsigchld);
 
 	return err;