diff mbox series

[v2,7/7] t/helper/simple-ipc: convert test-simple-ipc to use start_bg_command

Message ID 6b7a058284b93fab52458b12a6aede5e8aed6652.1632152179.git.gitgitgadget@gmail.com (mailing list archive)
State Accepted
Commit 05881a6fc9020b5c227a98490bf256d129da01f6
Headers show
Series Builtin FSMonitor Part 1 | expand

Commit Message

Jeff Hostetler Sept. 20, 2021, 3:36 p.m. UTC
From: Jeff Hostetler <jeffhost@microsoft.com>

Convert test helper to use `start_bg_command()` when spawning a server
daemon in the background rather than blocks of platform-specific code.

Also, while here, remove _() translation around error messages since
this is a test helper and not Git code.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 t/helper/test-simple-ipc.c | 199 ++++++++-----------------------------
 1 file changed, 43 insertions(+), 156 deletions(-)

Comments

Ævar Arnfjörð Bjarmason Sept. 23, 2021, 3:03 p.m. UTC | #1
On Mon, Sep 20 2021, Jeff Hostetler via GitGitGadget wrote:

> +	switch (sbgr) {
> +	case SBGR_READY:
> +		return 0;
>  
> -		else if (pid_seen == pid_child) {
> -			/*
> -			 * The new child daemon process shutdown while
> -			 * it was starting up, so it is not listening
> -			 * on the socket.
> -			 *
> -			 * Try to ping the socket in the odd chance
> -			 * that another daemon started (or was already
> -			 * running) while our child was starting.
> -			 *
> -			 * Again, we don't care who services the socket.
> -			 */
> -			s = ipc_get_active_state(cl_args.path);
> -			if (s == IPC_STATE__LISTENING)
> -				return 0;
> +	default:
> +	case SBGR_ERROR:
> +	case SBGR_CB_ERROR:
> +		return error("daemon failed to start");

There was a discussion on v1 about the "default" being redundant here
and hiding future compiler checks, this is another "not sure what you
thought of that" case (per [1]).

Interestingly in this case if I drop the "default" my local gcc
uncharacteristically complains about a missing "return" in this
function, but clang doesn't. I needed to add a BUG() to shut up the
former. Maybe I'm wrong, but perhaps it's a sign of some deeper
trouble. This is with gcc/clang 10.2.1-6/11.0.1-2.

1. https://lore.kernel.org/git/87v92r49mt.fsf@evledraar.gmail.com/

I played with the diff below on top of this, I can't remember if it was
noted already, but the way you declare function ptrs and use them isn't
the usual style:

-- >8 --
diff --git a/run-command.c b/run-command.c
index 76bbef9d96d..5c831545201 100644
--- a/run-command.c
+++ b/run-command.c
@@ -1903,7 +1903,7 @@ void prepare_other_repo_env(struct strvec *env_array, const char *new_git_dir)
 }
 
 enum start_bg_result start_bg_command(struct child_process *cmd,
-				      start_bg_wait_cb *wait_cb,
+				      start_bg_wait_cb wait_cb,
 				      void *cb_data,
 				      unsigned int timeout_sec)
 {
diff --git a/run-command.h b/run-command.h
index 17b1b80c3d7..e8a637d1967 100644
--- a/run-command.h
+++ b/run-command.h
@@ -527,7 +527,7 @@ enum start_bg_result {
  * Returns 0 if child is "ready".
  * Returns -1 on any error and cause start_bg_command() to also error out.
  */
-typedef int(start_bg_wait_cb)(const struct child_process *cmd, void *cb_data);
+typedef int (*start_bg_wait_cb)(const struct child_process *cmd, void *cb_data);
 
 /**
  * Start a command in the background.  Wait long enough for the child
@@ -549,7 +549,7 @@ typedef int(start_bg_wait_cb)(const struct child_process *cmd, void *cb_data);
  * any instance data that it might require.  This may be NULL.
  */
 enum start_bg_result start_bg_command(struct child_process *cmd,
-				      start_bg_wait_cb *wait_cb,
+				      start_bg_wait_cb wait_cb,
 				      void *cb_data,
 				      unsigned int timeout_sec);
 
diff --git a/t/helper/test-simple-ipc.c b/t/helper/test-simple-ipc.c
index 28365ff85b6..82dc2906a2a 100644
--- a/t/helper/test-simple-ipc.c
+++ b/t/helper/test-simple-ipc.c
@@ -275,9 +275,7 @@ static int daemon__run_server(void)
 	return ret;
 }
 
-static start_bg_wait_cb bg_wait_cb;
-
-static int bg_wait_cb(const struct child_process *cp, void *cb_data)
+static int bg_wait_cb(const struct child_process *cp, void *cb_data, int foo)
 {
 	int s = ipc_get_active_state(cl_args.path);
 
@@ -319,9 +317,8 @@ static int daemon__start_server(void)
 	switch (sbgr) {
 	case SBGR_READY:
 		return 0;
-
-	default:
 	case SBGR_ERROR:
+		return 0;
 	case SBGR_CB_ERROR:
 		return error("daemon failed to start");
 
@@ -331,6 +328,7 @@ static int daemon__start_server(void)
 	case SBGR_DIED:
 		return error("daemon terminated");
 	}
+	BUG("unreachable");
 }
 
 /*
Jeff Hostetler Sept. 23, 2021, 5:58 p.m. UTC | #2
On 9/23/21 11:03 AM, Ævar Arnfjörð Bjarmason wrote:
> 
> On Mon, Sep 20 2021, Jeff Hostetler via GitGitGadget wrote:
> 
>> +	switch (sbgr) {
>> +	case SBGR_READY:
>> +		return 0;
>>   
>> -		else if (pid_seen == pid_child) {
>> -			/*
>> -			 * The new child daemon process shutdown while
>> -			 * it was starting up, so it is not listening
>> -			 * on the socket.
>> -			 *
>> -			 * Try to ping the socket in the odd chance
>> -			 * that another daemon started (or was already
>> -			 * running) while our child was starting.
>> -			 *
>> -			 * Again, we don't care who services the socket.
>> -			 */
>> -			s = ipc_get_active_state(cl_args.path);
>> -			if (s == IPC_STATE__LISTENING)
>> -				return 0;
>> +	default:
>> +	case SBGR_ERROR:
>> +	case SBGR_CB_ERROR:
>> +		return error("daemon failed to start");
> 
> There was a discussion on v1 about the "default" being redundant here
> and hiding future compiler checks, this is another "not sure what you
> thought of that" case (per [1]).
> 
> Interestingly in this case if I drop the "default" my local gcc
> uncharacteristically complains about a missing "return" in this
> function, but clang doesn't. I needed to add a BUG() to shut up the
> former. Maybe I'm wrong, but perhaps it's a sign of some deeper
> trouble. This is with gcc/clang 10.2.1-6/11.0.1-2.

The issue of whether C needs a "default" case in switch statements
on an enum is one I didn't have bandwidth to think about (and is
completely independent of my series).

I was thinking that as a later task, someone could investigate which
compilers do and do not generate errors for missing enum values in
the switch.  Perhaps that leads to a macro in config.mak.uname on
a system-by-system basis that "does the right thing".

Then one could have something like:

     switch (e) {
     DEFAULT_HANDLER;
     case e1: ...
     case e2: ...
     }


> 
> 1. https://lore.kernel.org/git/87v92r49mt.fsf@evledraar.gmail.com/
> 
> I played with the diff below on top of this, I can't remember if it was
> noted already, but the way you declare function ptrs and use them isn't
> the usual style:
> 
> -- >8 --
> diff --git a/run-command.c b/run-command.c
> index 76bbef9d96d..5c831545201 100644
> --- a/run-command.c
> +++ b/run-command.c
> @@ -1903,7 +1903,7 @@ void prepare_other_repo_env(struct strvec *env_array, const char *new_git_dir)
>   }
>   
>   enum start_bg_result start_bg_command(struct child_process *cmd,
> -				      start_bg_wait_cb *wait_cb,
> +				      start_bg_wait_cb wait_cb,
>   				      void *cb_data,
>   				      unsigned int timeout_sec)
>   {
> diff --git a/run-command.h b/run-command.h
> index 17b1b80c3d7..e8a637d1967 100644
> --- a/run-command.h
> +++ b/run-command.h
> @@ -527,7 +527,7 @@ enum start_bg_result {
>    * Returns 0 if child is "ready".
>    * Returns -1 on any error and cause start_bg_command() to also error out.
>    */
> -typedef int(start_bg_wait_cb)(const struct child_process *cmd, void *cb_data);
> +typedef int (*start_bg_wait_cb)(const struct child_process *cmd, void *cb_data);

By defining the typedef WITHOUT the star, we get a function type.

We can then use it for forward function declarations.

But additionally, when declare a function that takes a function
pointer or when we define a vtable of function pointers, they look
like pointers.

start_bg_wait_cb *pfn = my_cb;

int foo(struct child_process *cmd, start_bg_wait_cb *cb);

Or if we look a the target vtable in Trace2.  This looks like
a structure of (function) pointers.

struct tr2_tgt {
	tr2_tgt_init_t                          *pfn_init;
	tr2_tgt_term_t                          *pfn_term;
	...
};

So I prefer to leave the star out of function typedef and then
we can use the typedef in both contexts.

Jeff
Junio C Hamano Sept. 23, 2021, 6:37 p.m. UTC | #3
Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> I played with the diff below on top of this, I can't remember if it was
> noted already, but the way you declare function ptrs and use them isn't
> the usual style:
>
> -- >8 --
> diff --git a/run-command.c b/run-command.c
> index 76bbef9d96d..5c831545201 100644
> --- a/run-command.c
> +++ b/run-command.c
> @@ -1903,7 +1903,7 @@ void prepare_other_repo_env(struct strvec *env_array, const char *new_git_dir)
>  }
>  
>  enum start_bg_result start_bg_command(struct child_process *cmd,
> -				      start_bg_wait_cb *wait_cb,
> +				      start_bg_wait_cb wait_cb,
> -typedef int(start_bg_wait_cb)(const struct child_process *cmd, void *cb_data);
> +typedef int (*start_bg_wait_cb)(const struct child_process *cmd, void *cb_data);

I have no comment on the "default" thing, but I agree that the
preimage does look _unusual_ in our codebase.  You cannot even
declare a "variable" of that type with the typedef, i.e.

	start_bg_wait_cb an_instance_of_that;

If there is a good reason behind choosing the unusual "the type of
the function is...", that is OK, but otherwise...
Adam Dinwoodie Nov. 4, 2021, 7:46 p.m. UTC | #4
On Monday 20 September 2021 at 03:36 pm +0000, Jeff Hostetler via GitGitGadget wrote:
> From: Jeff Hostetler <jeffhost@microsoft.com>
> 
> Convert test helper to use `start_bg_command()` when spawning a server
> daemon in the background rather than blocks of platform-specific code.
> 
> Also, while here, remove _() translation around error messages since
> this is a test helper and not Git code.

As part of testing the v2.34.0-rc0 release candidate on Cygwin, I've
found this commit -- 05881a6fc9 (t/helper/simple-ipc: convert
test-simple-ipc to use start_bg_command, 2021-09-20), according to my
bisecting -- is causing t0052.1 to fail on 32-bit Cygwin.  Somehow this
is only affecting the 32-bit Cygwin build; the 64-bit build is working
as expected.

Specifically, the failure I'm seeing is as below:

```
$ GIT_TRACE=1 sh t0052-simple-ipc.sh -vix
trace: built-in: git init '/cygdrive/d/git-32/t/trash directory.t0052-simple-ipc'
Initialized empty Git repository in /cygdrive/d/git-32/t/trash directory.t0052-simple-ipc/.git/
expecting success of 0052.1 'start simple command server': 
	test_atexit stop_simple_IPC_server &&
	test-tool simple-ipc start-daemon --threads=8 &&
	test-tool simple-ipc is-active

++ test_atexit stop_simple_IPC_server
++ test 0 = 0
++ test_atexit_cleanup='{ stop_simple_IPC_server
		} && (exit "$eval_ret"); eval_ret=$?; :'
++ test-tool simple-ipc start-daemon --threads=8
trace: run_command: test-tool simple-ipc run-daemon --name=ipc-test --threads=8
error: daemon failed to start
error: last command exited with $?=1
not ok 1 - start simple command server
#	
#		test_atexit stop_simple_IPC_server &&
#		test-tool simple-ipc start-daemon --threads=8 &&
#		test-tool simple-ipc is-active
#	
++ stop_simple_IPC_server
++ test-tool simple-ipc stop-daemon
++ exit 1
++ eval_ret=1
++ :
```

I've had a look at the code changes, and cannot work out what might be
being handled differently in 32-bit and 64-bit Cygwin environments.
Given the Cygwin project is considering dropping support for 32-bit
Cygwin anyway, it might not be worth doing anything about this.  But I
thought it worth reporting in case there's something obvious to folk
more familiar with this code.
Ramsay Jones Nov. 4, 2021, 8:14 p.m. UTC | #5
Hi Adam,

On 04/11/2021 19:46, Adam Dinwoodie wrote:
> On Monday 20 September 2021 at 03:36 pm +0000, Jeff Hostetler via GitGitGadget wrote:
>> From: Jeff Hostetler <jeffhost@microsoft.com>
>>
>> Convert test helper to use `start_bg_command()` when spawning a server
>> daemon in the background rather than blocks of platform-specific code.
>>
>> Also, while here, remove _() translation around error messages since
>> this is a test helper and not Git code.
> 
> As part of testing the v2.34.0-rc0 release candidate on Cygwin, I've
> found this commit -- 05881a6fc9 (t/helper/simple-ipc: convert
> test-simple-ipc to use start_bg_command, 2021-09-20), according to my
> bisecting -- is causing t0052.1 to fail on 32-bit Cygwin.  Somehow this
> is only affecting the 32-bit Cygwin build; the 64-bit build is working
> as expected.

Hmmm, I am seeing exactly the same, but on 64-bit cygwin!

I haven't found time to look at this in detail yet (except for
what you have already done). Unfortunately, about an hour ago,
I did a 'make test' for the '-rc1' build, so I won't be able to
take a look for hours yet, ... :(

ATB,
Ramsay Jones
Jeff Hostetler Nov. 8, 2021, 2:58 p.m. UTC | #6
On 11/4/21 3:46 PM, Adam Dinwoodie wrote:
> On Monday 20 September 2021 at 03:36 pm +0000, Jeff Hostetler via GitGitGadget wrote:
>> From: Jeff Hostetler <jeffhost@microsoft.com>
>>
>> Convert test helper to use `start_bg_command()` when spawning a server
>> daemon in the background rather than blocks of platform-specific code.
>>
>> Also, while here, remove _() translation around error messages since
>> this is a test helper and not Git code.
> 
> As part of testing the v2.34.0-rc0 release candidate on Cygwin, I've
> found this commit -- 05881a6fc9 (t/helper/simple-ipc: convert
> test-simple-ipc to use start_bg_command, 2021-09-20), according to my
> bisecting -- is causing t0052.1 to fail on 32-bit Cygwin.  Somehow this
> is only affecting the 32-bit Cygwin build; the 64-bit build is working
> as expected.
> 
> Specifically, the failure I'm seeing is as below:
> 
> ```
> $ GIT_TRACE=1 sh t0052-simple-ipc.sh -vix
> trace: built-in: git init '/cygdrive/d/git-32/t/trash directory.t0052-simple-ipc'
> Initialized empty Git repository in /cygdrive/d/git-32/t/trash directory.t0052-simple-ipc/.git/
> expecting success of 0052.1 'start simple command server':
> 	test_atexit stop_simple_IPC_server &&
> 	test-tool simple-ipc start-daemon --threads=8 &&
> 	test-tool simple-ipc is-active
> 
> ++ test_atexit stop_simple_IPC_server
> ++ test 0 = 0
> ++ test_atexit_cleanup='{ stop_simple_IPC_server
> 		} && (exit "$eval_ret"); eval_ret=$?; :'
> ++ test-tool simple-ipc start-daemon --threads=8
> trace: run_command: test-tool simple-ipc run-daemon --name=ipc-test --threads=8
> error: daemon failed to start
> error: last command exited with $?=1
> not ok 1 - start simple command server
> #	
> #		test_atexit stop_simple_IPC_server &&
> #		test-tool simple-ipc start-daemon --threads=8 &&
> #		test-tool simple-ipc is-active
> #	
> ++ stop_simple_IPC_server
> ++ test-tool simple-ipc stop-daemon
> ++ exit 1
> ++ eval_ret=1
> ++ :
> ```
> 
> I've had a look at the code changes, and cannot work out what might be
> being handled differently in 32-bit and 64-bit Cygwin environments.
> Given the Cygwin project is considering dropping support for 32-bit
> Cygwin anyway, it might not be worth doing anything about this.  But I
> thought it worth reporting in case there's something obvious to folk
> more familiar with this code.
> 

How odd!  Thanks for the report.  I'll investigate.
Jeff
Johannes Schindelin Nov. 8, 2021, 11:59 p.m. UTC | #7
Hi Adam,

On Thu, 4 Nov 2021, Adam Dinwoodie wrote:

> On Monday 20 September 2021 at 03:36 pm +0000, Jeff Hostetler via GitGitGadget wrote:
> > From: Jeff Hostetler <jeffhost@microsoft.com>
> >
> > Convert test helper to use `start_bg_command()` when spawning a server
> > daemon in the background rather than blocks of platform-specific code.
> >
> > Also, while here, remove _() translation around error messages since
> > this is a test helper and not Git code.
>
> As part of testing the v2.34.0-rc0 release candidate on Cygwin, I've
> found this commit -- 05881a6fc9 (t/helper/simple-ipc: convert
> test-simple-ipc to use start_bg_command, 2021-09-20), according to my
> bisecting -- is causing t0052.1 to fail on 32-bit Cygwin.  Somehow this
> is only affecting the 32-bit Cygwin build; the 64-bit build is working
> as expected.
>
> Specifically, the failure I'm seeing is as below:
>
> ```
> $ GIT_TRACE=1 sh t0052-simple-ipc.sh -vix
> trace: built-in: git init '/cygdrive/d/git-32/t/trash directory.t0052-simple-ipc'
> Initialized empty Git repository in /cygdrive/d/git-32/t/trash directory.t0052-simple-ipc/.git/
> expecting success of 0052.1 'start simple command server':
> 	test_atexit stop_simple_IPC_server &&
> 	test-tool simple-ipc start-daemon --threads=8 &&
> 	test-tool simple-ipc is-active
>
> ++ test_atexit stop_simple_IPC_server
> ++ test 0 = 0
> ++ test_atexit_cleanup='{ stop_simple_IPC_server
> 		} && (exit "$eval_ret"); eval_ret=$?; :'
> ++ test-tool simple-ipc start-daemon --threads=8
> trace: run_command: test-tool simple-ipc run-daemon --name=ipc-test --threads=8
> error: daemon failed to start
> error: last command exited with $?=1
> not ok 1 - start simple command server
> #
> #		test_atexit stop_simple_IPC_server &&
> #		test-tool simple-ipc start-daemon --threads=8 &&
> #		test-tool simple-ipc is-active
> #
> ++ stop_simple_IPC_server
> ++ test-tool simple-ipc stop-daemon
> ++ exit 1
> ++ eval_ret=1
> ++ :
> ```
>
> I've had a look at the code changes, and cannot work out what might be
> being handled differently in 32-bit and 64-bit Cygwin environments.
> Given the Cygwin project is considering dropping support for 32-bit
> Cygwin anyway, it might not be worth doing anything about this.  But I
> thought it worth reporting in case there's something obvious to folk
> more familiar with this code.

I had a look at this and could reproduce... partially. I only managed to
make it fail every once in a while.

Digging deeper, it turns out that the `lstat()` call in
`ipc_get_active_state()` does not receive an `st_mode` indicating a
socket, but rather a file (in my tests, it was usually 0100644, but
sometimes even 0100755).

The reason is, of course, that Cygwin _emulates_ Unix sockets. What is in
the file system is just a special file, it is marked with the `system` bit
(which only exists on Windows), and its contents start with the tell-tale
`!<socket>`.

And as you might have guessed, there is a race going on between Cygwin
writing that file _and_ flipping that `system` bit, and Git trying to
access the Unix socket and encountering an unexpected file.

Now, why this only happens in your 32-bit setup, I have no idea.

In my tests, the following patch works around the issue. Could I ask you
to test it in your environment?

-- snip --
diff --git a/compat/simple-ipc/ipc-unix-socket.c
b/compat/simple-ipc/ipc-unix-socket.c
index 4e28857a0a..1c591b2adf 100644
--- a/compat/simple-ipc/ipc-unix-socket.c
+++ b/compat/simple-ipc/ipc-unix-socket.c
@@ -36,6 +36,23 @@ enum ipc_active_state ipc_get_active_state(const char
*path)
 	}

 	/* also complain if a plain file is in the way */
+#ifdef __CYGWIN__
+	{
+		static const int delay[] = { 1, 10, 20, 40, -1 };
+		int i;
+
+		for (i = 0; S_ISREG(st.st_mode) && delay[i] > 0; i++) {
+			/*
+			 * Cygwin might still be in the process of marking the
+			 * underlying file as a system file.
+			 */
+			sleep_millisec(delay[i]);
+			if (lstat(path, &st) == -1)
+				return IPC_STATE__INVALID_PATH;
+		}
+	}
+#endif
+
 	if ((st.st_mode & S_IFMT) != S_IFSOCK)
 		return IPC_STATE__INVALID_PATH;

-- snap --

FWIW it looks as if the loop might be a bit of an overkill, as I could not
get the code to need more than a single one-millisecond sleep, but it's
probably safer to just keep the delay loop in place as-is.

Ciao,
Dscho
Ramsay Jones Nov. 9, 2021, 6:53 p.m. UTC | #8
On 08/11/2021 23:59, Johannes Schindelin wrote:
[snip]
> I had a look at this and could reproduce... partially. I only managed to
> make it fail every once in a while.
> 
> Digging deeper, it turns out that the `lstat()` call in
> `ipc_get_active_state()` does not receive an `st_mode` indicating a
> socket, but rather a file (in my tests, it was usually 0100644, but
> sometimes even 0100755).
> 
> The reason is, of course, that Cygwin _emulates_ Unix sockets. What is in
> the file system is just a special file, it is marked with the `system` bit
> (which only exists on Windows), and its contents start with the tell-tale
> `!<socket>`.
> 
> And as you might have guessed, there is a race going on between Cygwin
> writing that file _and_ flipping that `system` bit, and Git trying to
> access the Unix socket and encountering an unexpected file.
> 
> Now, why this only happens in your 32-bit setup, I have no idea.
> 
> In my tests, the following patch works around the issue. Could I ask you
> to test it in your environment?

Just FYI, I just tried the patch below (on 64-bit cygwin) and this test
now works fine for me. (well, run 5 times by hand - not with --stress).

This is on windows 10 21H1 and cygwin:

  $ uname -a
  CYGWIN_NT-10.0 satellite 3.3.2(0.341/5/3) 2021-11-08 16:55 x86_64 Cygwin
  $ 

[Yes, I updated last night!]

ATB,
Ramsay Jones

> -- snip --
> diff --git a/compat/simple-ipc/ipc-unix-socket.c
> b/compat/simple-ipc/ipc-unix-socket.c
> index 4e28857a0a..1c591b2adf 100644
> --- a/compat/simple-ipc/ipc-unix-socket.c
> +++ b/compat/simple-ipc/ipc-unix-socket.c
> @@ -36,6 +36,23 @@ enum ipc_active_state ipc_get_active_state(const char
> *path)
>  	}
> 
>  	/* also complain if a plain file is in the way */
> +#ifdef __CYGWIN__
> +	{
> +		static const int delay[] = { 1, 10, 20, 40, -1 };
> +		int i;
> +
> +		for (i = 0; S_ISREG(st.st_mode) && delay[i] > 0; i++) {
> +			/*
> +			 * Cygwin might still be in the process of marking the
> +			 * underlying file as a system file.
> +			 */
> +			sleep_millisec(delay[i]);
> +			if (lstat(path, &st) == -1)
> +				return IPC_STATE__INVALID_PATH;
> +		}
> +	}
> +#endif
> +
>  	if ((st.st_mode & S_IFMT) != S_IFSOCK)
>  		return IPC_STATE__INVALID_PATH;
> 
> -- snap --
> 
> FWIW it looks as if the loop might be a bit of an overkill, as I could not
> get the code to need more than a single one-millisecond sleep, but it's
> probably safer to just keep the delay loop in place as-is.
> 
> Ciao,
> Dscho
>
Johannes Schindelin Nov. 9, 2021, 11:01 p.m. UTC | #9
Hi Ramsay,

On Tue, 9 Nov 2021, Ramsay Jones wrote:

> On 08/11/2021 23:59, Johannes Schindelin wrote:
> [snip]
> > I had a look at this and could reproduce... partially. I only managed to
> > make it fail every once in a while.
> >
> > Digging deeper, it turns out that the `lstat()` call in
> > `ipc_get_active_state()` does not receive an `st_mode` indicating a
> > socket, but rather a file (in my tests, it was usually 0100644, but
> > sometimes even 0100755).
> >
> > The reason is, of course, that Cygwin _emulates_ Unix sockets. What is in
> > the file system is just a special file, it is marked with the `system` bit
> > (which only exists on Windows), and its contents start with the tell-tale
> > `!<socket>`.
> >
> > And as you might have guessed, there is a race going on between Cygwin
> > writing that file _and_ flipping that `system` bit, and Git trying to
> > access the Unix socket and encountering an unexpected file.
> >
> > Now, why this only happens in your 32-bit setup, I have no idea.
> >
> > In my tests, the following patch works around the issue. Could I ask you
> > to test it in your environment?
>
> Just FYI, I just tried the patch below (on 64-bit cygwin) and this test
> now works fine for me. (well, run 5 times by hand - not with --stress).

Very good!

I fear that it is a bit late in the -rc cycle to try to get this into the
official v2.34.0. Adam, since you are the maintainer of the Cygwin git
package, would you mind incorporating this patch into Cygwin's version of
Git?

> This is on windows 10 21H1 and cygwin:
>
>   $ uname -a
>   CYGWIN_NT-10.0 satellite 3.3.2(0.341/5/3) 2021-11-08 16:55 x86_64 Cygwin
>   $
>
> [Yes, I updated last night!]

Good thing, too: v3.3.2 fixes a critical bug in the pipe code. One symptom
was that you could not use Git Credential Manager Core as credential
helper (because Git thought that the helper had hung up, well before the
helper sent any information).

Ciao,
Dscho

>
> ATB,
> Ramsay Jones
>
> > -- snip --
> > diff --git a/compat/simple-ipc/ipc-unix-socket.c
> > b/compat/simple-ipc/ipc-unix-socket.c
> > index 4e28857a0a..1c591b2adf 100644
> > --- a/compat/simple-ipc/ipc-unix-socket.c
> > +++ b/compat/simple-ipc/ipc-unix-socket.c
> > @@ -36,6 +36,23 @@ enum ipc_active_state ipc_get_active_state(const char
> > *path)
> >  	}
> >
> >  	/* also complain if a plain file is in the way */
> > +#ifdef __CYGWIN__
> > +	{
> > +		static const int delay[] = { 1, 10, 20, 40, -1 };
> > +		int i;
> > +
> > +		for (i = 0; S_ISREG(st.st_mode) && delay[i] > 0; i++) {
> > +			/*
> > +			 * Cygwin might still be in the process of marking the
> > +			 * underlying file as a system file.
> > +			 */
> > +			sleep_millisec(delay[i]);
> > +			if (lstat(path, &st) == -1)
> > +				return IPC_STATE__INVALID_PATH;
> > +		}
> > +	}
> > +#endif
> > +
> >  	if ((st.st_mode & S_IFMT) != S_IFSOCK)
> >  		return IPC_STATE__INVALID_PATH;
> >
> > -- snap --
> >
> > FWIW it looks as if the loop might be a bit of an overkill, as I could not
> > get the code to need more than a single one-millisecond sleep, but it's
> > probably safer to just keep the delay loop in place as-is.
> >
> > Ciao,
> > Dscho
> >
>
>
Junio C Hamano Nov. 9, 2021, 11:34 p.m. UTC | #10
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> I fear that it is a bit late in the -rc cycle to try to get this into the
> official v2.34.0. Adam, since you are the maintainer of the Cygwin git
> package, would you mind incorporating this patch into Cygwin's version of
> Git?

I do not mind taking a Cygwin-only #ifdef block in compat/ like we
see below from folks who have stake in Cygwin, and who are clearly
leading Cygwin users on the list, like Ramsay and Adam are, even
after I tag -rc2.

I cannot give the change any better test than they can, and it is
their platform to improve, or break by accident while trying to do
so.

Thanks.
Johannes Schindelin Nov. 10, 2021, 12:27 p.m. UTC | #11
Hi Junio,

On Tue, 9 Nov 2021, Junio C Hamano wrote:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>
> > I fear that it is a bit late in the -rc cycle to try to get this into the
> > official v2.34.0. Adam, since you are the maintainer of the Cygwin git
> > package, would you mind incorporating this patch into Cygwin's version of
> > Git?
>
> I do not mind taking a Cygwin-only #ifdef block in compat/ like we
> see below from folks who have stake in Cygwin, and who are clearly
> leading Cygwin users on the list, like Ramsay and Adam are, even
> after I tag -rc2.

Thank you for your encouragement, I contributed it here:
https://lore.kernel.org/git/pull.1074.git.1636542550889.gitgitgadget@gmail.com

> I cannot give the change any better test than they can, and it is
> their platform to improve, or break by accident while trying to do
> so.

Right. I tested this as well as I could, via the `--stress` option, and am
fairly confident that it is correct. Since the patch touches only
`simply-ipc` code, the only test that could possibly affected is t0052,
and it passes with `--stress` over here (when it failed without the
patch).

Ciao,
Dscho

P.S.: in case you wondered, no, I did not run the entire test suite. With
the performance characteristics of the POSIX emulation provided by the
Cygwin runtime, this would simply take too long. It's not the first time I
wish our test suite was more efficient, across _all_ supported platforms.
Adam Dinwoodie Nov. 12, 2021, 8:56 a.m. UTC | #12
On Wed, 10 Nov 2021 at 12:28, Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
>
> Hi Junio,
>
> On Tue, 9 Nov 2021, Junio C Hamano wrote:
>
> > Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> >
> > > I fear that it is a bit late in the -rc cycle to try to get this into the
> > > official v2.34.0. Adam, since you are the maintainer of the Cygwin git
> > > package, would you mind incorporating this patch into Cygwin's version of
> > > Git?
> >
> > I do not mind taking a Cygwin-only #ifdef block in compat/ like we
> > see below from folks who have stake in Cygwin, and who are clearly
> > leading Cygwin users on the list, like Ramsay and Adam are, even
> > after I tag -rc2.
>
> Thank you for your encouragement, I contributed it here:
> https://lore.kernel.org/git/pull.1074.git.1636542550889.gitgitgadget@gmail.com
>
> > I cannot give the change any better test than they can, and it is
> > their platform to improve, or break by accident while trying to do
> > so.
>
> Right. I tested this as well as I could, via the `--stress` option, and am
> fairly confident that it is correct. Since the patch touches only
> `simply-ipc` code, the only test that could possibly affected is t0052,
> and it passes with `--stress` over here (when it failed without the
> patch).
>
> Ciao,
> Dscho
>
> P.S.: in case you wondered, no, I did not run the entire test suite. With
> the performance characteristics of the POSIX emulation provided by the
> Cygwin runtime, this would simply take too long. It's not the first time I
> wish our test suite was more efficient, across _all_ supported platforms.

I have just run the complete test suite on rc2, both with and without
this patch, and I can confirm it resolves this problem and doesn't
cause any other new test failures.

But yes, the (lack of) speed of running the Git test suite on Cygwin
is one of the reasons I run the tests on high-spec Azure VMs rather
than my own systems. Unfortunately the Cygwin compatibility layer plus
the overheads of NTFS mean things are unlikely to get significantly
quicker any time soon, and between WSL and Git for Windows, I expect
interest in improving Cygwin's performance is going to continue to
wane.
Junio C Hamano Nov. 12, 2021, 4:01 p.m. UTC | #13
Adam Dinwoodie <adam@dinwoodie.org> writes:

> I have just run the complete test suite on rc2, both with and without
> this patch, and I can confirm it resolves this problem and doesn't
> cause any other new test failures.

Thanks for a good news.

> But yes, the (lack of) speed of running the Git test suite on Cygwin
> is one of the reasons I run the tests on high-spec Azure VMs rather
> than my own systems. Unfortunately the Cygwin compatibility layer plus
> the overheads of NTFS mean things are unlikely to get significantly
> quicker any time soon, and between WSL and Git for Windows, I expect
> interest in improving Cygwin's performance is going to continue to
> wane.

Out of curiosity, are the use cases and user base of Cygwin waning,
or are there still viable cases where Cygwin is a more preferred
solution over WSL (the question is not limited to use of Git)?
Adam Dinwoodie Nov. 12, 2021, 9:33 p.m. UTC | #14
On Fri, 12 Nov 2021 at 16:01, Junio C Hamano <gitster@pobox.com> wrote:
>
> Adam Dinwoodie <adam@dinwoodie.org> writes:
> > But yes, the (lack of) speed of running the Git test suite on Cygwin
> > is one of the reasons I run the tests on high-spec Azure VMs rather
> > than my own systems. Unfortunately the Cygwin compatibility layer plus
> > the overheads of NTFS mean things are unlikely to get significantly
> > quicker any time soon, and between WSL and Git for Windows, I expect
> > interest in improving Cygwin's performance is going to continue to
> > wane.
>
> Out of curiosity, are the use cases and user base of Cygwin waning,
> or are there still viable cases where Cygwin is a more preferred
> solution over WSL (the question is not limited to use of Git)?

No formal research here, just impressions as someone who has used
Cygwin for a long time and who hangs out on the Cygwin mailing list:
for a lot of use cases, WSL is at least as good, if not better, than
Cygwin. There are a few areas where Cygwin is still a better solution,
though:

- WSL requires essentially installing an entire operating system. Disk
space is relatively cheap, so that's not nearly the obstacle it used
to be, but it is an obstacle. This is more relevant for people who
want to distribute packaged installers to Windows users: most
non-technical users won't want to get WSL working, but if you've
written code for *nix and don't want to port it manually to Windows,
it's relatively straightforward to compile it using Cygwin and bundle
the cygwin1.dll file with the installer. That'll mostly get your code
working with a user experience that doesn't differ too much from a
fully native Windows application. (This is essentially what Git for
Windows is doing, albeit with an increasingly distant Cygwin fork.)

- There are some functions that Cygwin offers that WSL doesn't. The
key one for me is the ability to access Windows network file shares,
which WSL doesn't support (or at least didn't last time I checked). I
expect some of these gaps will disappear as WSL gets more features,
but I expect some of them are fairly fundamental restrictions: Cygwin
applications can have code specifically to handle the fact that
there's a Windows OS there, so they can -- with care -- interact with
the Windows OS directly to (say) use Windows file access APIs or the
Windows clipboard. WSL applications generally don't have that ability;
if I install something from apt on my Debian WSL installation, it'll
pull exactly the same binary as if I'd installed it on a normal Debian
system. I guess in theory people could write code to detect that
they're running in WSL and handle that specially, in the same way that
it's normally possible to detect and handle when you're running in a
VM versus running on bare metal. I expect that'll be much less common,
though, just as Git has code for handling Cygwin specially but doesn't
have code for handling Linux-within-WSL specially, even though both
could be used to access a Git repository stored in the same Windows
NTFS directory.

I expect some folk who historically have used Cygwin will shift over
to WSL, some will stick with Cygwin, and a small number (as I do) will
use both in parallel for slightly different jobs.

tl;dr IMO both is true: WSL is better than Cygwin for some use cases
so the user base is waning, but Cygwin is still a very viable
preference over WSL for other use cases.
Ramsay Jones Nov. 13, 2021, 7:11 p.m. UTC | #15
On 10/11/2021 12:27, Johannes Schindelin wrote:
[snip]
>> I cannot give the change any better test than they can, and it is
>> their platform to improve, or break by accident while trying to do
>> so.
> 
> Right. I tested this as well as I could, via the `--stress` option, and am
> fairly confident that it is correct. Since the patch touches only
> `simply-ipc` code, the only test that could possibly affected is t0052,
> and it passes with `--stress` over here (when it failed without the
> patch).
> 
> Ciao,
> Dscho
> 
> P.S.: in case you wondered, no, I did not run the entire test suite. With
> the performance characteristics of the POSIX emulation provided by the
> Cygwin runtime, this would simply take too long. It's not the first time I
> wish our test suite was more efficient, across _all_ supported platforms.


[I seem to have lost Adam's reply about this now being Hunky-Dory, but ...]

I ran the test-suite on -rc2 on thursday night (note _not_ rc2 + this patch)
and it deadlocked on me; I didn't notice for 4 hours, so (in the early hours)
I simply Ctl+C-ed it and went to bed. I haven't had the test-suite deadlock
for many many years - I've been spoilt! ;-)

I tried -rc2 again last night; this time it finished, but I gained another
test failure: t0301-credential-cache.sh. I have _never_ had this test fail
before, so that was unexpected. :(

[Yes, t0052-simple-ipc.sh failed as expected, since this patch was not
applied].

Also, I was half expecting a small speed-up due to the new pipe code in
v3.3.2 of the cygwin dll, but it actually took an hour longer than normal. :(

The only change to my setup, between -rc1 and -rc2, was the cygwin update
to v3.3.2, so this may point to some more fallout from the new pipe code
(maybe?).

Anyway, I haven't even looked at the new failure (see below), which we will
probably not have time to fix before release, so I am just now building
current master (v2.34.0-rc2-16-g5a73c6bdc7) to give that a try. (So, I won't
have anything to report until tomorrow).

Just FYI:

  $ ./t0301-credential-cache.sh
  ...
  not ok 13 - socket defaults to ~/.cache/git/credential/socket
  #
  #               test_when_finished "
  #                       git credential-cache exit &&
  #                       rmdir -p .cache/git/credential/
  #               " &&
  #               test_path_is_missing "$HOME/.git-credential-cache" &&
  #               test_path_is_socket "$HOME/.cache/git/credential/socket"
  #
  ...
  not ok 26 - use custom XDG_CACHE_HOME if set and default sockets are not created
  #
  #               test_when_finished "git credential-cache exit" &&
  #               test_path_is_socket "$XDG_CACHE_HOME/git/credential/socket" &&
  #               test_path_is_missing "$HOME/.git-credential-cache/socket" &&
  #               test_path_is_missing "$HOME/.cache/git/credential/socket"
  #
  not ok 27 - credential-cache --socket option overrides default location
  #
  #               test_when_finished "
  #                       git credential-cache exit --socket \"\$HOME/dir/socket\" &&
  #                       rmdir \"\$HOME/dir\"
  #               " &&
  #               check approve "cache --socket \"\$HOME/dir/socket\"" <<-\EOF &&
  #               protocol=https
  #               host=example.com
  #               username=store-user
  #               password=store-pass
  #               EOF
  #               test_path_is_socket "$HOME/dir/socket"
  #
  not ok 28 - use custom XDG_CACHE_HOME even if xdg socket exists
  #
  #               test_when_finished "
  #                       git credential-cache exit &&
  #                       sane_unset XDG_CACHE_HOME
  #               " &&
  #               check approve cache <<-\EOF &&
  #               protocol=https
  #               host=example.com
  #               username=store-user
  #               password=store-pass
  #               EOF
  #               test_path_is_socket "$HOME/.cache/git/credential/socket" &&
  #               XDG_CACHE_HOME="$HOME/xdg" &&
  #               export XDG_CACHE_HOME &&
  #               check approve cache <<-\EOF &&
  #               protocol=https
  #               host=example.com
  #               username=store-user
  #               password=store-pass
  #               EOF
  #               test_path_is_socket "$XDG_CACHE_HOME/git/credential/socket"
  #
  not ok 29 - use user socket if user directory exists
  #
  #               test_when_finished "
  #                       git credential-cache exit &&
  #                       rmdir \"\$HOME/.git-credential-cache/\"
  #               " &&
  #               mkdir -p "$HOME/.git-credential-cache/" &&
  #               chmod 700 "$HOME/.git-credential-cache/" &&
  #               check approve cache <<-\EOF &&
  #               protocol=https
  #               host=example.com
  #               username=store-user
  #               password=store-pass
  #               EOF
  #               test_path_is_socket "$HOME/.git-credential-cache/socket"
  #
  not ok 30 - use user socket if user directory is a symlink to a directory
  #
  #               test_when_finished "
  #                       git credential-cache exit &&
  #                       rmdir \"\$HOME/dir/\" &&
  #                       rm \"\$HOME/.git-credential-cache\"
  #               " &&
  #               mkdir -p -m 700 "$HOME/dir/" &&
  #               ln -s "$HOME/dir" "$HOME/.git-credential-cache" &&
  #               check approve cache <<-\EOF &&
  #               protocol=https
  #               host=example.com
  #               username=store-user
  #               password=store-pass
  #               EOF
  #               test_path_is_socket "$HOME/.git-credential-cache/socket"
  #
  ok 31 - helper (cache --timeout=1) times out
  # failed 6 among 31 test(s)
  1..31
  $
  
ATB,
Ramsay Jones
Ramsay Jones Nov. 14, 2021, 7:34 p.m. UTC | #16
On 13/11/2021 19:11, Ramsay Jones wrote:
[snip]
> The only change to my setup, between -rc1 and -rc2, was the cygwin update
> to v3.3.2, so this may point to some more fallout from the new pipe code
> (maybe?).
> 
> Anyway, I haven't even looked at the new failure (see below), which we will
> probably not have time to fix before release, so I am just now building
> current master (v2.34.0-rc2-16-g5a73c6bdc7) to give that a try. (So, I won't
> have anything to report until tomorrow).

So, current 'master' fixes t0052-simple-ipc.sh, which is good, but the
t0301-credential-cache.sh test is still failing. Also, I can confirm
that cygwin v3.3.2 adds an additional hour to a test-suite run. :(

[ie it now takes 6 hours rather than 5 hours to run - I remember a time
when it used to only take 2 hours; those were the days!]

ATB,
Ramsay Jones
Adam Dinwoodie Nov. 14, 2021, 8:10 p.m. UTC | #17
On Sat, 13 Nov 2021 at 19:11, Ramsay Jones <ramsay@ramsayjones.plus.com> wrote:
> On 10/11/2021 12:27, Johannes Schindelin wrote:
> [snip]
> >> I cannot give the change any better test than they can, and it is
> >> their platform to improve, or break by accident while trying to do
> >> so.
> >
> > Right. I tested this as well as I could, via the `--stress` option, and am
> > fairly confident that it is correct. Since the patch touches only
> > `simply-ipc` code, the only test that could possibly affected is t0052,
> > and it passes with `--stress` over here (when it failed without the
> > patch).
> >
> > Ciao,
> > Dscho
> >
> > P.S.: in case you wondered, no, I did not run the entire test suite. With
> > the performance characteristics of the POSIX emulation provided by the
> > Cygwin runtime, this would simply take too long. It's not the first time I
> > wish our test suite was more efficient, across _all_ supported platforms.
>
>
> [I seem to have lost Adam's reply about this now being Hunky-Dory, but ...]
>
> I ran the test-suite on -rc2 on thursday night (note _not_ rc2 + this patch)
> and it deadlocked on me; I didn't notice for 4 hours, so (in the early hours)
> I simply Ctl+C-ed it and went to bed. I haven't had the test-suite deadlock
> for many many years - I've been spoilt! ;-)
>
> I tried -rc2 again last night; this time it finished, but I gained another
> test failure: t0301-credential-cache.sh. I have _never_ had this test fail
> before, so that was unexpected. :(
>
> [Yes, t0052-simple-ipc.sh failed as expected, since this patch was not
> applied].
>
> Also, I was half expecting a small speed-up due to the new pipe code in
> v3.3.2 of the cygwin dll, but it actually took an hour longer than normal. :(
>
> The only change to my setup, between -rc1 and -rc2, was the cygwin update
> to v3.3.2, so this may point to some more fallout from the new pipe code
> (maybe?).
>
> Anyway, I haven't even looked at the new failure (see below), which we will
> probably not have time to fix before release, so I am just now building
> current master (v2.34.0-rc2-16-g5a73c6bdc7) to give that a try. (So, I won't
> have anything to report until tomorrow).

I'm seeing the same failure. It isn't caused by a change in Git --
I've rebuilt and re-run the test on old versions where that test was
passing, and it's now failing -- so this is clearly something in the
Cygwin environment. I've not investigated further, but it's clearly
caused by a Cygwin change rather than a Git change, so I don't think
there's any reason to hold up the Git release.

(I should probably report it on the Cygwin mailing list, but I haven't
got around to that yet...)
Johannes Schindelin Nov. 16, 2021, 10:56 a.m. UTC | #18
Hi,

On Fri, 12 Nov 2021, Adam Dinwoodie wrote:

> On Fri, 12 Nov 2021 at 16:01, Junio C Hamano <gitster@pobox.com> wrote:
> >
> > Out of curiosity, are the use cases and user base of Cygwin waning,
> > or are there still viable cases where Cygwin is a more preferred
> > solution over WSL (the question is not limited to use of Git)?
>
> No formal research here, just impressions as someone who has used
> Cygwin for a long time and who hangs out on the Cygwin mailing list:
> for a lot of use cases, WSL is at least as good, if not better, than
> Cygwin. There are a few areas where Cygwin is still a better solution,
> though:
>
> - WSL requires essentially installing an entire operating system. Disk
> space is relatively cheap, so that's not nearly the obstacle it used
> to be, but it is an obstacle. This is more relevant for people who
> want to distribute packaged installers to Windows users: most
> non-technical users won't want to get WSL working, but if you've
> written code for *nix and don't want to port it manually to Windows,
> it's relatively straightforward to compile it using Cygwin and bundle
> the cygwin1.dll file with the installer. That'll mostly get your code
> working with a user experience that doesn't differ too much from a
> fully native Windows application. (This is essentially what Git for
> Windows is doing, albeit with an increasingly distant Cygwin fork.)
>
> - There are some functions that Cygwin offers that WSL doesn't. The
> key one for me is the ability to access Windows network file shares,
> which WSL doesn't support (or at least didn't last time I checked). I
> expect some of these gaps will disappear as WSL gets more features,
> but I expect some of them are fairly fundamental restrictions: Cygwin
> applications can have code specifically to handle the fact that
> there's a Windows OS there, so they can -- with care -- interact with
> the Windows OS directly to (say) use Windows file access APIs or the
> Windows clipboard. WSL applications generally don't have that ability;
> if I install something from apt on my Debian WSL installation, it'll
> pull exactly the same binary as if I'd installed it on a normal Debian
> system. I guess in theory people could write code to detect that
> they're running in WSL and handle that specially, in the same way that
> it's normally possible to detect and handle when you're running in a
> VM versus running on bare metal. I expect that'll be much less common,
> though, just as Git has code for handling Cygwin specially but doesn't
> have code for handling Linux-within-WSL specially, even though both
> could be used to access a Git repository stored in the same Windows
> NTFS directory.

I would like to add two additional scenarios where Cygwin needs to be used
instead of WSL:

- In order to install WSL, you need to switch your machine to Developer
  Mode, which requires administrative privileges (which not evey developer
  enjoys, and given recent and not so recent security news, maybe that's a
  good thing, too). Cygwin does not require administrative privileges.

- There is (finally!) a way to run graphical Linux applications in WSL,
  but it requires Windows 11. That excludes many existing Windows users.

So yeah, I think Cygwin is here to stay. Besides, as long as I don't find
any better way to have a POSIX-compliant shell (Git will continue to
depend on one for a long, long time, I expect), Git for Windows will
indirectly _have_ to depend on Cygwin (Git for Windows bundles a Bash
using the MSYS2 runtime, which is a very close fork of the Cygwin
runtime).

Ciao,
Dscho
Johannes Schindelin Nov. 16, 2021, 11:02 a.m. UTC | #19
Hi Adam,

On Fri, 12 Nov 2021, Adam Dinwoodie wrote:

> [...] the (lack of) speed of running the Git test suite on Cygwin
> is one of the reasons I run the tests on high-spec Azure VMs rather
> than my own systems. Unfortunately the Cygwin compatibility layer plus
> the overheads of NTFS mean things are unlikely to get significantly
> quicker any time soon, and between WSL and Git for Windows, I expect
> interest in improving Cygwin's performance is going to continue to
> wane.

Well, at least from the Git point of view, there is still _some_ hope. At
the Git Contributor Summit, we talked (very, very briefly) about moving
parts of Git's test infrastructure from shell code to C.

This would definitely help not only save electricity (and we all _do_ have
to get used to the idea that we cannot continue spending as much energy as
we do right now) when running the CI/PR builds, but also accelerate
running the test suite on Cygwin (or for that matter, I suspect HP NonStop
to be helped tremendously, too).

Ciao,
Dscho
diff mbox series

Patch

diff --git a/t/helper/test-simple-ipc.c b/t/helper/test-simple-ipc.c
index 91345180750..28365ff85b6 100644
--- a/t/helper/test-simple-ipc.c
+++ b/t/helper/test-simple-ipc.c
@@ -9,6 +9,7 @@ 
 #include "parse-options.h"
 #include "thread-utils.h"
 #include "strvec.h"
+#include "run-command.h"
 
 #ifndef SUPPORTS_SIMPLE_IPC
 int cmd__simple_ipc(int argc, const char **argv)
@@ -267,185 +268,71 @@  static int daemon__run_server(void)
 	 */
 	ret = ipc_server_run(cl_args.path, &opts, test_app_cb, (void*)&my_app_data);
 	if (ret == -2)
-		error(_("socket/pipe already in use: '%s'"), cl_args.path);
+		error("socket/pipe already in use: '%s'", cl_args.path);
 	else if (ret == -1)
-		error_errno(_("could not start server on: '%s'"), cl_args.path);
+		error_errno("could not start server on: '%s'", cl_args.path);
 
 	return ret;
 }
 
-#ifndef GIT_WINDOWS_NATIVE
-/*
- * This is adapted from `daemonize()`.  Use `fork()` to directly create and
- * run the daemon in a child process.
- */
-static int spawn_server(pid_t *pid)
-{
-	struct ipc_server_opts opts = {
-		.nr_threads = cl_args.nr_threads,
-	};
+static start_bg_wait_cb bg_wait_cb;
 
-	*pid = fork();
-
-	switch (*pid) {
-	case 0:
-		if (setsid() == -1)
-			error_errno(_("setsid failed"));
-		close(0);
-		close(1);
-		close(2);
-		sanitize_stdfds();
+static int bg_wait_cb(const struct child_process *cp, void *cb_data)
+{
+	int s = ipc_get_active_state(cl_args.path);
 
-		return ipc_server_run(cl_args.path, &opts, test_app_cb,
-				      (void*)&my_app_data);
+	switch (s) {
+	case IPC_STATE__LISTENING:
+		/* child is "ready" */
+		return 0;
 
-	case -1:
-		return error_errno(_("could not spawn daemon in the background"));
+	case IPC_STATE__NOT_LISTENING:
+	case IPC_STATE__PATH_NOT_FOUND:
+		/* give child more time */
+		return 1;
 
 	default:
-		return 0;
+	case IPC_STATE__INVALID_PATH:
+	case IPC_STATE__OTHER_ERROR:
+		/* all the time in world won't help */
+		return -1;
 	}
 }
-#else
-/*
- * Conceptually like `daemonize()` but different because Windows does not
- * have `fork(2)`.  Spawn a normal Windows child process but without the
- * limitations of `start_command()` and `finish_command()`.
- */
-static int spawn_server(pid_t *pid)
-{
-	char test_tool_exe[MAX_PATH];
-	struct strvec args = STRVEC_INIT;
-	int in, out;
-
-	GetModuleFileNameA(NULL, test_tool_exe, MAX_PATH);
-
-	in = open("/dev/null", O_RDONLY);
-	out = open("/dev/null", O_WRONLY);
-
-	strvec_push(&args, test_tool_exe);
-	strvec_push(&args, "simple-ipc");
-	strvec_push(&args, "run-daemon");
-	strvec_pushf(&args, "--name=%s", cl_args.path);
-	strvec_pushf(&args, "--threads=%d", cl_args.nr_threads);
-
-	*pid = mingw_spawnvpe(args.v[0], args.v, NULL, NULL, in, out, out);
-	close(in);
-	close(out);
-
-	strvec_clear(&args);
 
-	if (*pid < 0)
-		return error(_("could not spawn daemon in the background"));
-
-	return 0;
-}
-#endif
-
-/*
- * This is adapted from `wait_or_whine()`.  Watch the child process and
- * let it get started and begin listening for requests on the socket
- * before reporting our success.
- */
-static int wait_for_server_startup(pid_t pid_child)
+static int daemon__start_server(void)
 {
-	int status;
-	pid_t pid_seen;
-	enum ipc_active_state s;
-	time_t time_limit, now;
+	struct child_process cp = CHILD_PROCESS_INIT;
+	enum start_bg_result sbgr;
 
-	time(&time_limit);
-	time_limit += cl_args.max_wait_sec;
+	strvec_push(&cp.args, "test-tool");
+	strvec_push(&cp.args, "simple-ipc");
+	strvec_push(&cp.args, "run-daemon");
+	strvec_pushf(&cp.args, "--name=%s", cl_args.path);
+	strvec_pushf(&cp.args, "--threads=%d", cl_args.nr_threads);
 
-	for (;;) {
-		pid_seen = waitpid(pid_child, &status, WNOHANG);
+	cp.no_stdin = 1;
+	cp.no_stdout = 1;
+	cp.no_stderr = 1;
 
-		if (pid_seen == -1)
-			return error_errno(_("waitpid failed"));
+	sbgr = start_bg_command(&cp, bg_wait_cb, NULL, cl_args.max_wait_sec);
 
-		else if (pid_seen == 0) {
-			/*
-			 * The child is still running (this should be
-			 * the normal case).  Try to connect to it on
-			 * the socket and see if it is ready for
-			 * business.
-			 *
-			 * If there is another daemon already running,
-			 * our child will fail to start (possibly
-			 * after a timeout on the lock), but we don't
-			 * care (who responds) if the socket is live.
-			 */
-			s = ipc_get_active_state(cl_args.path);
-			if (s == IPC_STATE__LISTENING)
-				return 0;
-
-			time(&now);
-			if (now > time_limit)
-				return error(_("daemon not online yet"));
-
-			continue;
-		}
+	switch (sbgr) {
+	case SBGR_READY:
+		return 0;
 
-		else if (pid_seen == pid_child) {
-			/*
-			 * The new child daemon process shutdown while
-			 * it was starting up, so it is not listening
-			 * on the socket.
-			 *
-			 * Try to ping the socket in the odd chance
-			 * that another daemon started (or was already
-			 * running) while our child was starting.
-			 *
-			 * Again, we don't care who services the socket.
-			 */
-			s = ipc_get_active_state(cl_args.path);
-			if (s == IPC_STATE__LISTENING)
-				return 0;
+	default:
+	case SBGR_ERROR:
+	case SBGR_CB_ERROR:
+		return error("daemon failed to start");
 
-			/*
-			 * We don't care about the WEXITSTATUS() nor
-			 * any of the WIF*(status) values because
-			 * `cmd__simple_ipc()` does the `!!result`
-			 * trick on all function return values.
-			 *
-			 * So it is sufficient to just report the
-			 * early shutdown as an error.
-			 */
-			return error(_("daemon failed to start"));
-		}
+	case SBGR_TIMEOUT:
+		return error("daemon not online yet");
 
-		else
-			return error(_("waitpid is confused"));
+	case SBGR_DIED:
+		return error("daemon terminated");
 	}
 }
 
-/*
- * This process will start a simple-ipc server in a background process and
- * wait for it to become ready.  This is like `daemonize()` but gives us
- * more control and better error reporting (and makes it easier to write
- * unit tests).
- */
-static int daemon__start_server(void)
-{
-	pid_t pid_child;
-	int ret;
-
-	/*
-	 * Run the actual daemon in a background process.
-	 */
-	ret = spawn_server(&pid_child);
-	if (pid_child <= 0)
-		return ret;
-
-	/*
-	 * Let the parent wait for the child process to get started
-	 * and begin listening for requests on the socket.
-	 */
-	ret = wait_for_server_startup(pid_child);
-
-	return ret;
-}
-
 /*
  * This process will run a quick probe to see if a simple-ipc server
  * is active on this path.
@@ -548,7 +435,7 @@  static int client__stop_server(void)
 
 		time(&now);
 		if (now > time_limit)
-			return error(_("daemon has not shutdown yet"));
+			return error("daemon has not shutdown yet");
 	}
 }