diff mbox series

[RFC,iproute2-next,1/5] ip: Mount netns in child process instead of from inside the new namespace

Message ID 20231009182753.851551-2-toke@redhat.com (mailing list archive)
State RFC
Delegated to: David Ahern
Headers show
Series Persisting of mount namespaces along with network namespaces | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Toke Høiland-Jørgensen Oct. 9, 2023, 6:27 p.m. UTC
Refactor the netns creation code so that we offload the mounting of the
namespace file to a child process instead of bind mounting from inside the newly
created namespace.

This is done in preparation for also persisting a mount namespace; the mount
namespace reference cannot be bind-mounted from inside that namespace itself, so
we need to mount that from a child process anyway. The child process
approach (as well as some of the helper functions used for it) is adapted from
the code in the unshare(1) utility that is part of util-linux.

No functional change is intended with this patch.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 ip/ipnetns.c | 184 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 130 insertions(+), 54 deletions(-)
diff mbox series

Patch

diff --git a/ip/ipnetns.c b/ip/ipnetns.c
index 9d996832aef8..a05d84514326 100644
--- a/ip/ipnetns.c
+++ b/ip/ipnetns.c
@@ -1,4 +1,5 @@ 
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <stdlib.h>
 #define _ATFILE_SOURCE
 #include <sys/file.h>
 #include <sys/types.h>
@@ -7,6 +8,7 @@ 
 #include <sys/inotify.h>
 #include <sys/mount.h>
 #include <sys/syscall.h>
+#include <sys/eventfd.h>
 #include <stdio.h>
 #include <string.h>
 #include <sched.h>
@@ -25,6 +27,9 @@ 
 #include "namespace.h"
 #include "json_print.h"
 
+/* synchronize parent and child by pipe */
+#define PIPE_SYNC_BYTE	0x06
+
 static int usage(void)
 {
 	fprintf(stderr,
@@ -46,7 +51,6 @@  static int usage(void)
 static struct rtnl_handle rtnsh = { .fd = -1 };
 
 static int have_rtnl_getnsid = -1;
-static int saved_netns = -1;
 static struct link_filter filter;
 
 static int ipnetns_accept_msg(struct rtnl_ctrl_data *ctrl,
@@ -768,31 +772,131 @@  static int create_netns_dir(void)
 	return 0;
 }
 
-/* Obtain a FD for the current namespace, so we can reenter it later */
-static void netns_save(void)
+/**
+ * waitchild() - Wait for a process to exit successfully
+ * @pid: PID of the process to wait for
+ *
+ * Wait for a process to exit successfully and return its exit status.
+ */
+static int waitchild(int pid)
 {
-	if (saved_netns != -1)
-		return;
+	int rc, status;
+
+	do {
+		rc = waitpid(pid, &status, 0);
+		if (rc < 0) {
+			if (errno == EINTR)
+				continue;
+			return -errno;
+		}
+		if (WIFEXITED(status) &&
+		    WEXITSTATUS(status) != EXIT_SUCCESS)
+			return WEXITSTATUS(status);
+	} while (rc < 0);
 
-	saved_netns = open("/proc/self/ns/net", O_RDONLY | O_CLOEXEC);
-	if (saved_netns == -1) {
-		perror("Cannot open init namespace");
-		exit(1);
+	return 0;
+}
+
+/**
+ * sync_with_child() - Tell our child we're ready and wait for it to exit
+ * @pid: The pid of our child
+ * @fd: A file descriptor created with eventfd()
+ *
+ * This tells a child created with fork_and_wait() that we are ready for it to
+ * continue. Once we have done that, wait for our child to exit.
+ */
+static int sync_with_child(pid_t pid, int fd)
+{
+	uint64_t ch = PIPE_SYNC_BYTE;
+
+	write(fd, &ch, sizeof(ch));
+	close(fd);
+
+	return waitchild(pid);
+}
+
+/**
+ * fork_and_wait() - Fork and wait to be sync'd with
+ * @fd - A file descriptor created with eventfd() which should be passed to
+ *       sync_with_child()
+ *
+ * This creates an eventfd and forks. The parent process returns immediately,
+ * but the child waits for a %PIPE_SYNC_BYTE on the eventfd before returning.
+ * This allows the parent to perform some tasks before the child starts its
+ * work. The parent should call sync_with_child() once it is ready for the
+ * child to continue.
+ *
+ * Return: The pid from fork()
+ */
+static pid_t fork_and_wait(int *fd)
+{
+	uint64_t ch;
+	pid_t pid;
+
+	*fd = eventfd(0, 0);
+	if (*fd < 0) {
+		fprintf(stderr, "eventfd failed: %s\n", strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		fprintf(stderr, "fork failed: %s\n", strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	if (!pid) {
+		/* wait for the our parent to tell us to continue */
+		if (read(*fd, (char *)&ch, sizeof(ch)) != sizeof(ch) ||
+		    ch != PIPE_SYNC_BYTE) {
+			fprintf(stderr, "failed to read eventfd\n");
+			exit(EXIT_FAILURE);
+		}
+		close(*fd);
 	}
+
+	return pid;
 }
 
-static void netns_restore(void)
+static int bind_ns_file(const char *parent, const char *nsfile,
+			const char *ns_name, pid_t target_pid)
 {
-	if (saved_netns == -1)
-		return;
+	char ns_path[PATH_MAX], proc_path[PATH_MAX];
+	int fd;
 
-	if (setns(saved_netns, CLONE_NEWNET)) {
-		perror("setns");
-		exit(1);
+	snprintf(ns_path, sizeof(ns_path), "%s/%s", parent, ns_name);
+	snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/%s", target_pid, nsfile);
+
+	/* Create the filesystem state */
+	fd = open(ns_path, O_RDONLY|O_CREAT|O_EXCL, 0);
+	if (fd < 0) {
+		fprintf(stderr, "Cannot create namespace file \"%s\": %s\n",
+			ns_path, strerror(errno));
+		return -1;
 	}
+	close(fd);
 
-	close(saved_netns);
-	saved_netns = -1;
+	if (mount(proc_path, ns_path, "none", MS_BIND, NULL) < 0) {
+		fprintf(stderr, "Bind %s -> %s failed: %s\n", proc_path,
+			ns_path, strerror(errno));
+		unlink(ns_path);
+		return -1;
+	}
+	return 0;
+}
+
+static pid_t bind_ns_files_from_child(const char *ns_name, pid_t target_pid,
+				      int *fd)
+{
+	pid_t child;
+
+	child = fork_and_wait(fd);
+	if (child)
+		return child;
+
+	if (bind_ns_file(NETNS_RUN_DIR, "net", ns_name, target_pid))
+		exit(EXIT_FAILURE);
+	exit(EXIT_SUCCESS);
 }
 
 static int netns_add(int argc, char **argv, bool create)
@@ -808,10 +912,9 @@  static int netns_add(int argc, char **argv, bool create)
 	 * userspace tweaks like remounting /sys, or bind mounting
 	 * a new /etc/resolv.conf can be shared between users.
 	 */
-	char netns_path[PATH_MAX], proc_path[PATH_MAX];
 	const char *name;
-	pid_t pid;
-	int fd;
+	pid_t pid, child;
+	int event_fd;
 	int lock;
 	int made_netns_run_dir_mount = 0;
 
@@ -820,6 +923,7 @@  static int netns_add(int argc, char **argv, bool create)
 			fprintf(stderr, "No netns name specified\n");
 			return -1;
 		}
+		pid = getpid();
 	} else {
 		if (argc < 2) {
 			fprintf(stderr, "No netns name and PID specified\n");
@@ -833,8 +937,6 @@  static int netns_add(int argc, char **argv, bool create)
 	}
 	name = argv[0];
 
-	snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, name);
-
 	if (create_netns_dir())
 		return -1;
 
@@ -894,46 +996,20 @@  static int netns_add(int argc, char **argv, bool create)
 		close(lock);
 	}
 
-	/* Create the filesystem state */
-	fd = open(netns_path, O_RDONLY|O_CREAT|O_EXCL, 0);
-	if (fd < 0) {
-		fprintf(stderr, "Cannot create namespace file \"%s\": %s\n",
-			netns_path, strerror(errno));
-		return -1;
-	}
-	close(fd);
+	child = bind_ns_files_from_child(name, pid, &event_fd);
+	if (child < 0)
+		exit(EXIT_FAILURE);
 
 	if (create) {
-		netns_save();
 		if (unshare(CLONE_NEWNET) < 0) {
 			fprintf(stderr, "Failed to create a new network namespace \"%s\": %s\n",
 				name, strerror(errno));
-			goto out_delete;
+			close(event_fd);
+			exit(EXIT_FAILURE);
 		}
-
-		strcpy(proc_path, "/proc/self/ns/net");
-	} else {
-		snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid);
-	}
-
-	/* Bind the netns last so I can watch for it */
-	if (mount(proc_path, netns_path, "none", MS_BIND, NULL) < 0) {
-		fprintf(stderr, "Bind %s -> %s failed: %s\n",
-			proc_path, netns_path, strerror(errno));
-		goto out_delete;
 	}
-	netns_restore();
 
-	return 0;
-out_delete:
-	if (create) {
-		netns_restore();
-		netns_delete(argc, argv);
-	} else if (unlink(netns_path) < 0) {
-		fprintf(stderr, "Cannot remove namespace file \"%s\": %s\n",
-			netns_path, strerror(errno));
-	}
-	return -1;
+	return sync_with_child(child, event_fd);
 }
 
 int set_netnsid_from_name(const char *name, int nsid)