diff mbox

[v3] fs: only sync() superblocks reachable from the current namespace

Message ID 095717754a0f35cb8502b9763500c046cb8dcb87.1517008068.git.osandov@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Omar Sandoval Jan. 26, 2018, 11:09 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

Currently, the sync() syscall is system-wide, so any process in a
container can cause significant I/O stalls across the system by calling
sync(). This is even true for filesystems which are not accessible in
the process' mount namespace. This patch scopes sync() to only write out
filesystems reachable in the current mount namespace, except for the
initial mount namespace, which still syncs everything to avoid
surprises. This fixes the broken isolation we were seeing here.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
v2->v3: Fix stupid missed unlock bug
v1->v2: Grab mount_lock while iterating mounts

 fs/sync.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 14 deletions(-)

Comments

Matthew Wilcox Jan. 27, 2018, 6:49 a.m. UTC | #1
On Fri, Jan 26, 2018 at 03:09:53PM -0800, Omar Sandoval wrote:
> +static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns)

bool return value, surely?

> -static void sync_fs_one_sb(struct super_block *sb, void *arg)
> +static void sync_fs_one_sb(struct super_block *sb, void *p)
>  {
> -	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
> -		sb->s_op->sync_fs(sb, *(int *)arg);
> +	struct sb_sync *arg = p;
> +
> +	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) &&
> +	    sb->s_op->sync_fs)
> +		sb->s_op->sync_fs(sb, arg->wait);

The order of tests seems wrong.  There's no point in doing the expensive
"is it reachable" test before the cheap "can we sync it" test.

>  {
> -	int nowait = 0, wait = 1;
> +	struct sb_sync arg = {
> +		.mnt_ns = current->nsproxy->mnt_ns,
> +	};
> +
> +	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
> +		arg.mnt_ns = NULL;

Why is the root namespace special?  That at least needs a comment.
diff mbox

Patch

diff --git a/fs/sync.c b/fs/sync.c
index 6e0a2cbaf6de..654da445340a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -17,6 +17,7 @@ 
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
 #include "internal.h"
+#include "mount.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -68,16 +69,51 @@  int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL(sync_filesystem);
 
-static void sync_inodes_one_sb(struct super_block *sb, void *arg)
+struct sb_sync {
+	/*
+	 * Only sync superblocks reachable from this namespace. If NULL, sync
+	 * everything.
+	 */
+	struct mnt_namespace *mnt_ns;
+
+	/* ->sync_fs() wait argument. */
+	int wait;
+};
+
+static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns)
 {
-	if (!sb_rdonly(sb))
+	struct mount *mnt;
+	int ret = 0;
+
+	if (!mnt_ns)
+		return 1;
+
+	read_seqlock_excl(&mount_lock);
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->mnt_ns == mnt_ns) {
+			ret = 1;
+			break;
+		}
+	}
+	read_sequnlock_excl(&mount_lock);
+	return ret;
+}
+
+static void sync_inodes_one_sb(struct super_block *sb, void *p)
+{
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns))
 		sync_inodes_sb(sb);
 }
 
-static void sync_fs_one_sb(struct super_block *sb, void *arg)
+static void sync_fs_one_sb(struct super_block *sb, void *p)
 {
-	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, *(int *)arg);
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) &&
+	    sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, arg->wait);
 }
 
 static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
@@ -107,12 +143,18 @@  static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  */
 SYSCALL_DEFINE0(sync)
 {
-	int nowait = 0, wait = 1;
+	struct sb_sync arg = {
+		.mnt_ns = current->nsproxy->mnt_ns,
+	};
+
+	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
+		arg.mnt_ns = NULL;
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
-	iterate_supers(sync_fs_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &wait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
+	arg.wait = 1;
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	iterate_bdevs(fdatawait_one_bdev, NULL);
 	if (unlikely(laptop_mode))
@@ -122,17 +164,17 @@  SYSCALL_DEFINE0(sync)
 
 static void do_sync_work(struct work_struct *work)
 {
-	int nowait = 0;
+	struct sb_sync arg = {};
 
 	/*
 	 * Sync twice to reduce the possibility we skipped some inodes / pages
 	 * because they were temporarily locked
 	 */
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	printk("Emergency Sync complete\n");
 	kfree(work);