diff mbox

[v2] fs: only sync() superblocks reachable from the current namespace

Message ID eaa3bb7655ceb9c887c36d6f7d607d494eb33186.1517007925.git.osandov@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Omar Sandoval Jan. 26, 2018, 11:06 p.m. UTC
From: Omar Sandoval <osandov@fb.com>

Currently, the sync() syscall is system-wide, so any process in a
container can cause significant I/O stalls across the system by calling
sync(). This is even true for filesystems which are not accessible in
the process' mount namespace. This patch scopes sync() to only write out
filesystems reachable in the current mount namespace, except for the
initial mount namespace, which still syncs everything to avoid
surprises. This fixes the broken isolation we were seeing here.

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
Grab mount_lock while iterating mounts.

 fs/sync.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 14 deletions(-)

Comments

Omar Sandoval Jan. 26, 2018, 11:07 p.m. UTC | #1
On Fri, Jan 26, 2018 at 03:06:31PM -0800, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Currently, the sync() syscall is system-wide, so any process in a
> container can cause significant I/O stalls across the system by calling
> sync(). This is even true for filesystems which are not accessible in
> the process' mount namespace. This patch scopes sync() to only write out
> filesystems reachable in the current mount namespace, except for the
> initial mount namespace, which still syncs everything to avoid
> surprises. This fixes the broken isolation we were seeing here.
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
> Grab mount_lock while iterating mounts.
> 
>  fs/sync.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 53 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/sync.c b/fs/sync.c
> index 6e0a2cbaf6de..03d27b48b972 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -17,6 +17,7 @@
>  #include <linux/quotaops.h>
>  #include <linux/backing-dev.h>
>  #include "internal.h"
> +#include "mount.h"
>  
>  #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
>  			SYNC_FILE_RANGE_WAIT_AFTER)
> @@ -68,16 +69,48 @@ int sync_filesystem(struct super_block *sb)
>  }
>  EXPORT_SYMBOL(sync_filesystem);
>  
> -static void sync_inodes_one_sb(struct super_block *sb, void *arg)
> +struct sb_sync {
> +	/*
> +	 * Only sync superblocks reachable from this namespace. If NULL, sync
> +	 * everything.
> +	 */
> +	struct mnt_namespace *mnt_ns;
> +
> +	/* ->sync_fs() wait argument. */
> +	int wait;
> +};
> +
> +static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns)
> +{
> +	struct mount *mnt;
> +
> +	if (!mnt_ns)
> +		return 1;
> +
> +	read_seqlock_excl(&mount_lock);
> +	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
> +		if (mnt->mnt_ns == mnt_ns)
> +			return 1;

Aaand I continue to be an idiot...

> +	}
> +	read_sequnlock_excl(&mount_lock);
> +	return 0;
> +}
> +
> +static void sync_inodes_one_sb(struct super_block *sb, void *p)
>  {
> -	if (!sb_rdonly(sb))
> +	struct sb_sync *arg = p;
> +
> +	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns))
>  		sync_inodes_sb(sb);
>  }
>  
> -static void sync_fs_one_sb(struct super_block *sb, void *arg)
> +static void sync_fs_one_sb(struct super_block *sb, void *p)
>  {
> -	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
> -		sb->s_op->sync_fs(sb, *(int *)arg);
> +	struct sb_sync *arg = p;
> +
> +	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) &&
> +	    sb->s_op->sync_fs)
> +		sb->s_op->sync_fs(sb, arg->wait);
>  }
>  
>  static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
> @@ -107,12 +140,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
>   */
>  SYSCALL_DEFINE0(sync)
>  {
> -	int nowait = 0, wait = 1;
> +	struct sb_sync arg = {
> +		.mnt_ns = current->nsproxy->mnt_ns,
> +	};
> +
> +	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
> +		arg.mnt_ns = NULL;
>  
>  	wakeup_flusher_threads(WB_REASON_SYNC);
> -	iterate_supers(sync_inodes_one_sb, NULL);
> -	iterate_supers(sync_fs_one_sb, &nowait);
> -	iterate_supers(sync_fs_one_sb, &wait);
> +	iterate_supers(sync_inodes_one_sb, &arg);
> +	iterate_supers(sync_fs_one_sb, &arg);
> +	arg.wait = 1;
> +	iterate_supers(sync_fs_one_sb, &arg);
>  	iterate_bdevs(fdatawrite_one_bdev, NULL);
>  	iterate_bdevs(fdatawait_one_bdev, NULL);
>  	if (unlikely(laptop_mode))
> @@ -122,17 +161,17 @@ SYSCALL_DEFINE0(sync)
>  
>  static void do_sync_work(struct work_struct *work)
>  {
> -	int nowait = 0;
> +	struct sb_sync arg = {};
>  
>  	/*
>  	 * Sync twice to reduce the possibility we skipped some inodes / pages
>  	 * because they were temporarily locked
>  	 */
> -	iterate_supers(sync_inodes_one_sb, &nowait);
> -	iterate_supers(sync_fs_one_sb, &nowait);
> +	iterate_supers(sync_inodes_one_sb, &arg);
> +	iterate_supers(sync_fs_one_sb, &arg);
>  	iterate_bdevs(fdatawrite_one_bdev, NULL);
> -	iterate_supers(sync_inodes_one_sb, &nowait);
> -	iterate_supers(sync_fs_one_sb, &nowait);
> +	iterate_supers(sync_inodes_one_sb, &arg);
> +	iterate_supers(sync_fs_one_sb, &arg);
>  	iterate_bdevs(fdatawrite_one_bdev, NULL);
>  	printk("Emergency Sync complete\n");
>  	kfree(work);
> -- 
> 2.16.1
>
diff mbox

Patch

diff --git a/fs/sync.c b/fs/sync.c
index 6e0a2cbaf6de..03d27b48b972 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -17,6 +17,7 @@ 
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
 #include "internal.h"
+#include "mount.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -68,16 +69,48 @@  int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL(sync_filesystem);
 
-static void sync_inodes_one_sb(struct super_block *sb, void *arg)
+struct sb_sync {
+	/*
+	 * Only sync superblocks reachable from this namespace. If NULL, sync
+	 * everything.
+	 */
+	struct mnt_namespace *mnt_ns;
+
+	/* ->sync_fs() wait argument. */
+	int wait;
+};
+
+static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns)
+{
+	struct mount *mnt;
+
+	if (!mnt_ns)
+		return 1;
+
+	read_seqlock_excl(&mount_lock);
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->mnt_ns == mnt_ns)
+			return 1;
+	}
+	read_sequnlock_excl(&mount_lock);
+	return 0;
+}
+
+static void sync_inodes_one_sb(struct super_block *sb, void *p)
 {
-	if (!sb_rdonly(sb))
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns))
 		sync_inodes_sb(sb);
 }
 
-static void sync_fs_one_sb(struct super_block *sb, void *arg)
+static void sync_fs_one_sb(struct super_block *sb, void *p)
 {
-	if (!sb_rdonly(sb) && sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, *(int *)arg);
+	struct sb_sync *arg = p;
+
+	if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) &&
+	    sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, arg->wait);
 }
 
 static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
@@ -107,12 +140,18 @@  static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  */
 SYSCALL_DEFINE0(sync)
 {
-	int nowait = 0, wait = 1;
+	struct sb_sync arg = {
+		.mnt_ns = current->nsproxy->mnt_ns,
+	};
+
+	if (arg.mnt_ns == init_task.nsproxy->mnt_ns)
+		arg.mnt_ns = NULL;
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
-	iterate_supers(sync_fs_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &wait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
+	arg.wait = 1;
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	iterate_bdevs(fdatawait_one_bdev, NULL);
 	if (unlikely(laptop_mode))
@@ -122,17 +161,17 @@  SYSCALL_DEFINE0(sync)
 
 static void do_sync_work(struct work_struct *work)
 {
-	int nowait = 0;
+	struct sb_sync arg = {};
 
 	/*
 	 * Sync twice to reduce the possibility we skipped some inodes / pages
 	 * because they were temporarily locked
 	 */
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &arg);
+	iterate_supers(sync_fs_one_sb, &arg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	printk("Emergency Sync complete\n");
 	kfree(work);