Message ID | eaa3bb7655ceb9c887c36d6f7d607d494eb33186.1517007925.git.osandov@fb.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Jan 26, 2018 at 03:06:31PM -0800, Omar Sandoval wrote: > From: Omar Sandoval <osandov@fb.com> > > Currently, the sync() syscall is system-wide, so any process in a > container can cause significant I/O stalls across the system by calling > sync(). This is even true for filesystems which are not accessible in > the process' mount namespace. This patch scopes sync() to only write out > filesystems reachable in the current mount namespace, except for the > initial mount namespace, which still syncs everything to avoid > surprises. This fixes the broken isolation we were seeing here. > > Signed-off-by: Omar Sandoval <osandov@fb.com> > --- > Grab mount_lock while iterating mounts. > > fs/sync.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++------------- > 1 file changed, 53 insertions(+), 14 deletions(-) > > diff --git a/fs/sync.c b/fs/sync.c > index 6e0a2cbaf6de..03d27b48b972 100644 > --- a/fs/sync.c > +++ b/fs/sync.c > @@ -17,6 +17,7 @@ > #include <linux/quotaops.h> > #include <linux/backing-dev.h> > #include "internal.h" > +#include "mount.h" > > #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ > SYNC_FILE_RANGE_WAIT_AFTER) > @@ -68,16 +69,48 @@ int sync_filesystem(struct super_block *sb) > } > EXPORT_SYMBOL(sync_filesystem); > > -static void sync_inodes_one_sb(struct super_block *sb, void *arg) > +struct sb_sync { > + /* > + * Only sync superblocks reachable from this namespace. If NULL, sync > + * everything. > + */ > + struct mnt_namespace *mnt_ns; > + > + /* ->sync_fs() wait argument. */ > + int wait; > +}; > + > +static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns) > +{ > + struct mount *mnt; > + > + if (!mnt_ns) > + return 1; > + > + read_seqlock_excl(&mount_lock); > + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { > + if (mnt->mnt_ns == mnt_ns) > + return 1; Aaand I continue to be an idiot... > + } > + read_sequnlock_excl(&mount_lock); > + return 0; > +} > + > +static void sync_inodes_one_sb(struct super_block *sb, void *p) > { > - if (!sb_rdonly(sb)) > + struct sb_sync *arg = p; > + > + if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns)) > sync_inodes_sb(sb); > } > > -static void sync_fs_one_sb(struct super_block *sb, void *arg) > +static void sync_fs_one_sb(struct super_block *sb, void *p) > { > - if (!sb_rdonly(sb) && sb->s_op->sync_fs) > - sb->s_op->sync_fs(sb, *(int *)arg); > + struct sb_sync *arg = p; > + > + if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) && > + sb->s_op->sync_fs) > + sb->s_op->sync_fs(sb, arg->wait); > } > > static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) > @@ -107,12 +140,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) > */ > SYSCALL_DEFINE0(sync) > { > - int nowait = 0, wait = 1; > + struct sb_sync arg = { > + .mnt_ns = current->nsproxy->mnt_ns, > + }; > + > + if (arg.mnt_ns == init_task.nsproxy->mnt_ns) > + arg.mnt_ns = NULL; > > wakeup_flusher_threads(WB_REASON_SYNC); > - iterate_supers(sync_inodes_one_sb, NULL); > - iterate_supers(sync_fs_one_sb, &nowait); > - iterate_supers(sync_fs_one_sb, &wait); > + iterate_supers(sync_inodes_one_sb, &arg); > + iterate_supers(sync_fs_one_sb, &arg); > + arg.wait = 1; > + iterate_supers(sync_fs_one_sb, &arg); > iterate_bdevs(fdatawrite_one_bdev, NULL); > iterate_bdevs(fdatawait_one_bdev, NULL); > if (unlikely(laptop_mode)) > @@ -122,17 +161,17 @@ SYSCALL_DEFINE0(sync) > > static void do_sync_work(struct work_struct *work) > { > - int nowait = 0; > + struct sb_sync arg = {}; > > /* > * Sync twice to reduce the possibility we skipped some inodes / pages > * because they were temporarily locked > */ > - iterate_supers(sync_inodes_one_sb, &nowait); > - iterate_supers(sync_fs_one_sb, &nowait); > + iterate_supers(sync_inodes_one_sb, &arg); > + iterate_supers(sync_fs_one_sb, &arg); > iterate_bdevs(fdatawrite_one_bdev, NULL); > - iterate_supers(sync_inodes_one_sb, &nowait); > - iterate_supers(sync_fs_one_sb, &nowait); > + iterate_supers(sync_inodes_one_sb, &arg); > + iterate_supers(sync_fs_one_sb, &arg); > iterate_bdevs(fdatawrite_one_bdev, NULL); > printk("Emergency Sync complete\n"); > kfree(work); > -- > 2.16.1 >
diff --git a/fs/sync.c b/fs/sync.c index 6e0a2cbaf6de..03d27b48b972 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -17,6 +17,7 @@ #include <linux/quotaops.h> #include <linux/backing-dev.h> #include "internal.h" +#include "mount.h" #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) @@ -68,16 +69,48 @@ int sync_filesystem(struct super_block *sb) } EXPORT_SYMBOL(sync_filesystem); -static void sync_inodes_one_sb(struct super_block *sb, void *arg) +struct sb_sync { + /* + * Only sync superblocks reachable from this namespace. If NULL, sync + * everything. + */ + struct mnt_namespace *mnt_ns; + + /* ->sync_fs() wait argument. */ + int wait; +}; + +static int sb_reachable(struct super_block *sb, struct mnt_namespace *mnt_ns) +{ + struct mount *mnt; + + if (!mnt_ns) + return 1; + + read_seqlock_excl(&mount_lock); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (mnt->mnt_ns == mnt_ns) + return 1; + } + read_sequnlock_excl(&mount_lock); + return 0; +} + +static void sync_inodes_one_sb(struct super_block *sb, void *p) { - if (!sb_rdonly(sb)) + struct sb_sync *arg = p; + + if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns)) sync_inodes_sb(sb); } -static void sync_fs_one_sb(struct super_block *sb, void *arg) +static void sync_fs_one_sb(struct super_block *sb, void *p) { - if (!sb_rdonly(sb) && sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, *(int *)arg); + struct sb_sync *arg = p; + + if (!sb_rdonly(sb) && sb_reachable(sb, arg->mnt_ns) && + sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, arg->wait); } static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) @@ -107,12 +140,18 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) */ SYSCALL_DEFINE0(sync) { - int nowait = 0, wait = 1; + struct sb_sync arg = { + .mnt_ns = current->nsproxy->mnt_ns, + }; + + if (arg.mnt_ns == init_task.nsproxy->mnt_ns) + arg.mnt_ns = NULL; wakeup_flusher_threads(WB_REASON_SYNC); - iterate_supers(sync_inodes_one_sb, NULL); - iterate_supers(sync_fs_one_sb, &nowait); - iterate_supers(sync_fs_one_sb, &wait); + iterate_supers(sync_inodes_one_sb, &arg); + iterate_supers(sync_fs_one_sb, &arg); + arg.wait = 1; + iterate_supers(sync_fs_one_sb, &arg); iterate_bdevs(fdatawrite_one_bdev, NULL); iterate_bdevs(fdatawait_one_bdev, NULL); if (unlikely(laptop_mode)) @@ -122,17 +161,17 @@ SYSCALL_DEFINE0(sync) static void do_sync_work(struct work_struct *work) { - int nowait = 0; + struct sb_sync arg = {}; /* * Sync twice to reduce the possibility we skipped some inodes / pages * because they were temporarily locked */ - iterate_supers(sync_inodes_one_sb, &nowait); - iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, &arg); + iterate_supers(sync_fs_one_sb, &arg); iterate_bdevs(fdatawrite_one_bdev, NULL); - iterate_supers(sync_inodes_one_sb, &nowait); - iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, &arg); + iterate_supers(sync_fs_one_sb, &arg); iterate_bdevs(fdatawrite_one_bdev, NULL); printk("Emergency Sync complete\n"); kfree(work);