| Message ID | 1429639543.7346.329.camel@edumazet-glaptop2.roam.corp.google.com (mailing list archive) |
|---|---|
| State | New, archived |
On Tue, Apr 21, 2015 at 11:05:43AM -0700, Eric Dumazet wrote:
> On Mon, 2015-04-20 at 13:49 -0700, Eric Dumazet wrote:
> > On Mon, 2015-04-20 at 10:15 -0700, Eric Dumazet wrote:
> > > On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:
> > >
> > > > Sorry for spam but I came up with another hack. :)
> > > >
> > > > The idea is that we can have a variable which would signify that the
> > > > given thread is playing with the fd table in fd_install (kind of a lock
> > > > embedded into task_struct). We would also have a flag in files struct
> > > > indicating that a thread would like to resize it.
> > > >
> > > > expand_fdtable would set the flag and iterate over all threads, waiting
> > > > for all of them to have the var set to 0.
> > >
> > > The opposite : you have to block them in some way and add a rcu_sched()
> > > or something.

What I described would block them, although it was a crappy approach
(iterating threads vs cpus). I was wondering if RCU could be abused for
this feature and apparently it can.

> Here is the patch I cooked here but not yet tested.
>
> In the following version :
>
> 1) I replaced the yield() hack by a proper wait queue.
>
> 2) I do not invoke synchronize_sched() for mono threaded programs.
>
> 3) I avoid multiple threads doing a resize and then only one wins the
> deal.

One could argue this last bit could be committed separately (a different
logical change).

As I read up about synchronize_sched and rcu_read_lock_sched, the code
should be correct.

Also see nits below.

> (copying/clearing big amount of memory only to discover another guy did
> the same is a big waste of resources)
>
> This seems to run properly on my hosts.
>
> Your comments/tests are most welcomed, thanks !
>
>  fs/file.c               | 41 +++++++++++++++++++++++++++++++++-----
>  include/linux/fdtable.h |  3 ++
>  2 files changed, 39 insertions(+), 5 deletions(-)
>
> diff --git a/fs/file.c b/fs/file.c
> index 93c5f89c248b..e0e113a56444 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -147,6 +147,9 @@ static int expand_fdtable(struct files_struct *files, int nr)
>
>  	spin_unlock(&files->file_lock);
>  	new_fdt = alloc_fdtable(nr);
> +	/* make sure no __fd_install() are still updating fdt */
> +	if (atomic_read(&files->count) > 1)
> +		synchronize_sched();
>  	spin_lock(&files->file_lock);
>  	if (!new_fdt)
>  		return -ENOMEM;
> @@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
>  		if (cur_fdt != &files->fdtab)
>  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
>  	} else {
> +		WARN_ON_ONCE(1);
>  		/* Somebody else expanded, so undo our attempt */
>  		__free_fdtable(new_fdt);

The reader may be left confused why there is a warning while the comment
does not indicate anything is wrong.

>  	}
> +	/* coupled with smp_rmb() in __fd_install() */
> +	smp_wmb();
>  	return 1;
>  }
>
> @@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, int nr)
>  static int expand_files(struct files_struct *files, int nr)
>  {
>  	struct fdtable *fdt;
> +	int expanded = 0;
>
> +begin:
>  	fdt = files_fdtable(files);
>
>  	/* Do we need to expand? */
>  	if (nr < fdt->max_fds)
> -		return 0;
> +		return expanded;
>
>  	/* Can we expand? */
>  	if (nr >= sysctl_nr_open)
>  		return -EMFILE;
>
> +	while (unlikely(files->resize_in_progress)) {
> +		spin_unlock(&files->file_lock);
> +		expanded = 1;
> +		wait_event(files->resize_wait, !files->resize_in_progress);
> +		spin_lock(&files->file_lock);
> +		goto begin;
> +	}

This does not loop anymore, so s/while/if/ ?

> +
>  	/* All good, so we try */
> -	return expand_fdtable(files, nr);
> +	files->resize_in_progress = true;
> +	expanded = expand_fdtable(files, nr);
> +	files->resize_in_progress = false;
> +	wake_up_all(&files->resize_wait);
> +	return expanded;
>  }
>
>  static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
> @@ -256,6 +276,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
>  	atomic_set(&newf->count, 1);
>
>  	spin_lock_init(&newf->file_lock);
> +	newf->resize_in_progress = 0;
> +	init_waitqueue_head(&newf->resize_wait);
>  	newf->next_fd = 0;
>  	new_fdt = &newf->fdtab;
>  	new_fdt->max_fds = NR_OPEN_DEFAULT;
> @@ -553,11 +575,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
>  		struct file *file)
>  {
>  	struct fdtable *fdt;
> -	spin_lock(&files->file_lock);
> -	fdt = files_fdtable(files);
> +
> +	rcu_read_lock_sched();
> +
> +	while (unlikely(files->resize_in_progress)) {
> +		rcu_read_unlock_sched();
> +		wait_event(files->resize_wait, !files->resize_in_progress);
> +		rcu_read_lock_sched();
> +	}
> +	/* coupled with smp_wmb() in expand_fdtable() */
> +	smp_rmb();
> +	fdt = READ_ONCE(files->fdt);
>  	BUG_ON(fdt->fd[fd] != NULL);
>  	rcu_assign_pointer(fdt->fd[fd], file);
> -	spin_unlock(&files->file_lock);
> +	rcu_read_unlock_sched();
>  }
>
>  void fd_install(unsigned int fd, struct file *file)
> diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> index 230f87bdf5ad..fbb88740634a 100644
> --- a/include/linux/fdtable.h
> +++ b/include/linux/fdtable.h
> @@ -47,6 +47,9 @@ struct files_struct {
>  	 * read mostly part
>  	 */
>  	atomic_t count;
> +	bool resize_in_progress;
> +	wait_queue_head_t resize_wait;
> +
>  	struct fdtable __rcu *fdt;
>  	struct fdtable fdtab;
>  	/*

--
Mateusz Guzik
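The flag-plus-grace-period handshake being reviewed above can be sketched in userspace with liburcu. Below is a minimal, single-threaded analogue under assumed names (struct table, install() and expand() are invented for illustration; only the structure mirrors the patch), built with gcc sketch.c -lurcu. The barrier the patch adds with smp_wmb()/smp_rmb() is noted in comments but omitted from the sketch for brevity:

#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <urcu.h>		/* default liburcu flavor */

struct table {
	int nslots;
	void **slot;
};

static struct table *tbl;		/* RCU-protected, like files->fdt */
static volatile bool resize_in_progress;

/* Analogue of __fd_install(): lock-free store into the current table. */
static void install(int idx, void *obj)
{
	struct table *t;

	rcu_read_lock();
	while (resize_in_progress) {	/* the patch sleeps on a waitqueue instead */
		rcu_read_unlock();
		sched_yield();
		rcu_read_lock();
	}
	/* the patch has smp_rmb() here, paired with smp_wmb() in the resizer */
	t = rcu_dereference(tbl);
	t->slot[idx] = obj;
	rcu_read_unlock();
}

/* Analogue of expand_fdtable(): set the flag, wait a grace period, copy, publish. */
static void expand(int nslots)
{
	struct table *old = tbl, *new;

	resize_in_progress = true;
	synchronize_rcu();		/* in-flight install() calls have drained;
					 * new ones see the flag and wait */
	new = malloc(sizeof(*new));
	new->nslots = nslots;
	new->slot = calloc(nslots, sizeof(void *));
	memcpy(new->slot, old->slot, old->nslots * sizeof(void *));
	rcu_assign_pointer(tbl, new);
	/* the patch has smp_wmb() here before clearing the flag */
	resize_in_progress = false;	/* the patch does wake_up_all() here */
	synchronize_rcu();		/* no reader still uses the old table */
	free(old->slot);
	free(old);
}

int main(void)
{
	int x, y;

	rcu_register_thread();
	tbl = calloc(1, sizeof(*tbl));
	tbl->nslots = 4;
	tbl->slot = calloc(4, sizeof(void *));

	install(1, &x);
	expand(8);
	install(5, &y);
	printf("slot1=%p slot5=%p\n", tbl->slot[1], tbl->slot[5]);
	rcu_unregister_thread();
	return 0;
}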
On Tue, Apr 21, 2015 at 10:06:24PM +0200, Mateusz Guzik wrote:
> On Tue, Apr 21, 2015 at 11:05:43AM -0700, Eric Dumazet wrote:
> > On Mon, 2015-04-20 at 13:49 -0700, Eric Dumazet wrote:
> > > On Mon, 2015-04-20 at 10:15 -0700, Eric Dumazet wrote:
> > > > On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:
> > > >
> > > > > Sorry for spam but I came up with another hack. :)
> > > > >
> > > > > The idea is that we can have a variable which would signify that the
> > > > > given thread is playing with the fd table in fd_install (kind of a lock
> > > > > embedded into task_struct). We would also have a flag in files struct
> > > > > indicating that a thread would like to resize it.
> > > > >
> > > > > expand_fdtable would set the flag and iterate over all threads, waiting
> > > > > for all of them to have the var set to 0.
> > > >
> > > > The opposite : you have to block them in some way and add a rcu_sched()
> > > > or something.
>
> What I described would block them, although it was a crappy approach
> (iterating threads vs cpus). I was wondering if RCU could be abused for
> this feature and apparently it can.
>
> > Here is the patch I cooked here but not yet tested.
> >
> > In the following version :
> >
> > 1) I replaced the yield() hack by a proper wait queue.
> >
> > 2) I do not invoke synchronize_sched() for mono threaded programs.
> >
> > 3) I avoid multiple threads doing a resize and then only one wins the
> > deal.
>
> One could argue this last bit could be committed separately (a different
> logical change).
>
> As I read up about synchronize_sched and rcu_read_lock_sched, the code
> should be correct.
>
> Also see nits below.

I'm utter crap with memory barriers, but I believe some are needed now:

in dup_fd:
	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f) {
			get_file(f);

at least a data dependency barrier, or maybe smp_rmb for peace of mind

similarly in do_dup2:
	tofree = fdt->fd[fd];
	if (!tofree && fd_is_open(fd, fdt))
		goto Ebusy;

> > (copying/clearing big amount of memory only to discover another guy did
> > the same is a big waste of resources)
> >
> > This seems to run properly on my hosts.
> >
> > Your comments/tests are most welcomed, thanks !
> >
> >  fs/file.c               | 41 +++++++++++++++++++++++++++++++++-----
> >  include/linux/fdtable.h |  3 ++
> >  2 files changed, 39 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/file.c b/fs/file.c
> > index 93c5f89c248b..e0e113a56444 100644
> > --- a/fs/file.c
> > +++ b/fs/file.c
> > @@ -147,6 +147,9 @@ static int expand_fdtable(struct files_struct *files, int nr)
> >
> >  	spin_unlock(&files->file_lock);
> >  	new_fdt = alloc_fdtable(nr);
> > +	/* make sure no __fd_install() are still updating fdt */
> > +	if (atomic_read(&files->count) > 1)
> > +		synchronize_sched();
> >  	spin_lock(&files->file_lock);
> >  	if (!new_fdt)
> >  		return -ENOMEM;
> > @@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
> >  		if (cur_fdt != &files->fdtab)
> >  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> >  	} else {
> > +		WARN_ON_ONCE(1);
> >  		/* Somebody else expanded, so undo our attempt */
> >  		__free_fdtable(new_fdt);
>
> The reader may be left confused why there is a warning while the comment
> does not indicate anything is wrong.
>
> >  	}
> > +	/* coupled with smp_rmb() in __fd_install() */
> > +	smp_wmb();
> >  	return 1;
> >  }
> >
> > @@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, int nr)
> >  static int expand_files(struct files_struct *files, int nr)
> >  {
> >  	struct fdtable *fdt;
> > +	int expanded = 0;
> >
> > +begin:
> >  	fdt = files_fdtable(files);
> >
> >  	/* Do we need to expand? */
> >  	if (nr < fdt->max_fds)
> > -		return 0;
> > +		return expanded;
> >
> >  	/* Can we expand? */
> >  	if (nr >= sysctl_nr_open)
> >  		return -EMFILE;
> >
> > +	while (unlikely(files->resize_in_progress)) {
> > +		spin_unlock(&files->file_lock);
> > +		expanded = 1;
> > +		wait_event(files->resize_wait, !files->resize_in_progress);
> > +		spin_lock(&files->file_lock);
> > +		goto begin;
> > +	}
>
> This does not loop anymore, so s/while/if/ ?
>
> > +
> >  	/* All good, so we try */
> > -	return expand_fdtable(files, nr);
> > +	files->resize_in_progress = true;
> > +	expanded = expand_fdtable(files, nr);
> > +	files->resize_in_progress = false;
> > +	wake_up_all(&files->resize_wait);
> > +	return expanded;
> >  }
> >
> >  static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
> > @@ -256,6 +276,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
> >  	atomic_set(&newf->count, 1);
> >
> >  	spin_lock_init(&newf->file_lock);
> > +	newf->resize_in_progress = 0;
> > +	init_waitqueue_head(&newf->resize_wait);
> >  	newf->next_fd = 0;
> >  	new_fdt = &newf->fdtab;
> >  	new_fdt->max_fds = NR_OPEN_DEFAULT;
> > @@ -553,11 +575,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
> >  		struct file *file)
> >  {
> >  	struct fdtable *fdt;
> > -	spin_lock(&files->file_lock);
> > -	fdt = files_fdtable(files);
> > +
> > +	rcu_read_lock_sched();
> > +
> > +	while (unlikely(files->resize_in_progress)) {
> > +		rcu_read_unlock_sched();
> > +		wait_event(files->resize_wait, !files->resize_in_progress);
> > +		rcu_read_lock_sched();
> > +	}
> > +	/* coupled with smp_wmb() in expand_fdtable() */
> > +	smp_rmb();
> > +	fdt = READ_ONCE(files->fdt);
> >  	BUG_ON(fdt->fd[fd] != NULL);
> >  	rcu_assign_pointer(fdt->fd[fd], file);
> > -	spin_unlock(&files->file_lock);
> > +	rcu_read_unlock_sched();
> >  }
> >
> >  void fd_install(unsigned int fd, struct file *file)
> > diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> > index 230f87bdf5ad..fbb88740634a 100644
> > --- a/include/linux/fdtable.h
> > +++ b/include/linux/fdtable.h
> > @@ -47,6 +47,9 @@ struct files_struct {
> >  	 * read mostly part
> >  	 */
> >  	atomic_t count;
> > +	bool resize_in_progress;
> > +	wait_queue_head_t resize_wait;
> > +
> >  	struct fdtable __rcu *fdt;
> >  	struct fdtable fdtab;
> >  	/*
>
> --
> Mateusz Guzik
On Tue, 2015-04-21 at 22:06 +0200, Mateusz Guzik wrote:
> On Tue, Apr 21, 2015 at 11:05:43AM -0700, Eric Dumazet wrote:

> > 3) I avoid multiple threads doing a resize and then only one wins the
> > deal.
>
> One could argue this last bit could be committed separately (a different
> logical change).

Not really. The prior code was fine, but the addition of
synchronize_sched() made the overhead much bigger in case multiple
threads do this at the same time.

> As I read up about synchronize_sched and rcu_read_lock_sched, the code
> should be correct.

> >  	spin_lock(&files->file_lock);
> >  	if (!new_fdt)
> >  		return -ENOMEM;
> > @@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
> >  		if (cur_fdt != &files->fdtab)
> >  			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> >  	} else {
> > +		WARN_ON_ONCE(1);
> >  		/* Somebody else expanded, so undo our attempt */
> >  		__free_fdtable(new_fdt);
>
> The reader may be left confused why there is a warning while the comment
> does not indicate anything is wrong.

My intent is to remove this code completely, but I left the
WARN_ON_ONCE() in for my tests, just to make sure my theory was right ;)

> >  	}
> > +	/* coupled with smp_rmb() in __fd_install() */
> > +	smp_wmb();
> >  	return 1;
> >  }
> >
> > @@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, int nr)
> >  static int expand_files(struct files_struct *files, int nr)
> >  {
> >  	struct fdtable *fdt;
> > +	int expanded = 0;
> >
> > +begin:
> >  	fdt = files_fdtable(files);
> >
> >  	/* Do we need to expand? */
> >  	if (nr < fdt->max_fds)
> > -		return 0;
> > +		return expanded;
> >
> >  	/* Can we expand? */
> >  	if (nr >= sysctl_nr_open)
> >  		return -EMFILE;
> >
> > +	while (unlikely(files->resize_in_progress)) {
> > +		spin_unlock(&files->file_lock);
> > +		expanded = 1;
> > +		wait_event(files->resize_wait, !files->resize_in_progress);
> > +		spin_lock(&files->file_lock);
> > +		goto begin;
> > +	}
>
> This does not loop anymore, so s/while/if/ ?

You are right, thanks !
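The smp_wmb()/smp_rmb() pair the patch comments refer to is the classic message-passing idiom: the stores that populate the new table must be visible before the store an installer tests, and the installer's loads must be ordered the same way. A compilable miniature with C11 fences standing in for the kernel macros (all names invented; note a release fence is somewhat stronger than smp_wmb(), which only orders stores):

#include <stdatomic.h>

static int payload;		/* stands for the newly built fdtable */
static _Atomic int flag;	/* stands for resize_in_progress going false */

void producer(void)		/* expand_fdtable() side */
{
	payload = 42;					/* build the table */
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */
	atomic_store_explicit(&flag, 1, memory_order_relaxed);
}

int consumer(void)		/* __fd_install() side */
{
	if (atomic_load_explicit(&flag, memory_order_relaxed)) {
		atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
		return payload;	/* guaranteed 42, not a stale value */
	}
	return -1;		/* not published yet */
}

int main(void)
{
	producer();
	return consumer();	/* single-threaded demo: returns 42 */
}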
On Tue, 2015-04-21 at 22:12 +0200, Mateusz Guzik wrote:

> in dup_fd:
> 	for (i = open_files; i != 0; i--) {
> 		struct file *f = *old_fds++;
> 		if (f) {
> 			get_file(f);
>

I see no new requirement here. f is either NULL or not.
Multi-threaded programs never had a guarantee dup_fd() would catch a
non-NULL pointer here.

> at least a data dependency barrier, or maybe smp_rmb for peace of mind
>
> similarly in do_dup2:
> 	tofree = fdt->fd[fd];
> 	if (!tofree && fd_is_open(fd, fdt))
> 		goto Ebusy;

Same here.
On Tue, Apr 21, 2015 at 02:06:53PM -0700, Eric Dumazet wrote:
> On Tue, 2015-04-21 at 22:12 +0200, Mateusz Guzik wrote:
>
> > in dup_fd:
> > 	for (i = open_files; i != 0; i--) {
> > 		struct file *f = *old_fds++;
> > 		if (f) {
> > 			get_file(f);
>
> I see no new requirement here. f is either NULL or not.
> Multi-threaded programs never had a guarantee dup_fd() would catch a
> non-NULL pointer here.

It's not about seeing a NULL f or not, but about using the right address
for the dereference.

If I read memory-barriers.txt right (see 'DATA DEPENDENCY BARRIERS'), it
is possible that cpus like alpha will see a non-NULL pointer and then
proceed to dereference *the old* (here: NULL) value.

Hence I suspect this needs smp_read_barrier_depends (along with
ACCESS_ONCE).

Other consumers (e.g. procfs code) use the rcu_dereference macro, which
ends up using the lockless_dereference macro, which in turn does:

#define lockless_dereference(p) \
({ \
	typeof(p) _________p1 = ACCESS_ONCE(p); \
	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
	(_________p1); \
})

That said, memory barriers are not exactly my strong suit, but I do
believe my suspicion here is justified enough to ask someone with solid
memory-barrier-fu to comment.

> > at least a data dependency barrier, or maybe smp_rmb for peace of mind
> >
> > similarly in do_dup2:
> > 	tofree = fdt->fd[fd];
> > 	if (!tofree && fd_is_open(fd, fdt))
> > 		goto Ebusy;
>
> Same here.
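For reference, the quoted macro is essentially what C11 later spelled memory_order_consume: a marked load plus dependency ordering for loads made through the returned pointer. A hedged userspace rendition of the publish/consume pair under discussion (names invented for illustration; compilers typically implement consume by promoting it to acquire):

#include <stdatomic.h>
#include <stddef.h>

struct obj {
	int val;
};

static struct obj *_Atomic gp;	/* stands for an fdt->fd[] slot */

void publish(struct obj *p)	/* like rcu_assign_pointer(fdt->fd[fd], file) */
{
	p->val = 42;		/* payload is initialised first */
	atomic_store_explicit(&gp, p, memory_order_release);
}

int consume(void)		/* like lockless_dereference(fdt->fd[fd]) */
{
	/*
	 * consume = ACCESS_ONCE() + smp_read_barrier_depends(): the
	 * dependent load p->val cannot observe pre-publication data,
	 * even on Alpha. A plain load has no such guarantee.
	 */
	struct obj *p = atomic_load_explicit(&gp, memory_order_consume);

	return p ? p->val : -1;
}

int main(void)
{
	static struct obj o;

	publish(&o);
	return consume();	/* 42 */
}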
On Wed, 2015-04-22 at 15:31 +0200, Mateusz Guzik wrote:
> On Tue, Apr 21, 2015 at 02:06:53PM -0700, Eric Dumazet wrote:
> > On Tue, 2015-04-21 at 22:12 +0200, Mateusz Guzik wrote:
> >
> > > in dup_fd:
> > > 	for (i = open_files; i != 0; i--) {
> > > 		struct file *f = *old_fds++;
> > > 		if (f) {
> > > 			get_file(f);
> >
> > I see no new requirement here. f is either NULL or not.
> > Multi-threaded programs never had a guarantee dup_fd() would catch a
> > non-NULL pointer here.
>
> It's not about seeing a NULL f or not, but about using the right address
> for the dereference.
>
> If I read memory-barriers.txt right (see 'DATA DEPENDENCY BARRIERS'), it
> is possible that cpus like alpha will see a non-NULL pointer and then
> proceed to dereference *the old* (here: NULL) value.
>
> Hence I suspect this needs smp_read_barrier_depends (along with
> ACCESS_ONCE).
>
> Other consumers (e.g. procfs code) use the rcu_dereference macro, which
> ends up using the lockless_dereference macro, which in turn does:
>
> #define lockless_dereference(p) \
> ({ \
> 	typeof(p) _________p1 = ACCESS_ONCE(p); \
> 	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
> 	(_________p1); \
> })
>
> That said, memory barriers are not exactly my strong suit, but I do
> believe my suspicion here is justified enough to ask someone with solid
> memory-barrier-fu to comment.

Again, your comment has nothing to do with the patch.

If there is old data, it can only be NULL. And that is fine, the case
was _already_ handled.

It can not be an 'old' file pointer, because close() takes the spinlock.

spin_unlock() contains a write memory barrier, so the NULL pointer put
by close() would have been committed to memory.

This also works on alpha cpus.
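Eric's argument, restated as a sketch (illustrative names, a pthread mutex standing in for files->file_lock): the only transition that could hand a reader a dangling pointer, file -> NULL -> freed, happens entirely under the lock, and the unlock's release semantics publish the NULL store, so an unlocked reader can only observe NULL or a fully constructed file:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

static pthread_mutex_t file_lock = PTHREAD_MUTEX_INITIALIZER;
static void *_Atomic slot;	/* stands for fdt->fd[fd] */

void close_fd(void *f)		/* close(): file -> NULL, always locked */
{
	pthread_mutex_lock(&file_lock);
	atomic_store_explicit(&slot, NULL, memory_order_relaxed);
	pthread_mutex_unlock(&file_lock);	/* release: the NULL is published */
	/* fput(f) would go here */
}

void *observe(void)		/* what dup_fd()/do_dup2() can see */
{
	/*
	 * Either NULL (slot empty, or just closed) or a pointer that
	 * the installer published after fully constructing the file.
	 * A stale, already-freed file pointer is not possible.
	 */
	return atomic_load_explicit(&slot, memory_order_consume);
}

int main(void)
{
	int file;

	/* fd_install() analogue: release-publish a constructed object */
	atomic_store_explicit(&slot, &file, memory_order_release);
	close_fd(&file);
	return observe() == NULL ? 0 : 1;
}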
diff --git a/fs/file.c b/fs/file.c
index 93c5f89c248b..e0e113a56444 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,9 @@ static int expand_fdtable(struct files_struct *files, int nr)
 
 	spin_unlock(&files->file_lock);
 	new_fdt = alloc_fdtable(nr);
+	/* make sure no __fd_install() are still updating fdt */
+	if (atomic_read(&files->count) > 1)
+		synchronize_sched();
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
 		return -ENOMEM;
@@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, int nr)
 		if (cur_fdt != &files->fdtab)
 			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
 	} else {
+		WARN_ON_ONCE(1);
 		/* Somebody else expanded, so undo our attempt */
 		__free_fdtable(new_fdt);
 	}
+	/* coupled with smp_rmb() in __fd_install() */
+	smp_wmb();
 	return 1;
 }
 
@@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, int nr)
 static int expand_files(struct files_struct *files, int nr)
 {
 	struct fdtable *fdt;
+	int expanded = 0;
 
+begin:
 	fdt = files_fdtable(files);
 
 	/* Do we need to expand? */
 	if (nr < fdt->max_fds)
-		return 0;
+		return expanded;
 
 	/* Can we expand? */
 	if (nr >= sysctl_nr_open)
 		return -EMFILE;
 
+	while (unlikely(files->resize_in_progress)) {
+		spin_unlock(&files->file_lock);
+		expanded = 1;
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		spin_lock(&files->file_lock);
+		goto begin;
+	}
+
 	/* All good, so we try */
-	return expand_fdtable(files, nr);
+	files->resize_in_progress = true;
+	expanded = expand_fdtable(files, nr);
+	files->resize_in_progress = false;
+	wake_up_all(&files->resize_wait);
+	return expanded;
 }
 
 static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +276,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->resize_in_progress = 0;
+	init_waitqueue_head(&newf->resize_wait);
 	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +575,20 @@ void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
 	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
+
+	rcu_read_lock_sched();
+
+	while (unlikely(files->resize_in_progress)) {
+		rcu_read_unlock_sched();
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		rcu_read_lock_sched();
+	}
+	/* coupled with smp_wmb() in expand_fdtable() */
+	smp_rmb();
+	fdt = READ_ONCE(files->fdt);
 	BUG_ON(fdt->fd[fd] != NULL);
 	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock_sched();
 }
 
 void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87bdf5ad..fbb88740634a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -47,6 +47,9 @@ struct files_struct {
 	 * read mostly part
 	 */
 	atomic_t count;
+	bool resize_in_progress;
+	wait_queue_head_t resize_wait;
+
 	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
 	/*
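For context on why __fd_install() is worth making lock-free: it sits on the return path of every open(), socket(), pipe() and accept(). A typical in-kernel caller of the era looks like this (a sketch, not part of the patch; demo_fops and demo_create_fd() are assumed names, while the API calls are the standard ones):

#include <linux/anon_inodes.h>
#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>

static const struct file_operations demo_fops;	/* assumed, defined elsewhere */

static int demo_create_fd(void)
{
	struct file *file;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);	/* reserve a slot number */
	if (fd < 0)
		return fd;

	file = anon_inode_getfile("demo", &demo_fops, NULL, O_RDWR);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		return PTR_ERR(file);
	}

	fd_install(fd, file);	/* the store the patch makes lock-free */
	return fd;
}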