Message ID | 1429678768.18561.64.camel@edumazet-glaptop2.roam.corp.google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Apr 21, 2015 at 09:59:28PM -0700, Eric Dumazet wrote: > From: Eric Dumazet <edumazet@google.com> > > Mateusz Guzik reported : > > Currently obtaining a new file descriptor results in locking fdtable > twice - once in order to reserve a slot and second time to fill it. > > Holding the spinlock in __fd_install() is needed in case a resize is > done, or to prevent a resize. > > Mateusz provided an RFC patch and a micro benchmark : > http://people.redhat.com/~mguzik/pipebench.c > > A resize is an unlikely operation in a process lifetime, > as table size is at least doubled at every resize. > > We can use RCU instead of the spinlock : > > __fd_install() must wait if a resize is in progress. > > The resize must block new __fd_install() callers from starting, > and wait that ongoing install are finished (synchronize_sched()) > > resize should be attempted by a single thread to not waste resources. > > rcu_sched variant is used, as __fd_install() and expand_fdtable() run > from process context. > > It gives us a ~30% speedup using pipebench with 16 threads, and a ~10% > speedup with another program using TCP sockets. > > Signed-off-by: Eric Dumazet <edumazet@google.com> > Reported-by: Mateusz Guzik <mguzik@redhat.com> Reviewed-by: Mateusz Guzik <mguzik@redhat.com> Tested-by: Mateusz Guzik <mguzik@redhat.com> Thanks, > --- > fs/file.c | 66 +++++++++++++++++++++++++++----------- > include/linux/fdtable.h | 3 + > 2 files changed, 50 insertions(+), 19 deletions(-) > > diff --git a/fs/file.c b/fs/file.c > index 93c5f89c248b..17cef5e52f16 100644 > --- a/fs/file.c > +++ b/fs/file.c > @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr) > > spin_unlock(&files->file_lock); > new_fdt = alloc_fdtable(nr); > + > + /* make sure all __fd_install() have seen resize_in_progress > + * or have finished their rcu_read_lock_sched() section. > + */ > + if (atomic_read(&files->count) > 1) > + synchronize_sched(); > + > spin_lock(&files->file_lock); > if (!new_fdt) > return -ENOMEM; > @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr) > __free_fdtable(new_fdt); > return -EMFILE; > } > - /* > - * Check again since another task may have expanded the fd table while > - * we dropped the lock > - */ > cur_fdt = files_fdtable(files); > - if (nr >= cur_fdt->max_fds) { > - /* Continue as planned */ > - copy_fdtable(new_fdt, cur_fdt); > - rcu_assign_pointer(files->fdt, new_fdt); > - if (cur_fdt != &files->fdtab) > - call_rcu(&cur_fdt->rcu, free_fdtable_rcu); > - } else { > - /* Somebody else expanded, so undo our attempt */ > - __free_fdtable(new_fdt); > - } > + BUG_ON(nr < cur_fdt->max_fds); > + copy_fdtable(new_fdt, cur_fdt); > + rcu_assign_pointer(files->fdt, new_fdt); > + if (cur_fdt != &files->fdtab) > + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); > + /* coupled with smp_rmb() in __fd_install() */ > + smp_wmb(); > return 1; > } > > @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr) > * The files->file_lock should be held on entry, and will be held on exit. > */ > static int expand_files(struct files_struct *files, int nr) > + __releases(files->file_lock) > + __acquires(files->file_lock) > { > struct fdtable *fdt; > + int expanded = 0; > > +repeat: > fdt = files_fdtable(files); > > /* Do we need to expand? */ > if (nr < fdt->max_fds) > - return 0; > + return expanded; > > /* Can we expand? */ > if (nr >= sysctl_nr_open) > return -EMFILE; > > + if (unlikely(files->resize_in_progress)) { > + spin_unlock(&files->file_lock); > + expanded = 1; > + wait_event(files->resize_wait, !files->resize_in_progress); > + spin_lock(&files->file_lock); > + goto repeat; > + } > + > /* All good, so we try */ > - return expand_fdtable(files, nr); > + files->resize_in_progress = true; > + expanded = expand_fdtable(files, nr); > + files->resize_in_progress = false; > + > + wake_up_all(&files->resize_wait); > + return expanded; > } > > static inline void __set_close_on_exec(int fd, struct fdtable *fdt) > @@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) > atomic_set(&newf->count, 1); > > spin_lock_init(&newf->file_lock); > + newf->resize_in_progress = false; > + init_waitqueue_head(&newf->resize_wait); > newf->next_fd = 0; > new_fdt = &newf->fdtab; > new_fdt->max_fds = NR_OPEN_DEFAULT; > @@ -553,11 +572,20 @@ void __fd_install(struct files_struct *files, unsigned int fd, > struct file *file) > { > struct fdtable *fdt; > - spin_lock(&files->file_lock); > - fdt = files_fdtable(files); > + > + rcu_read_lock_sched(); > + > + while (unlikely(files->resize_in_progress)) { > + rcu_read_unlock_sched(); > + wait_event(files->resize_wait, !files->resize_in_progress); > + rcu_read_lock_sched(); > + } > + /* coupled with smp_wmb() in expand_fdtable() */ > + smp_rmb(); > + fdt = READ_ONCE(files->fdt); > BUG_ON(fdt->fd[fd] != NULL); > rcu_assign_pointer(fdt->fd[fd], file); > - spin_unlock(&files->file_lock); > + rcu_read_unlock_sched(); > } > > void fd_install(unsigned int fd, struct file *file) > diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h > index 230f87bdf5ad..fbb88740634a 100644 > --- a/include/linux/fdtable.h > +++ b/include/linux/fdtable.h > @@ -47,6 +47,9 @@ struct files_struct { > * read mostly part > */ > atomic_t count; > + bool resize_in_progress; > + wait_queue_head_t resize_wait; > + > struct fdtable __rcu *fdt; > struct fdtable fdtab; > /* > >
On Mon, 2015-04-27 at 21:05 +0200, Mateusz Guzik wrote: > On Tue, Apr 21, 2015 at 09:59:28PM -0700, Eric Dumazet wrote: > > From: Eric Dumazet <edumazet@google.com> > > > > Mateusz Guzik reported : > > > > Currently obtaining a new file descriptor results in locking fdtable > > twice - once in order to reserve a slot and second time to fill it. > > > > Holding the spinlock in __fd_install() is needed in case a resize is > > done, or to prevent a resize. > > > > Mateusz provided an RFC patch and a micro benchmark : > > http://people.redhat.com/~mguzik/pipebench.c > > > > A resize is an unlikely operation in a process lifetime, > > as table size is at least doubled at every resize. > > > > We can use RCU instead of the spinlock : > > > > __fd_install() must wait if a resize is in progress. > > > > The resize must block new __fd_install() callers from starting, > > and wait that ongoing install are finished (synchronize_sched()) > > > > resize should be attempted by a single thread to not waste resources. > > > > rcu_sched variant is used, as __fd_install() and expand_fdtable() run > > from process context. > > > > It gives us a ~30% speedup using pipebench with 16 threads, and a ~10% > > speedup with another program using TCP sockets. > > > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > Reported-by: Mateusz Guzik <mguzik@redhat.com> > > Reviewed-by: Mateusz Guzik <mguzik@redhat.com> > Tested-by: Mateusz Guzik <mguzik@redhat.com> > Thanks Mateusz I'll send a v2, replacing the READ_ONCE() by more appropriate rcu_dereference_sched() : > > new_fdt->max_fds = NR_OPEN_DEFAULT; > > @@ -553,11 +572,20 @@ void __fd_install(struct files_struct *files, unsigned int fd, > > struct file *file) > > { > > struct fdtable *fdt; > > - spin_lock(&files->file_lock); > > - fdt = files_fdtable(files); > > + > > + rcu_read_lock_sched(); > > + > > + while (unlikely(files->resize_in_progress)) { > > + rcu_read_unlock_sched(); > > + wait_event(files->resize_wait, !files->resize_in_progress); > > + rcu_read_lock_sched(); > > + } > > + /* coupled with smp_wmb() in expand_fdtable() */ > > + smp_rmb(); > > + fdt = READ_ONCE(files->fdt); This should be : fdt = rcu_dereference_sched(files->fdt); > > BUG_ON(fdt->fd[fd] != NULL); > > rcu_assign_pointer(fdt->fd[fd], file); > > - spin_unlock(&files->file_lock); > > + rcu_read_unlock_sched(); > > } > > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/file.c b/fs/file.c index 93c5f89c248b..17cef5e52f16 100644 --- a/fs/file.c +++ b/fs/file.c @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr) spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); + + /* make sure all __fd_install() have seen resize_in_progress + * or have finished their rcu_read_lock_sched() section. + */ + if (atomic_read(&files->count) > 1) + synchronize_sched(); + spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr) __free_fdtable(new_fdt); return -EMFILE; } - /* - * Check again since another task may have expanded the fd table while - * we dropped the lock - */ cur_fdt = files_fdtable(files); - if (nr >= cur_fdt->max_fds) { - /* Continue as planned */ - copy_fdtable(new_fdt, cur_fdt); - rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt != &files->fdtab) - call_rcu(&cur_fdt->rcu, free_fdtable_rcu); - } else { - /* Somebody else expanded, so undo our attempt */ - __free_fdtable(new_fdt); - } + BUG_ON(nr < cur_fdt->max_fds); + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt != &files->fdtab) + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); + /* coupled with smp_rmb() in __fd_install() */ + smp_wmb(); return 1; } @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr) * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) { struct fdtable *fdt; + int expanded = 0; +repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return 0; + return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; + if (unlikely(files->resize_in_progress)) { + spin_unlock(&files->file_lock); + expanded = 1; + wait_event(files->resize_wait, !files->resize_in_progress); + spin_lock(&files->file_lock); + goto repeat; + } + /* All good, so we try */ - return expand_fdtable(files, nr); + files->resize_in_progress = true; + expanded = expand_fdtable(files, nr); + files->resize_in_progress = false; + + wake_up_all(&files->resize_wait); + return expanded; } static inline void __set_close_on_exec(int fd, struct fdtable *fdt) @@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->resize_in_progress = false; + init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; @@ -553,11 +572,20 @@ void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); + + rcu_read_lock_sched(); + + while (unlikely(files->resize_in_progress)) { + rcu_read_unlock_sched(); + wait_event(files->resize_wait, !files->resize_in_progress); + rcu_read_lock_sched(); + } + /* coupled with smp_wmb() in expand_fdtable() */ + smp_rmb(); + fdt = READ_ONCE(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + rcu_read_unlock_sched(); } void fd_install(unsigned int fd, struct file *file) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 230f87bdf5ad..fbb88740634a 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -47,6 +47,9 @@ struct files_struct { * read mostly part */ atomic_t count; + bool resize_in_progress; + wait_queue_head_t resize_wait; + struct fdtable __rcu *fdt; struct fdtable fdtab; /*