Message ID | 152292939368.19745.13784475656016424647.stgit@localhost.localdomain (mailing list archive)
---|---
State | New, archived
Hi, almost two weeks have passed, and there has been no reaction. Jeff, Bruce, what is your point of view?

Just to underline: the problem comes from rwlock fairness, which does not allow a reader to take a read-locked lock once a writer has already called write_lock(). See queued_read_lock_slowpath() for the details. This fairness prevents readers from occupying the lock forever and leaving writers no chance to take it.

Kirill

On 05.04.2018 14:58, Kirill Tkhai wrote:
> I observed the following deadlock:
>
> [task 1]                           [task 2]                     [task 3]
> kill_fasync()                      mm_update_next_owner()       copy_process()
>  spin_lock_irqsave(&fa->fa_lock)    read_lock(&tasklist_lock)    write_lock_irq(&tasklist_lock)
>  send_sigio()                       <IRQ>                        ...
>   read_lock(&fown->lock)             kill_fasync()               ...
>   read_lock(&tasklist_lock)           spin_lock_irqsave(&fa->fa_lock) ...
>
> Task 1 can't acquire the read-locked tasklist_lock, since task 3 has
> already expressed its wish to take the lock exclusively. Task 2 holds
> the lock for reading, but it can't take the spinlock.
> [...]
On Thu, 2018-04-05 at 14:58 +0300, Kirill Tkhai wrote:
> I observed the following deadlock:
> [...]
> The patch converts fa_lock into an rwlock_t, which fixes the two
> deadlocks above, as a read-locked rwlock may be taken again from an
> interrupt handler by qrwlock design.
> [...]
> @@ -981,14 +981,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
>  {
>  	while (fa) {
>  		struct fown_struct *fown;
> -		unsigned long flags;
>
>  		if (fa->magic != FASYNC_MAGIC) {
>  			printk(KERN_ERR "kill_fasync: bad magic number in "
>  			       "fasync_struct!\n");
>  			return;
>  		}
> -		spin_lock_irqsave(&fa->fa_lock, flags);
> +		read_lock(&fa->fa_lock);

Does this need to be read_lock_irq? Why is it ok to allow interrupts here?

> [...]

I've no objection to the patch in principle, but I'm not as familiar with the fasync code as others here.
Hi, Jeff,

On 17.04.2018 14:42, Jeff Layton wrote:
> On Thu, 2018-04-05 at 14:58 +0300, Kirill Tkhai wrote:
>> [...]
>> -		spin_lock_irqsave(&fa->fa_lock, flags);
>> +		read_lock(&fa->fa_lock);
>
> Does this need to be read_lock_irq? Why is it ok to allow interrupts
> here?

A read-locked rwlock can be taken for reading from an IRQ once again, even if there is a writer pending, while a spinlock can't:

void queued_read_lock_slowpath(struct qrwlock *lock)
{
	/*
	 * Readers come here when they cannot get the lock without waiting
	 */
	if (unlikely(in_interrupt())) {
		/*
		 * Readers in interrupt context will get the lock immediately
		 * if the writer is just waiting (not holding the lock yet),
		 * so spin with ACQUIRE semantics until the lock is available
		 * without waiting in the queue.
		 */
		atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
		return;
	}

So, when we replace the spinlock with read_lock(), we don't need to disable IRQs anymore. All we need is to make write_lock() always disable IRQs.

> [...]
> I've no objection to the patch in principle, but I'm not as familiar
> with the fasync code as others here.

I took the reviewers list from MAINTAINERS and ./scripts/get_maintainer.pl, and don't have any ideas who else should be CCed.

Thanks,
Kirill
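To make the resulting rule concrete, here is a minimal sketch of the locking discipline the patch establishes for fa_lock (illustrative kernel-style C; the helper names are made up, this is not code from the patch): every writer disables IRQs, while readers may run in IRQ context without disabling IRQs, because an in-interrupt reader is allowed to overtake a queued writer.

	/* Sketch of the fa_lock discipline (illustration; helpers hypothetical). */
	static void fa_set_file(struct fasync_struct *fa, struct file *filp)
	{
		write_lock_irq(&fa->fa_lock);	/* writers: exclusive, IRQs off */
		fa->fa_file = filp;
		write_unlock_irq(&fa->fa_lock);
	}

	/* May run in IRQ context: no irqsave needed, because an in_interrupt()
	 * reader bypasses a pending writer in queued_read_lock_slowpath(). */
	static void fa_use_file(struct fasync_struct *fa)
	{
		read_lock(&fa->fa_lock);
		if (fa->fa_file) {
			/* fa->fa_file and fa->fa_fd are stable here */
		}
		read_unlock(&fa->fa_lock);
	}

Disabling IRQs on the write side is what makes the plain read_lock() safe: a writer can never be interrupted on its own CPU while holding the lock, so an in-interrupt reader only ever spins on a writer running on another CPU.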
On Tue, 2018-04-17 at 14:53 +0300, Kirill Tkhai wrote:
> Hi, Jeff,
>
> On 17.04.2018 14:42, Jeff Layton wrote:
> > [...]
> > Does this need to be read_lock_irq? Why is it ok to allow interrupts
> > here?
>
> A read-locked rwlock can be taken for reading from an IRQ once again,
> even if there is a writer pending, while a spinlock can't:
> [...]
> So, when we replace the spinlock with read_lock(), we don't need to
> disable IRQs anymore. All we need is to make write_lock() always
> disable IRQs.

Got it, thanks.

read_lock_irq is still used in several (rather obscure) places. Does this mean that we should do a global s/read_lock_irq/read_lock/ and remove it? Or is it still useful to disable irqs for some read_lock acquisitions?

> [...]
> I took the reviewers list from MAINTAINERS and ./scripts/get_maintainer.pl,
> and don't have any ideas who else should be CCed.

No worries. The patch seems sane enough to me. You can add:

Acked-by: Jeff Layton <jlayton@kernel.org>
On 17.04.2018 16:31, Jeff Layton wrote:
> On Tue, 2018-04-17 at 14:53 +0300, Kirill Tkhai wrote:
>> [...]
>> So, when we replace the spinlock with read_lock(), we don't need to
>> disable IRQs anymore. All we need is to make write_lock() always
>> disable IRQs.
>
> Got it, thanks.
>
> read_lock_irq is still used in several (rather obscure) places. Does
> this mean that we should do a global s/read_lock_irq/read_lock/ and
> remove it? Or is it still useful to disable irqs for some read_lock
> acquisitions?

I haven't analyzed them, but it seems possible to construct a situation where an rwlock nests with an exclusive lock and so requires IRQs to be disabled. Look at the fasync example: the deadlock was also present in https://lkml.org/lkml/2018/4/5/125, and we could have fixed it another way, by disabling IRQs around read_lock(). But in the fasync case we are lucky, since the exclusive lock is not needed, and we could simply replace the spinlock with an rwlock.

If the remaining places nest read_lock() with a spinlock, they still need to disable IRQs.
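As an illustration of that remaining case (a hypothetical pair of locks, not taken from any real call site): if the read-side critical section itself nests a spinlock that an interrupt handler also takes, the reader must still keep IRQs disabled for the whole section.

	static DEFINE_RWLOCK(outer_lock);	/* hypothetical locks for illustration */
	static DEFINE_SPINLOCK(inner_lock);	/* also taken from an IRQ handler */

	static void reader_path(void)
	{
		read_lock_irq(&outer_lock);	/* IRQs must stay off here ... */
		spin_lock(&inner_lock);		/* ... or an IRQ taking inner_lock on
						 * this CPU would deadlock against us */
		/* ... */
		spin_unlock(&inner_lock);
		read_unlock_irq(&outer_lock);
	}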
>> [...]
>> I took the reviewers list from MAINTAINERS and ./scripts/get_maintainer.pl,
>> and don't have any ideas who else should be CCed.
>
> No worries. The patch seems sane enough to me. You can add:
>
> Acked-by: Jeff Layton <jlayton@kernel.org>

Thanks! Should I resend this with more people CCed, or are you going to take the patch via your tree?

Kirill
On Thu, Apr 05, 2018 at 02:58:06PM +0300, Kirill Tkhai wrote:
> I observed the following deadlock:
>
> [task 1]                           [task 2]                     [task 3]
> kill_fasync()                      mm_update_next_owner()       copy_process()
>  spin_lock_irqsave(&fa->fa_lock)    read_lock(&tasklist_lock)    write_lock_irq(&tasklist_lock)
>  send_sigio()                       <IRQ>                        ...
>   read_lock(&fown->lock)             kill_fasync()               ...
>   read_lock(&tasklist_lock)           spin_lock_irqsave(&fa->fa_lock) ...
>
> Task 1 can't acquire the read-locked tasklist_lock, since task 3 has
> already expressed its wish to take the lock exclusively. Task 2 holds
> the lock for reading, but it can't take the spinlock.

I think the important question is to Peter ... why didn't lockdep catch this?

> -		spin_lock_irq(&fa->fa_lock);
> +		write_lock_irq(&fa->fa_lock);
>  		fa->fa_file = NULL;
> -		spin_unlock_irq(&fa->fa_lock);
> +		write_unlock_irq(&fa->fa_lock);
...
> -		spin_lock_irq(&fa->fa_lock);
> +		write_lock_irq(&fa->fa_lock);
>  		fa->fa_fd = fd;
> -		spin_unlock_irq(&fa->fa_lock);
> +		write_unlock_irq(&fa->fa_lock);

Do we really need a lock here? If we convert each of these into WRITE_ONCE, then

...
> -		spin_lock_irqsave(&fa->fa_lock, flags);
> +		read_lock(&fa->fa_lock);
>  		if (fa->fa_file) {

	file = READ_ONCE(fa->fa_file)

then we're not opening any new races, are we?

> [...]
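Pieced together, the lockless variant being suggested here would look roughly like this (a sketch of the idea only, not an actual patch; Kirill explains below why it is insufficient):

	/* Hypothetical WRITE_ONCE/READ_ONCE variant (sketch of the suggestion):
	 * writers would use WRITE_ONCE(fa->fa_file, ...) and
	 * WRITE_ONCE(fa->fa_fd, fd) instead of taking fa_lock. */
	static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
	{
		while (fa) {
			struct file *file;

			if (fa->magic != FASYNC_MAGIC) {
				printk(KERN_ERR "kill_fasync: bad magic number in fasync_struct!\n");
				return;
			}
			file = READ_ONCE(fa->fa_file);
			if (file) {
				struct fown_struct *fown = &file->f_owner;

				if (!(sig == SIGURG && fown->signum == 0))
					send_sigio(fown, READ_ONCE(fa->fa_fd), band);
			}
			fa = rcu_dereference(fa->fa_next);
		}
	}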
On 17.04.2018 17:01, Matthew Wilcox wrote:
> On Thu, Apr 05, 2018 at 02:58:06PM +0300, Kirill Tkhai wrote:
>> [...]
>
> I think the important question is to Peter ... why didn't lockdep catch this?
>
> [...]
>
> Do we really need a lock here? If we convert each of these into WRITE_ONCE,

We want to pass a specific fd to send_sigio(), not a random one. Also, we want to dereference a specific file in kill_fasync_rcu() without the danger that it will be freed in parallel. So, since there is no rcu_read_lock() or other protection for the readers of this data, we *can't* drop the lock.

> then
>
> ...
>> -		spin_lock_irqsave(&fa->fa_lock, flags);
>> +		read_lock(&fa->fa_lock);
>>  		if (fa->fa_file) {
>
> 	file = READ_ONCE(fa->fa_file)
>
> then we're not opening any new races, are we?
>
> [...]

Kirill
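The race being described can be spelled out as an interleaving (an illustration; the fasync entry holds no reference on the file, so only the lock keeps fa->fa_file stable against teardown):

	/*
	 * Why READ_ONCE(fa->fa_file) alone would not be enough (illustration):
	 *
	 *   CPU 0: kill_fasync_rcu()           CPU 1: fasync_remove_entry()
	 *   -------------------------          ----------------------------
	 *   file = READ_ONCE(fa->fa_file);
	 *                                      fa->fa_file = NULL;
	 *                                      ... the file is later freed ...
	 *   send_sigio(&file->f_owner, ...);   <-- potential use-after-free
	 *
	 * With fa_lock taken for reading around the dereference, the writer
	 * cannot clear fa->fa_file in the middle of the read-side section.
	 */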
On Tue, 2018-04-17 at 17:15 +0300, Kirill Tkhai wrote:
> On 17.04.2018 17:01, Matthew Wilcox wrote:
> > [...]
> > Do we really need a lock here? If we convert each of these into WRITE_ONCE,
>
> We want to pass a specific fd to send_sigio(), not a random one. Also, we
> want to dereference a specific file in kill_fasync_rcu() without the danger
> that it will be freed in parallel. So, since there is no rcu_read_lock() or
> other protection for the readers of this data, we *can't* drop the lock.

I think it's probably possible to do this with RCU.

You'd need to put the whole fasync structure under RCU, and we'd have to stop resetting fa_fd in fasync_insert_entry, and just insert a new one and delete the old when there was one like that. That'd be no big deal though, since we always allocate one and pass it in anyway.

We could also turn the nasty singly-linked list into a standard hlist, which would be nice.

Regardless... for now we should probably take this patch and do any further work on the code from that point. I'll plan to pick up the patch for now unless Al or Bruce want it.

> [...]
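A rough sketch of the direction described above (an assumption about how it could look, with a hypothetical hlist-based entry; in this scheme fa_fd and fa_file would be immutable after insertion, and replacement entries would be published with RCU):

	/* Hypothetical RCU/hlist-based fasync entry (sketch, not a real patch). */
	struct fasync_struct_rcu {
		int			magic;
		int			fa_fd;
		struct hlist_node	fa_link;	/* replaces the hand-rolled list */
		struct file		*fa_file;
		struct rcu_head		fa_rcu;
	};

	static void kill_fasync_rcu_sketch(struct hlist_head *head, int sig, int band)
	{
		struct fasync_struct_rcu *fa;

		rcu_read_lock();
		hlist_for_each_entry_rcu(fa, head, fa_link) {
			struct fown_struct *fown = &fa->fa_file->f_owner;

			/* Entries are never updated in place, so no fa_lock is
			 * needed; removal would have to wait out a grace period
			 * before the file can go away. */
			if (!(sig == SIGURG && fown->signum == 0))
				send_sigio(fown, fa->fa_fd, band);
		}
		rcu_read_unlock();
	}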
On 18.04.2018 23:00, Jeff Layton wrote:
> On Tue, 2018-04-17 at 17:15 +0300, Kirill Tkhai wrote:
> > [...]
>
> I think it's probably possible to do this with RCU.
>
> You'd need to put the whole fasync structure under RCU, and we'd have
> to stop resetting fa_fd in fasync_insert_entry, and just insert a new
> one and delete the old when there was one like that. That'd be no big
> deal though, since we always allocate one and pass it in anyway.
>
> We could also turn the nasty singly-linked list into a standard hlist,
> which would be nice.

I have no objection, it's possible. But is the read lock there a real performance problem that we actually hit? The write lock is taken only for a single operation, so it does not seem to take much time. We lived with the exclusive spinlock for a long time, and even with it nobody complained. If there had been a problem, I assume someone would have submitted a patch to fix it.

Anyway, if we have to make this code completely lockless, we can. But I would not take the risk of doing that in a fix which could be backported to stable kernels. I agree with you that it is better done as development work on top of the fix.

> Regardless... for now we should probably take this patch and do any
> further work on the code from that point. I'll plan to pick up the
> patch for now unless Al or Bruce want it.

Ok, thanks for clarifying, Jeff.

> [...]

Kirill
On Tue, Apr 17, 2018 at 07:01:10AM -0700, Matthew Wilcox wrote:
> On Thu, Apr 05, 2018 at 02:58:06PM +0300, Kirill Tkhai wrote:
> > I observed the following deadlock:
> > [...]
>
> I think the important question is to Peter ... why didn't lockdep catch this?

I think the following will help lockdep catch this:

@@ -570,7 +588,14 @@ static inline void print_irqtrace_events(struct task_struct *curr)
 #define spin_release(l, n, i)			lock_release(l, n, i)
 
 #define rwlock_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
-#define rwlock_acquire_read(l, s, t, i)	lock_acquire_shared_recursive(l, s, t, NULL, i)
+#define rwlock_acquire_read(l, s, t, i)					\
+do {									\
+	if (!IS_ENABLED(CONFIG_QUEUED_RWLOCKS) || in_interrupt())	\
+		lock_acquire_shared_recursive(l, s, t, NULL, i);	\
+	else								\
+		lock_acquire_shared(l, s, t, NULL, i);			\
+} while (0)
+

However, this will break several self-tests in lib/locking-selftest.c, because we used to treat read_lock() as a recursive read lock for all call sites from lockdep's viewpoint.

Besides, the above change will bring an interesting challenge for the recursive read lock deadlock detection work that I'm doing:

	https://marc.info/?l=linux-kernel&m=152345444100825

I will explain that in the thread of that patchset and add you and the others CCed, in case you're interested.

Regards,
Boqun

> [...]
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 1e97f1fda90c..780161a11f9d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -865,9 +865,9 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 		if (fa->fa_file != filp)
 			continue;
 
-		spin_lock_irq(&fa->fa_lock);
+		write_lock_irq(&fa->fa_lock);
 		fa->fa_file = NULL;
-		spin_unlock_irq(&fa->fa_lock);
+		write_unlock_irq(&fa->fa_lock);
 
 		*fp = fa->fa_next;
 		call_rcu(&fa->fa_rcu, fasync_free_rcu);
@@ -912,13 +912,13 @@ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasy
 		if (fa->fa_file != filp)
 			continue;
 
-		spin_lock_irq(&fa->fa_lock);
+		write_lock_irq(&fa->fa_lock);
 		fa->fa_fd = fd;
-		spin_unlock_irq(&fa->fa_lock);
+		write_unlock_irq(&fa->fa_lock);
 		goto out;
 	}
 
-	spin_lock_init(&new->fa_lock);
+	rwlock_init(&new->fa_lock);
 	new->magic = FASYNC_MAGIC;
 	new->fa_file = filp;
 	new->fa_fd = fd;
@@ -981,14 +981,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 {
 	while (fa) {
 		struct fown_struct *fown;
-		unsigned long flags;
 
 		if (fa->magic != FASYNC_MAGIC) {
 			printk(KERN_ERR "kill_fasync: bad magic number in "
 			       "fasync_struct!\n");
 			return;
 		}
-		spin_lock_irqsave(&fa->fa_lock, flags);
+		read_lock(&fa->fa_lock);
 		if (fa->fa_file) {
 			fown = &fa->fa_file->f_owner;
 			/* Don't send SIGURG to processes which have not set a
@@ -997,7 +996,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 			if (!(sig == SIGURG && fown->signum == 0))
 				send_sigio(fown, fa->fa_fd, band);
 		}
-		spin_unlock_irqrestore(&fa->fa_lock, flags);
+		read_unlock(&fa->fa_lock);
 		fa = rcu_dereference(fa->fa_next);
 	}
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c6baf767619e..297e2dcd9dd2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1250,7 +1250,7 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
 }
 
 struct fasync_struct {
-	spinlock_t		fa_lock;
+	rwlock_t		fa_lock;
 	int			magic;
 	int			fa_fd;
 	struct fasync_struct	*fa_next; /* singly linked list */
I observed the following deadlock:

[task 1]                           [task 2]                     [task 3]
kill_fasync()                      mm_update_next_owner()       copy_process()
 spin_lock_irqsave(&fa->fa_lock)    read_lock(&tasklist_lock)    write_lock_irq(&tasklist_lock)
 send_sigio()                       <IRQ>                        ...
  read_lock(&fown->lock)             kill_fasync()               ...
  read_lock(&tasklist_lock)           spin_lock_irqsave(&fa->fa_lock) ...

Task 1 can't acquire the read-locked tasklist_lock, since task 3 has already expressed its wish to take the lock exclusively. Task 2 holds the lock for reading, but it can't take the spinlock.

Another deadlock is also possible (which I haven't observed):

[task 1]                           [task 2]
f_getown()                         kill_fasync()
 read_lock(&f_own->lock)            spin_lock_irqsave(&fa->fa_lock,)
 <IRQ>                              send_sigio()       write_lock_irq(&f_own->lock)
  kill_fasync()                      read_lock(&fown->lock)
   spin_lock_irqsave(&fa->fa_lock,)

Actually, we do not need fa->fa_lock to be exclusive in kill_fasync_rcu(), as it only guarantees the integrity of fa->fa_file->f_owner. It may seem that the exclusive lock used to give a task a small chance of receiving two sequential signals, when there are two parallel kill_fasync() callers and the task handles the first signal quickly, but the behaviour won't change, since do_send_sig_info() takes the exclusive sighand lock anyway.

The patch converts fa_lock into an rwlock_t, which fixes the two deadlocks above, as a read-locked rwlock may be taken again from an interrupt handler by qrwlock design.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>

I used the following program for testing:

#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>

#ifndef F_SETSIG
#define F_SETSIG 10
#endif

void handler(int sig)
{
}

int main(void)
{
	int flags;
	int fd;

	system("echo 8 > /proc/sys/kernel/random/read_wakeup_threshold");
	system("while :; do ls -R / > /dev/random 2>&1 ; echo 3 > /proc/sys/vm/drop_caches; done &");

	if (signal(SIGINT, handler) == SIG_ERR) {
		perror("Signal");
		exit(1);
	}

	fd = open("/dev/random", O_RDWR);
	if (fd < 0) {
		perror("Can't open");
		exit(1);
	}

	flags = FASYNC | fcntl(fd, F_GETFL);
	if (fcntl(fd, F_SETFL, flags) < 0) {
		perror("Setfl");
		exit(1);
	}
	if (fcntl(fd, F_SETOWN, getpid())) {
		perror("Setown");
		exit(1);
	}
	if (fcntl(fd, F_SETSIG, SIGINT)) {
		perror("Setsig");
		exit(1);
	}

	while (1)
		sleep(100);
	return 0;
}
---
 fs/fcntl.c         | 15 +++++++--------
 include/linux/fs.h |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)