
fasync: Fix deadlock between task-context and interrupt-context kill_fasync()

Message ID 152292939368.19745.13784475656016424647.stgit@localhost.localdomain (mailing list archive)
State New, archived

Commit Message

Kirill Tkhai April 5, 2018, 11:58 a.m. UTC
I observed the following deadlock between them:

[task 1]                          [task 2]                         [task 3]
kill_fasync()                     mm_update_next_owner()           copy_process()
 spin_lock_irqsave(&fa->fa_lock)   read_lock(&tasklist_lock)        write_lock_irq(&tasklist_lock)
  send_sigio()                    <IRQ>                             ...
   read_lock(&fown->lock)         kill_fasync()                     ...
    read_lock(&tasklist_lock)      spin_lock_irqsave(&fa->fa_lock)  ...

Task 1 can't take tasklist_lock for reading, since task 3 has already
expressed its wish to take the lock exclusively. Task 2 holds
tasklist_lock for reading, but it can't take the spin lock.

Also, another deadlock is possible (which I haven't observed):

[task 1]                            [task 2]
f_getown()                          kill_fasync()
 read_lock(&f_own->lock)             spin_lock_irqsave(&fa->fa_lock,)
 <IRQ>                               send_sigio()                     write_lock_irq(&f_own->lock)
  kill_fasync()                       read_lock(&fown->lock)
   spin_lock_irqsave(&fa->fa_lock,)

Actually, we do not need fa->fa_lock to be exclusive in kill_fasync_rcu(),
as it only guarantees the integrity of fa->fa_file->f_owner. It may seem
that the exclusive lock used to give a task a small chance of receiving
two sequential signals, if there are two parallel kill_fasync() callers
and the task handles the first signal quickly, but the behaviour does not
change, since do_send_sig_info() takes the exclusive sighand lock anyway.

The patch converts fa_lock into an rwlock_t, and this fixes the two
deadlocks above, since by qrwlock design a read lock may be taken from
an interrupt handler even while a writer is waiting.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>

I used the following program for testing:

#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>

#ifndef F_SETSIG
#define F_SETSIG 10
#endif

void handler(int sig)
{
}

int main(void)
{
	unsigned int flags;
	int fd;

	system("echo 8 > /proc/sys/kernel/random/read_wakeup_threshold");
	system("while :; do ls -R / > /dev/random 2>&1 ; echo 3 > /proc/sys/vm/drop_caches; done &");

	if (signal(SIGINT, handler) == SIG_ERR) {
		perror("Signal");
		exit(1);
	}

	fd = open("/dev/random", O_RDWR);
	if (fd < 0) {
		perror("Can't open");
		exit(1);
	}

	flags = FASYNC | fcntl(fd, F_GETFL);
	if (fcntl(fd, F_SETFL, flags) < 0) {
		perror("Setfl");
		exit(1);
	}
	if (fcntl(fd, F_SETOWN, getpid())) {
		perror("Setown");
		exit(1);
	}
	if (fcntl(fd, F_SETSIG, SIGINT)) {
		perror("Setsig");
		exit(1);
	}

	while (1)
		sleep(100);
}
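
Assuming the program is saved as fasync-test.c (the file name is
arbitrary), build and run it as root (the sysctl writes need root):

	gcc -o fasync-test fasync-test.c
	./fasync-test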
---
 fs/fcntl.c         |   15 +++++++--------
 include/linux/fs.h |    2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

Comments

Kirill Tkhai April 17, 2018, 9:04 a.m. UTC | #1
Hi,

Almost two weeks have passed, and there has been no reaction.

Jeff, Bruce, what is your point of view?

Just to underline, the problem is caused by rwlock fairness: a reader may
not take a lock that is already read-locked if a writer has already called
write_lock(). See queued_read_lock_slowpath() for the details. This
fairness prevents readers from occupying the lock forever and starving
writers.
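
For reference, the reader fast path looks roughly like this (abbreviated
from include/asm-generic/qrwlock.h; quoting from memory, so check the
sources for the exact code):

static inline void queued_read_lock(struct qrwlock *lock)
{
	u32 cnts;

	cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts);
	if (likely(!(cnts & _QW_WMASK)))
		return;

	/* A writer holds or waits for the lock: take the fair, queued
	 * slowpath (which makes an exception for readers in interrupt
	 * context, see queued_read_lock_slowpath()). */
	queued_read_lock_slowpath(lock);
}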

Kirill

On 05.04.2018 14:58, Kirill Tkhai wrote:
> I observed the following deadlock between them:
> 
> [...]
Jeff Layton April 17, 2018, 11:42 a.m. UTC | #2
On Thu, 2018-04-05 at 14:58 +0300, Kirill Tkhai wrote:
> I observed the following deadlock between them:
> 
> [...]
> 
> @@ -981,14 +981,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
>  {
>  	while (fa) {
>  		struct fown_struct *fown;
> -		unsigned long flags;
>  
>  		if (fa->magic != FASYNC_MAGIC) {
>  			printk(KERN_ERR "kill_fasync: bad magic number in "
>  			       "fasync_struct!\n");
>  			return;
>  		}
> -		spin_lock_irqsave(&fa->fa_lock, flags);
> +		read_lock(&fa->fa_lock);

Does this need to be read_lock_irq? Why is it ok to allow interrupts
here?

> [...]

I've no objection to the patch in principle, but I'm not as familiar
with the fasync code as others here.
Kirill Tkhai April 17, 2018, 11:53 a.m. UTC | #3
Hi, Jeff,

On 17.04.2018 14:42, Jeff Layton wrote:
> On Thu, 2018-04-05 at 14:58 +0300, Kirill Tkhai wrote:
>> I observed the following deadlock between them:
>>
>> [...]
>>
>> @@ -981,14 +981,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
>>  {
>>  	while (fa) {
>>  		struct fown_struct *fown;
>> -		unsigned long flags;
>>  
>>  		if (fa->magic != FASYNC_MAGIC) {
>>  			printk(KERN_ERR "kill_fasync: bad magic number in "
>>  			       "fasync_struct!\n");
>>  			return;
>>  		}
>> -		spin_lock_irqsave(&fa->fa_lock, flags);
>> +		read_lock(&fa->fa_lock);
> 
> Does this need to be read_lock_irq? Why is it ok to allow interrupts
> here?

An already read-locked rwlock can be taken for reading again from IRQ
context even if there is a writer pending, while a spin lock can't:

void queued_read_lock_slowpath(struct qrwlock *lock)
{
        /*
         * Readers come here when they cannot get the lock without waiting
         */
        if (unlikely(in_interrupt())) {
                /*
                 * Readers in interrupt context will get the lock immediately
                 * if the writer is just waiting (not holding the lock yet),
                 * so spin with ACQUIRE semantics until the lock is available
                 * without waiting in the queue.
                 */
                atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
                return;
        }

So, when we replace the spinlock with read_lock(), we don't need to
disable IRQs anymore. All we need is to make write_lock() always disable
IRQs.
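
Condensed, the resulting pattern looks like this (simplified from the
patch, with the surrounding list-walking and checks omitted):

	/* Task context (fasync_remove_entry(), fasync_insert_entry()):
	 * writers disable IRQs, so an interrupt-context reader never
	 * spins on a write lock held by its own CPU. */
	write_lock_irq(&fa->fa_lock);
	fa->fa_file = NULL;
	write_unlock_irq(&fa->fa_lock);

	/* Any context (kill_fasync_rcu()): readers keep IRQs enabled,
	 * which is safe because an in_interrupt() reader is granted
	 * the lock even while a writer is queued. */
	read_lock(&fa->fa_lock);
	if (fa->fa_file)
		send_sigio(&fa->fa_file->f_owner, fa->fa_fd, band);
	read_unlock(&fa->fa_lock);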

>> [...]
> 
> I've no objection to the patch in principle, but I'm not as familiar
> with the fasync code as others here.

I took the reviewers list from MAINTAINERS and ./scripts/get_maintainer.pl;
I have no idea what else should be CCed.

Thanks,
Kirill
Jeff Layton April 17, 2018, 1:31 p.m. UTC | #4
On Tue, 2018-04-17 at 14:53 +0300, Kirill Tkhai wrote:
> Hi, Jeff,
> 
> On 17.04.2018 14:42, Jeff Layton wrote:
> > On Thu, 2018-04-05 at 14:58 +0300, Kirill Tkhai wrote:
> > > I observed the following deadlock between them:
> > > 
> > > [...]
> > > 
> > > @@ -981,14 +981,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
> > >  {
> > >  	while (fa) {
> > >  		struct fown_struct *fown;
> > > -		unsigned long flags;
> > >  
> > >  		if (fa->magic != FASYNC_MAGIC) {
> > >  			printk(KERN_ERR "kill_fasync: bad magic number in "
> > >  			       "fasync_struct!\n");
> > >  			return;
> > >  		}
> > > -		spin_lock_irqsave(&fa->fa_lock, flags);
> > > +		read_lock(&fa->fa_lock);
> > 
> > Does this need to be read_lock_irq? Why is it ok to allow interrupts
> > here?
> 
> Read locked rwlock can be taken for reading from IRQ once again even
> if there is a writer pending, while spin lock can't:
> 
> void queued_read_lock_slowpath(struct qrwlock *lock)
> {
>         /*
>          * Readers come here when they cannot get the lock without waiting
>          */
>         if (unlikely(in_interrupt())) {
>                 /*
>                  * Readers in interrupt context will get the lock immediately
>                  * if the writer is just waiting (not holding the lock yet),
>                  * so spin with ACQUIRE semantics until the lock is available
>                  * without waiting in the queue.
>                  */
>                 atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
>                 return;
>         }
> 
> So, when we replace spinlock with read_lock(), we don't need disable IRQs anymore.
> All we need is to make write_lock always disable IRQs.

Got it, thanks.

read_lock_irq is still used in several (rather obscure) places. Does
this mean that we should do a global s/read_lock_irq/read_lock/ and
remove it? Or is it still useful to disable irqs for some read_lock
acquisitions?

> 
> > > [...]
> > 
> > I've no objection to the patch in principle, but I'm not as familiar
> > with the fasync code as others here.
> 
> I took the reviewers list from MAINTAINERS and ./scripts/get_maintainer.pl,
> don't have an ideas what else should be CCed.


No worries. The patch seems sane enough to me. You can add:

Acked-by: Jeff Layton <jlayton@kernel.org>
Kirill Tkhai April 17, 2018, 1:59 p.m. UTC | #5
On 17.04.2018 16:31, Jeff Layton wrote:
> On Tue, 2018-04-17 at 14:53 +0300, Kirill Tkhai wrote:
>> Hi, Jeff,
>>
>> On 17.04.2018 14:42, Jeff Layton wrote:
>>> On Thu, 2018-04-05 at 14:58 +0300, Kirill Tkhai wrote:
>>>> I observed the following deadlock between them:
>>>>
>>>> [...]
>>>>
>>>> @@ -981,14 +981,13 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
>>>>  {
>>>>  	while (fa) {
>>>>  		struct fown_struct *fown;
>>>> -		unsigned long flags;
>>>>  
>>>>  		if (fa->magic != FASYNC_MAGIC) {
>>>>  			printk(KERN_ERR "kill_fasync: bad magic number in "
>>>>  			       "fasync_struct!\n");
>>>>  			return;
>>>>  		}
>>>> -		spin_lock_irqsave(&fa->fa_lock, flags);
>>>> +		read_lock(&fa->fa_lock);
>>>
>>> Does this need to be read_lock_irq? Why is it ok to allow interrupts
>>> here?
>>
>> Read locked rwlock can be taken for reading from IRQ once again even
>> if there is a writer pending, while spin lock can't:
>>
>> void queued_read_lock_slowpath(struct qrwlock *lock)
>> {
>>         /*
>>          * Readers come here when they cannot get the lock without waiting
>>          */
>>         if (unlikely(in_interrupt())) {
>>                 /*
>>                  * Readers in interrupt context will get the lock immediately
>>                  * if the writer is just waiting (not holding the lock yet),
>>                  * so spin with ACQUIRE semantics until the lock is available
>>                  * without waiting in the queue.
>>                  */
>>                 atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
>>                 return;
>>         }
>>
>> So, when we replace spinlock with read_lock(), we don't need disable IRQs anymore.
>> All we need is to make write_lock always disable IRQs.
> 
> Got it, thanks.
> 
> read_lock_irq is still used in several (rather obscure) places. Does
> this mean that we should do a global s/read_lock_irq/read_lock/ and
> remove it? Or is it still useful to disable irqs for some read_lock
> acquisitions?

I haven't analyzed them, but it seems possible to construct a situation
where a read lock nests with an exclusive lock and therefore still
requires IRQs to be disabled. Take the fasync example. The deadlock was
also present in:

	https://lkml.org/lkml/2018/4/5/125

and we could have fixed it in another way, by disabling IRQs around
read_lock(). But in the fasync case we are lucky: the exclusive lock is
not needed, so we replaced the spin lock with an rwlock.

If the remaining places nest read_lock() with a spin lock, they still
need to disable IRQs.
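
For instance (a contrived sketch), if some path took an rwlock for
writing from an interrupt handler, then readers would still have to
disable IRQs:

	[task context]                  [IRQ on the same CPU]
	read_lock(&rw);
	<IRQ>
	                                write_lock(&rw); /* spins forever:
	                                                    the reader it
	                                                    interrupted can
	                                                    never unlock */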
 
>>
>>>> [...]
>>>
>>> I've no objection to the patch in principle, but I'm not as familiar
>>> with the fasync code as others here.
>>
>> I took the reviewers list from MAINTAINERS and ./scripts/get_maintainer.pl,
>> don't have an ideas what else should be CCed.

Oh, my English. I.e., "who else". 

> 
> 
> No worries. The patch seems sane enough to me. You can add:
> 
> Acked-by: Jeff Layton <jlayton@kernel.org>

Thanks! Should I resend this with more people CCed, or are you going to
take the patch via your tree?

Kirill
Matthew Wilcox April 17, 2018, 2:01 p.m. UTC | #6
On Thu, Apr 05, 2018 at 02:58:06PM +0300, Kirill Tkhai wrote:
> I observed the following deadlock between them:
> 
> [task 1]                          [task 2]                         [task 3]
> kill_fasync()                     mm_update_next_owner()           copy_process()
>  spin_lock_irqsave(&fa->fa_lock)   read_lock(&tasklist_lock)        write_lock_irq(&tasklist_lock)
>   send_sigio()                    <IRQ>                             ...
>    read_lock(&fown->lock)         kill_fasync()                     ...
>     read_lock(&tasklist_lock)      spin_lock_irqsave(&fa->fa_lock)  ...
> 
> Task 1 can't acquire read locked tasklist_lock, since there is
> already task 3 expressed its wish to take the lock exclusive.
> Task 2 holds the read locked lock, but it can't take the spin lock.

I think the important question is to Peter ... why didn't lockdep catch this?

> -		spin_lock_irq(&fa->fa_lock);
> +		write_lock_irq(&fa->fa_lock);
>  		fa->fa_file = NULL;
> -		spin_unlock_irq(&fa->fa_lock);
> +		write_unlock_irq(&fa->fa_lock);
...
> -		spin_lock_irq(&fa->fa_lock);
> +		write_lock_irq(&fa->fa_lock);
>  		fa->fa_fd = fd;
> -		spin_unlock_irq(&fa->fa_lock);
> +		write_unlock_irq(&fa->fa_lock);

Do we really need a lock here?  If we convert each of these into WRITE_ONCE,
then 

...
> -		spin_lock_irqsave(&fa->fa_lock, flags);
> +		read_lock(&fa->fa_lock);
>  		if (fa->fa_file) {

file = READ_ONCE(fa->fa_file)

then we're not opening any new races, are we?

> [...]
Kirill Tkhai April 17, 2018, 2:15 p.m. UTC | #7
On 17.04.2018 17:01, Matthew Wilcox wrote:
> On Thu, Apr 05, 2018 at 02:58:06PM +0300, Kirill Tkhai wrote:
>> I observed the following deadlock between them:
>>
>> [task 1]                          [task 2]                         [task 3]
>> kill_fasync()                     mm_update_next_owner()           copy_process()
>>  spin_lock_irqsave(&fa->fa_lock)   read_lock(&tasklist_lock)        write_lock_irq(&tasklist_lock)
>>   send_sigio()                    <IRQ>                             ...
>>    read_lock(&fown->lock)         kill_fasync()                     ...
>>     read_lock(&tasklist_lock)      spin_lock_irqsave(&fa->fa_lock)  ...
>>
>> Task 1 can't acquire read locked tasklist_lock, since there is
>> already task 3 expressed its wish to take the lock exclusive.
>> Task 2 holds the read locked lock, but it can't take the spin lock.
> 
> I think the important question is to Peter ... why didn't lockdep catch this?
> 
>> -		spin_lock_irq(&fa->fa_lock);
>> +		write_lock_irq(&fa->fa_lock);
>>  		fa->fa_file = NULL;
>> -		spin_unlock_irq(&fa->fa_lock);
>> +		write_unlock_irq(&fa->fa_lock);
> ...
>> -		spin_lock_irq(&fa->fa_lock);
>> +		write_lock_irq(&fa->fa_lock);
>>  		fa->fa_fd = fd;
>> -		spin_unlock_irq(&fa->fa_lock);
>> +		write_unlock_irq(&fa->fa_lock);
> 
> Do we really need a lock here?  If we convert each of these into WRITE_ONCE,

We want to pass a specific fd to send_sigio(), not a random one. Also, we
want to dereference a specific file in kill_fasync_rcu() without the
danger that it is freed in parallel. So, since the readers of this data
have no rcu_read_lock() or other protection for the file itself, we
*can't* drop the lock.
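
To illustrate, with plain WRITE_ONCE()/READ_ONCE() and no lock (sketch):

	[CPU 1: kill_fasync_rcu()]         [CPU 2: close() path]
	file = READ_ONCE(fa->fa_file);
	                                   fasync_remove_entry():
	                                     WRITE_ONCE(fa->fa_file, NULL);
	                                   ...last fput() frees *file...
	fown = &file->f_owner;             /* use-after-free */
	send_sigio(fown, fa->fa_fd, band);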

> [...]
Kirill
Jeff Layton April 18, 2018, 8 p.m. UTC | #8
On Tue, 2018-04-17 at 17:15 +0300, Kirill Tkhai wrote:
> On 17.04.2018 17:01, Matthew Wilcox wrote:
> > On Thu, Apr 05, 2018 at 02:58:06PM +0300, Kirill Tkhai wrote:
> > > I observed the following deadlock between them:
> > > [...]
> > 
> > Do we really need a lock here?  If we convert each of these into WRITE_ONCE,
> 
> We want to pass specific fd to send_sigio(), not a random one. Also, we do want
> to dereference specific file in kill_fasync_rcu() without a danger it will be freed
> in parallel. So, since there is no rcu_read_lock() or another protection in readers
> of this data, we *can't* drop the lock.
> 

I think it's probably possible to do this with RCU.

You'd need to put the whole fasync structure under RCU, and we'd have
to stop resetting fa_fd in fasync_insert_entry, and just insert a new
one and delete the old when there was one like that. That'd be no big
deal though since we always allocate one and pass it in anyway.

We could also turn the nasty singly-linked list into a standard hlist
too, which would be nice.
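
Something like this, maybe (a very rough, untested sketch; names are made
up, entries become immutable after insertion, and the file lifetime
question above still needs care):

	struct fasync_struct {
		int			magic;
		int			fa_fd;
		struct hlist_node	fa_node;	/* replaces fa_next */
		struct file		*fa_file;
		struct rcu_head		fa_rcu;
	};

	static void kill_fasync_rcu(struct hlist_head *head, int sig, int band)
	{
		struct fasync_struct *fa;

		rcu_read_lock();
		hlist_for_each_entry_rcu(fa, head, fa_node) {
			struct fown_struct *fown = &fa->fa_file->f_owner;

			if (!(sig == SIGURG && fown->signum == 0))
				send_sigio(fown, fa->fa_fd, band);
		}
		rcu_read_unlock();
	}

fasync_insert_entry() would then always add a new entry (and RCU-free any
old one for the same file) instead of updating fa_fd in place, so readers
always see a consistent (fa_file, fa_fd) pair.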

Regardless...for now we should probably take this patch and do any
further work on the code from that point. I'll plan to pick up the
patch for now unless Al or Bruce want it.

> [...]
Kirill Tkhai April 18, 2018, 10:40 p.m. UTC | #9
On 18.04.2018 23:00, Jeff Layton wrote:
> On Tue, 2018-04-17 at 17:15 +0300, Kirill Tkhai wrote:
>> On 17.04.2018 17:01, Matthew Wilcox wrote:
>>> On Thu, Apr 05, 2018 at 02:58:06PM +0300, Kirill Tkhai wrote:
>>>> I observed the following deadlock between them:
>>>> [...]
>>> Do we really need a lock here?  If we convert each of these into WRITE_ONCE,
>>
>> We want to pass specific fd to send_sigio(), not a random one. Also, we do want
>> to dereference specific file in kill_fasync_rcu() without a danger it will be freed
>> in parallel. So, since there is no rcu_read_lock() or another protection in readers
>> of this data, we *can't* drop the lock.
>>
> 
> I think it's probably possible to do this with RCU.
> 
> You'd need to put the whole fasync structure under RCU, and we'd have
> to stop resetting fa_fd in fasync_insert_entry, and just insert a new
> one and delete the old when there was one like that. That'd be no big
> deal though since we always allocate one and pass it in anyway.
> 
> We could also turn the nasty singly-linked list into a standard hlist
> too, which would be nice.

I have no objection that it's possible, but is the read lock there a real
performance problem for us? The write lock is taken for a single
operation, so it does not seem to be held for long. We lived with an
exclusive spinlock for a long time, and even then nobody complained. If
there had been a problem, I assume someone would have submitted a patch
to fix it...

Anyway, if we have to make this code completely lockless, we can. But I
would not take the risk of doing that in a fix which may be backported to
stable kernels. I agree with you that it is better done as development
work on top of the fix.
 
> Regardless...for now we should probably take this patch and do any
> further work on the code from that point. I'll plan to pick up the
> patch for now unless Al or Bruce want it.

Ok, thanks for clarifying, Jeff.
 
>>> [...]
Kirill
Boqun Feng April 27, 2018, 1:44 p.m. UTC | #10
On Tue, Apr 17, 2018 at 07:01:10AM -0700, Matthew Wilcox wrote:
> On Thu, Apr 05, 2018 at 02:58:06PM +0300, Kirill Tkhai wrote:
> > I observed the following deadlock between them:
> > 
> > [task 1]                          [task 2]                         [task 3]
> > kill_fasync()                     mm_update_next_owner()           copy_process()
> >  spin_lock_irqsave(&fa->fa_lock)   read_lock(&tasklist_lock)        write_lock_irq(&tasklist_lock)
> >   send_sigio()                    <IRQ>                             ...
> >    read_lock(&fown->lock)         kill_fasync()                     ...
> >     read_lock(&tasklist_lock)      spin_lock_irqsave(&fa->fa_lock)  ...
> > 
> > Task 1 can't acquire read locked tasklist_lock, since there is
> > already task 3 expressed its wish to take the lock exclusive.
> > Task 2 holds the read locked lock, but it can't take the spin lock.
> 
> I think the important question is to Peter ... why didn't lockdep catch this?
> 

I think the following change to rwlock_acquire_read() (in
include/linux/lockdep.h) will help lockdep to catch this:


@@ -570,7 +588,14 @@ static inline void print_irqtrace_events(struct task_struct *curr)
 #define spin_release(l, n, i)			lock_release(l, n, i)
 
 #define rwlock_acquire(l, s, t, i)		lock_acquire_exclusive(l, s, t, NULL, i)
-#define rwlock_acquire_read(l, s, t, i)		lock_acquire_shared_recursive(l, s, t, NULL, i)
+#define rwlock_acquire_read(l, s, t, i)				\
+do {									\
+	if (!IS_ENABLED(CONFIG_QUEUED_RWLOCKS) || in_interrupt())	\
+		lock_acquire_shared_recursive(l, s, t, NULL, i);	\
+	else								\
+		lock_acquire_shared(l, s, t, NULL, i);			\
+} while (0)
+

However, this will break several self-tests in lib/locking-selftest.c,
because lockdep used to treat read_lock() as a recursive read lock at all
call sites.

Besides, the above change will bring an interesting challenge for the
recursive read lock deadlock detection work that I'm doing:

	https://marc.info/?l=linux-kernel&m=152345444100825

I will explain that in the thread of that patchset and CC you and others
in case you're interested.

Regards,
Boqun

> > [...]

Patch

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 1e97f1fda90c..780161a11f9d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -865,9 +865,9 @@  int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 		if (fa->fa_file != filp)
 			continue;
 
-		spin_lock_irq(&fa->fa_lock);
+		write_lock_irq(&fa->fa_lock);
 		fa->fa_file = NULL;
-		spin_unlock_irq(&fa->fa_lock);
+		write_unlock_irq(&fa->fa_lock);
 
 		*fp = fa->fa_next;
 		call_rcu(&fa->fa_rcu, fasync_free_rcu);
@@ -912,13 +912,13 @@  struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasy
 		if (fa->fa_file != filp)
 			continue;
 
-		spin_lock_irq(&fa->fa_lock);
+		write_lock_irq(&fa->fa_lock);
 		fa->fa_fd = fd;
-		spin_unlock_irq(&fa->fa_lock);
+		write_unlock_irq(&fa->fa_lock);
 		goto out;
 	}
 
-	spin_lock_init(&new->fa_lock);
+	rwlock_init(&new->fa_lock);
 	new->magic = FASYNC_MAGIC;
 	new->fa_file = filp;
 	new->fa_fd = fd;
@@ -981,14 +981,13 @@  static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 {
 	while (fa) {
 		struct fown_struct *fown;
-		unsigned long flags;
 
 		if (fa->magic != FASYNC_MAGIC) {
 			printk(KERN_ERR "kill_fasync: bad magic number in "
 			       "fasync_struct!\n");
 			return;
 		}
-		spin_lock_irqsave(&fa->fa_lock, flags);
+		read_lock(&fa->fa_lock);
 		if (fa->fa_file) {
 			fown = &fa->fa_file->f_owner;
 			/* Don't send SIGURG to processes which have not set a
@@ -997,7 +996,7 @@  static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 			if (!(sig == SIGURG && fown->signum == 0))
 				send_sigio(fown, fa->fa_fd, band);
 		}
-		spin_unlock_irqrestore(&fa->fa_lock, flags);
+		read_unlock(&fa->fa_lock);
 		fa = rcu_dereference(fa->fa_next);
 	}
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c6baf767619e..297e2dcd9dd2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1250,7 +1250,7 @@  static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
 }
 
 struct fasync_struct {
-	spinlock_t		fa_lock;
+	rwlock_t		fa_lock;
 	int			magic;
 	int			fa_fd;
 	struct fasync_struct	*fa_next; /* singly linked list */