
[v2] blktrace: Fix potential deadlock between delete & sysfs ops

Message ID 1502916040-18067-1-git-send-email-longman@redhat.com (mailing list archive)
State New, archived

Commit Message

Waiman Long Aug. 16, 2017, 8:40 p.m. UTC
The lockdep code had reported the following unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(s_active#228);
                               lock(&bdev->bd_mutex/1);
                               lock(s_active#228);
  lock(&bdev->bd_mutex);

 *** DEADLOCK ***

The deadlock may happen when one task (CPU1) is trying to delete
a partition in a block device and another task (CPU0) is accessing
a tracing sysfs file of that partition.

To avoid that, accesses to the tracing sysfs files now use a mutex
trylock loop, and the operation fails if a delete operation is
in progress.

Signed-off-by: Waiman Long <longman@redhat.com>
---

 v2:
   - Use READ_ONCE() and smp_store_mb() to read and write bd_deleting.
   - Check for signal in the mutex_trylock loops.
   - Use usleep() instead of schedule() for RT tasks.

 block/ioctl.c           |  3 +++
 include/linux/fs.h      |  1 +
 kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 41 insertions(+), 2 deletions(-)

Comments

Steven Rostedt Aug. 17, 2017, 1:34 p.m. UTC | #1
On Wed, 16 Aug 2017 16:40:40 -0400
Waiman Long <longman@redhat.com> wrote:

> The lockdep code had reported the following unsafe locking scenario:
> 
>        CPU0                    CPU1
>        ----                    ----
>   lock(s_active#228);
>                                lock(&bdev->bd_mutex/1);
>                                lock(s_active#228);
>   lock(&bdev->bd_mutex);

Can you show the exact locations of these locks? I have no idea where
this "s_active" is.

> 
>  *** DEADLOCK ***
> 
> The deadlock may happen when one task (CPU1) is trying to delete
> a partition in a block device and another task (CPU0) is accessing
> tracing sysfs file in that partition.
> 
> To avoid that, accessing tracing sysfs file will now use a mutex
> trylock loop and the operation will fail if a delete operation is
> in progress.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
> 
>  v2:
>    - Use READ_ONCE() and smp_store_mb() to read and write bd_deleting.
>    - Check for signal in the mutex_trylock loops.
>    - Use usleep() instead of schedule() for RT tasks.

I'm sorry but I really do hate this patch.

> 
>  block/ioctl.c           |  3 +++
>  include/linux/fs.h      |  1 +
>  kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++++++++++++++--
>  3 files changed, 41 insertions(+), 2 deletions(-)
> 
> diff --git a/block/ioctl.c b/block/ioctl.c
> index 0de02ee..b920329 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -86,12 +86,15 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
>  				return -EBUSY;
>  			}
>  			/* all seems OK */
> +			smp_store_mb(bdev->bd_deleting, 1);

No comment to explain what is happening here, and why.

>  			fsync_bdev(bdevp);
>  			invalidate_bdev(bdevp);
>  
>  			mutex_lock_nested(&bdev->bd_mutex, 1);
>  			delete_partition(disk, partno);
>  			mutex_unlock(&bdev->bd_mutex);
> +			smp_store_mb(bdev->bd_deleting, 0);
> +

ditto.

>  			mutex_unlock(&bdevp->bd_mutex);
>  			bdput(bdevp);
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 6e1fd5d..c2ba35e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -427,6 +427,7 @@ struct block_device {
>  #endif
>  	struct block_device *	bd_contains;
>  	unsigned		bd_block_size;
> +	int			bd_deleting;
>  	struct hd_struct *	bd_part;
>  	/* number of times partitions within this device have been opened. */
>  	unsigned		bd_part_count;
> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
> index bc364f8..b2dffa9 100644
> --- a/kernel/trace/blktrace.c
> +++ b/kernel/trace/blktrace.c
> @@ -27,6 +27,8 @@
>  #include <linux/time.h>
>  #include <linux/uaccess.h>
>  #include <linux/list.h>
> +#include <linux/delay.h>
> +#include <linux/sched/rt.h>
>  
>  #include "../../block/blk.h"
>  
> @@ -1605,6 +1607,23 @@ static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
>  	return bdev_get_queue(bdev);
>  }
>  
> +/*
> + * Read/write to the tracing sysfs file requires taking references to the

What's the "tracing sysfs" file? tracefs?

> + * sysfs file and then acquiring the bd_mutex. Deleting a block device
> + * requires acquiring the bd_mutex and then waiting for all the sysfs
> + * references to be gone. This can lead to deadlock if both operations
> + * happen simultaneously. To avoid this problem, read/write to the
> + * the tracing sysfs files can now fail if the bd_mutex cannot be
> + * acquired while a deletion operation is in progress.
> + *
> + * A mutex trylock loop is used assuming that tracing sysfs operations

A mutex trylock loop is not enough to stop a deadlock. But I'm guessing
the undocumented bd_deleting may prevent that.

> + * aren't frequently enough to cause any contention problem.
> + *
> + * For RT tasks, a running high priority task will prevent any lower
> + * priority RT tasks from being run. So they do need to actually sleep
> + * when the trylock fails to allow lower priority tasks to make forward
> + * progress.
> + */
>  static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
>  					 struct device_attribute *attr,
>  					 char *buf)
> @@ -1622,7 +1641,15 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
>  	if (q == NULL)
>  		goto out_bdput;
>  
> -	mutex_lock(&bdev->bd_mutex);
> +	while (!mutex_trylock(&bdev->bd_mutex)) {
> +		if (READ_ONCE(bdev->bd_deleting))
> +			goto out_bdput;
> +		if (signal_pending(current)) {
> +			ret = -EINTR;
> +			goto out_bdput;
> +		}
> +		rt_task(current) ? usleep_range(10, 10) : schedule();

We need to come up with a better solution. This is just a hack that
circumvents a lot of the lockdep infrastructure.

-- Steve

> +	}
>  
>  	if (attr == &dev_attr_enable) {
>  		ret = sprintf(buf, "%u\n", !!q->blk_trace);
> @@ -1683,7 +1710,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
>  	if (q == NULL)
>  		goto out_bdput;
>  
> -	mutex_lock(&bdev->bd_mutex);
> +	while (!mutex_trylock(&bdev->bd_mutex)) {
> +		if (READ_ONCE(bdev->bd_deleting))
> +			goto out_bdput;
> +		if (signal_pending(current)) {
> +			ret = -EINTR;
> +			goto out_bdput;
> +		}
> +		rt_task(current) ? usleep_range(10, 10) : schedule();
> +	}
>  
>  	if (attr == &dev_attr_enable) {
>  		if (value)
Waiman Long Aug. 17, 2017, 4:24 p.m. UTC | #2
On 08/17/2017 09:34 AM, Steven Rostedt wrote:
> On Wed, 16 Aug 2017 16:40:40 -0400
> Waiman Long <longman@redhat.com> wrote:
>
>> The lockdep code had reported the following unsafe locking scenario:
>>
>>        CPU0                    CPU1
>>        ----                    ----
>>   lock(s_active#228);
>>                                lock(&bdev->bd_mutex/1);
>>                                lock(s_active#228);
>>   lock(&bdev->bd_mutex);
> Can you show the exact locations of these locks. I have no idea where
> this "s_active" is.
The s_active isn't an actual lock. It is a reference count (kn->count)
on the sysfs (kernfs) file. Removal of a sysfs file, however, requires
waiting until all the references are gone. The reference count is
treated like an rwsem by the lockdep instrumentation code.
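
For context, a minimal sketch of how a plain reference count can be
presented to lockdep as if it were an rwsem, modeled loosely on the
kernfs pattern (the struct and helper names below are invented for
illustration, not the actual kernel sources):

#include <linux/atomic.h>
#include <linux/lockdep.h>

/* Sketch only: the refcount is an ordinary atomic, but lockdep is told
 * to treat get/put as read-lock/unlock of a pseudo-rwsem, so "wait for
 * all references to drain" behaves like a write lock in the dependency
 * graph. */
struct kernfs_node_sketch {
	atomic_t		count;		/* active references */
	struct lockdep_map	dep_map;	/* shows up as "s_active" */
};

static void sketch_init(struct kernfs_node_sketch *kn,
			struct lock_class_key *key)
{
	atomic_set(&kn->count, 0);
	lockdep_init_map(&kn->dep_map, "s_active", key, 0);
}

static bool sketch_get_active(struct kernfs_node_sketch *kn)
{
	if (!atomic_inc_unless_negative(&kn->count))
		return false;			/* node is being removed */
	rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
	return true;
}

static void sketch_put_active(struct kernfs_node_sketch *kn)
{
	rwsem_release(&kn->dep_map, 1, _RET_IP_);
	atomic_dec(&kn->count);			/* remover waits for zero */
}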

>>  *** DEADLOCK ***
>>
>> The deadlock may happen when one task (CPU1) is trying to delete
>> a partition in a block device and another task (CPU0) is accessing
>> tracing sysfs file in that partition.
>>
>> To avoid that, accessing tracing sysfs file will now use a mutex
>> trylock loop and the operation will fail if a delete operation is
>> in progress.
>>
>> Signed-off-by: Waiman Long <longman@redhat.com>
>> ---
>>
>>  v2:
>>    - Use READ_ONCE() and smp_store_mb() to read and write bd_deleting.
>>    - Check for signal in the mutex_trylock loops.
>>    - Use usleep() instead of schedule() for RT tasks.
> I'm sorry but I really do hate this patch.

Any suggestion on how to make it better?

>>  block/ioctl.c           |  3 +++
>>  include/linux/fs.h      |  1 +
>>  kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++++++++++++++--
>>  3 files changed, 41 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/ioctl.c b/block/ioctl.c
>> index 0de02ee..b920329 100644
>> --- a/block/ioctl.c
>> +++ b/block/ioctl.c
>> @@ -86,12 +86,15 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
>>  				return -EBUSY;
>>  			}
>>  			/* all seems OK */
>> +			smp_store_mb(bdev->bd_deleting, 1);
> No comment to explain what is happening here, and why.

I am going to add a comment block here.

>>  			fsync_bdev(bdevp);
>>  			invalidate_bdev(bdevp);
>>  
>>  			mutex_lock_nested(&bdev->bd_mutex, 1);
>>  			delete_partition(disk, partno);
>>  			mutex_unlock(&bdev->bd_mutex);
>> +			smp_store_mb(bdev->bd_deleting, 0);
>> +
> ditto.
>
>>  			mutex_unlock(&bdevp->bd_mutex);
>>  			bdput(bdevp);
>>  
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 6e1fd5d..c2ba35e 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -427,6 +427,7 @@ struct block_device {
>>  #endif
>>  	struct block_device *	bd_contains;
>>  	unsigned		bd_block_size;
>> +	int			bd_deleting;
>>  	struct hd_struct *	bd_part;
>>  	/* number of times partitions within this device have been opened. */
>>  	unsigned		bd_part_count;
>> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
>> index bc364f8..b2dffa9 100644
>> --- a/kernel/trace/blktrace.c
>> +++ b/kernel/trace/blktrace.c
>> @@ -27,6 +27,8 @@
>>  #include <linux/time.h>
>>  #include <linux/uaccess.h>
>>  #include <linux/list.h>
>> +#include <linux/delay.h>
>> +#include <linux/sched/rt.h>
>>  
>>  #include "../../block/blk.h"
>>  
>> @@ -1605,6 +1607,23 @@ static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
>>  	return bdev_get_queue(bdev);
>>  }
>>  
>> +/*
>> + * Read/write to the tracing sysfs file requires taking references to the
> What's the "tracing sysfs" file? tracefs?

I am referring to the blk_trace sysfs files, which are used to enable
tracing of block operations.

>> + * sysfs file and then acquiring the bd_mutex. Deleting a block device
>> + * requires acquiring the bd_mutex and then waiting for all the sysfs
>> + * references to be gone. This can lead to deadlock if both operations
>> + * happen simultaneously. To avoid this problem, read/write to the
>> + * the tracing sysfs files can now fail if the bd_mutex cannot be
>> + * acquired while a deletion operation is in progress.
>> + *
>> + * A mutex trylock loop is used assuming that tracing sysfs operations
> A mutex trylock loop is not enough to stop a deadlock. But I'm guessing
> the undocumented bd_deleting may prevent that.

Yes, that is what the bd_deleting flag is for.

I was thinking about failing the sysfs operation after a certain number
of trylock attempts, but that would require changes to user-space code
to handle the occasional failures. In the end I decided to fail it only
when a delete operation is in progress, as all hope is lost in that case.

>> + * aren't frequently enough to cause any contention problem.
>> + *
>> + * For RT tasks, a running high priority task will prevent any lower
>> + * priority RT tasks from being run. So they do need to actually sleep
>> + * when the trylock fails to allow lower priority tasks to make forward
>> + * progress.
>> + */
>>  static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
>>  					 struct device_attribute *attr,
>>  					 char *buf)
>> @@ -1622,7 +1641,15 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
>>  	if (q == NULL)
>>  		goto out_bdput;
>>  
>> -	mutex_lock(&bdev->bd_mutex);
>> +	while (!mutex_trylock(&bdev->bd_mutex)) {
>> +		if (READ_ONCE(bdev->bd_deleting))
>> +			goto out_bdput;
>> +		if (signal_pending(current)) {
>> +			ret = -EINTR;
>> +			goto out_bdput;
>> +		}
>> +		rt_task(current) ? usleep_range(10, 10) : schedule();
> We need to come up with a better solution. This is just a hack that
> circumvents a lot of the lockdep infrastructure.
>
> -- Steve

The root cause is the lock inversion under this circumstance. I think
modifying the blk_trace code has the least impact overall. I agree that
the code is ugly. If you have a better suggestion, I would certainly
like to hear it.

Cheers,
Longman
Steven Rostedt Aug. 17, 2017, 8:30 p.m. UTC | #3
On Thu, 17 Aug 2017 12:24:39 -0400
Waiman Long <longman@redhat.com> wrote:

> On 08/17/2017 09:34 AM, Steven Rostedt wrote:
> > On Wed, 16 Aug 2017 16:40:40 -0400
> > Waiman Long <longman@redhat.com> wrote:
> >  
> >> The lockdep code had reported the following unsafe locking scenario:
> >>
> >>        CPU0                    CPU1
> >>        ----                    ----
> >>   lock(s_active#228);
> >>                                lock(&bdev->bd_mutex/1);
> >>                                lock(s_active#228);
> >>   lock(&bdev->bd_mutex);  
> > Can you show the exact locations of these locks. I have no idea where
> > this "s_active" is.  
> The s_active isn't an actual lock. It is a reference count (kn->count)
> on the sysfs (kernfs) file. Removal of a sysfs file, however, require
> a wait until all the references are gone. The reference count is
> treated like a rwsem using lockdep instrumentation code.

Which kernel is this? I don't see any lockdep annotation around
kn->count (doing a git grep, I find it referenced in fs/kernfs/dir.c)

> 
> >>  *** DEADLOCK ***
> >>
> >> The deadlock may happen when one task (CPU1) is trying to delete
> >> a partition in a block device and another task (CPU0) is accessing
> >> tracing sysfs file in that partition.
> >>
> >> To avoid that, accessing tracing sysfs file will now use a mutex
> >> trylock loop and the operation will fail if a delete operation is
> >> in progress.
> >>
> >> Signed-off-by: Waiman Long <longman@redhat.com>
> >> ---
> >>
> >>  v2:
> >>    - Use READ_ONCE() and smp_store_mb() to read and write bd_deleting.
> >>    - Check for signal in the mutex_trylock loops.
> >>    - Use usleep() instead of schedule() for RT tasks.  
> > I'm sorry but I really do hate this patch.  
> 
> Any suggestion on how to make it better?

I'd like to be able to at least trigger the warning and see the lock
issues. I won't be able to recommend anything until I understand what is
happening.


> The root cause is the lock inversion under this circumstance. I think
> modifying the blk_trace code has the least impact overall. I agree that
> the code is ugly. If you have a better suggestion, I will certainly like
> to hear it.

Again, I need to see where the issue lies before recommending something
else. I would hope there is a more elegant solution to this.

-- Steve
Bart Van Assche Aug. 17, 2017, 8:41 p.m. UTC | #4
On Thu, 2017-08-17 at 16:30 -0400, Steven Rostedt wrote:
> On Thu, 17 Aug 2017 12:24:39 -0400
> Waiman Long <longman@redhat.com> wrote:
> 
> > On 08/17/2017 09:34 AM, Steven Rostedt wrote:
> > > On Wed, 16 Aug 2017 16:40:40 -0400
> > > Waiman Long <longman@redhat.com> wrote:
> > > 
> > > > The lockdep code had reported the following unsafe locking scenario:
> > > > 
> > > >        CPU0                    CPU1
> > > >        ----                    ----
> > > >   lock(s_active#228);
> > > >                                lock(&bdev->bd_mutex/1);
> > > >                                lock(s_active#228);
> > > >   lock(&bdev->bd_mutex);
> > > 
> > > Can you show the exact locations of these locks. I have no idea where
> > > this "s_active" is.
> > 
> > The s_active isn't an actual lock. It is a reference count (kn->count)
> > on the sysfs (kernfs) file. Removal of a sysfs file, however, require
> > a wait until all the references are gone. The reference count is
> > treated like a rwsem using lockdep instrumentation code.
> 
> Which kernel is this? I don't see any lockdep annotation around
> kn->count (doing a git grep, I find it referenced in fs/kernfs/dir.c)

As far as I know the s_active lockdep annotations were introduced in 2007
through commit 0ab66088c855 ("sysfs: implement sysfs_dirent active reference
and immediate disconnect"). Today these annotations exist in kernfs:
$ git grep -nHw dep_map fs/kernfs
fs/kernfs/dir.c:421:		rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
fs/kernfs/dir.c:441:		rwsem_release(&kn->dep_map, 1, _RET_IP_);
fs/kernfs/dir.c:468:		rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
fs/kernfs/dir.c:470:			lock_contended(&kn->dep_map, _RET_IP_);
fs/kernfs/dir.c:478:		lock_acquired(&kn->dep_map, _RET_IP_);
fs/kernfs/dir.c:479:		rwsem_release(&kn->dep_map, 1, _RET_IP_);
fs/kernfs/dir.c:1385:		rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
fs/kernfs/file.c:1000:		lockdep_init_map(&kn->dep_map, "s_active", key, 0);

Bart.
Steven Rostedt Aug. 17, 2017, 8:56 p.m. UTC | #5
On Thu, 17 Aug 2017 20:41:10 +0000
Bart Van Assche <Bart.VanAssche@wdc.com> wrote:

> On Thu, 2017-08-17 at 16:30 -0400, Steven Rostedt wrote:
> > On Thu, 17 Aug 2017 12:24:39 -0400
> > Waiman Long <longman@redhat.com> wrote:
> >   
> > > On 08/17/2017 09:34 AM, Steven Rostedt wrote:  
> > > > On Wed, 16 Aug 2017 16:40:40 -0400
> > > > Waiman Long <longman@redhat.com> wrote:
> > > >    
> > > > > The lockdep code had reported the following unsafe locking scenario:
> > > > > 
> > > > >        CPU0                    CPU1
> > > > >        ----                    ----
> > > > >   lock(s_active#228);
> > > > >                                lock(&bdev->bd_mutex/1);
> > > > >                                lock(s_active#228);
> > > > >   lock(&bdev->bd_mutex);    
> > > > 
> > > > Can you show the exact locations of these locks. I have no idea where
> > > > this "s_active" is.    
> > > 
> > > The s_active isn't an actual lock. It is a reference count (kn->count)
> > > on the sysfs (kernfs) file. Removal of a sysfs file, however, require
> > > a wait until all the references are gone. The reference count is
> > > treated like a rwsem using lockdep instrumentation code.  
> > 
> > Which kernel is this? I don't see any lockdep annotation around
> > kn->count (doing a git grep, I find it referenced in fs/kernfs/dir.c)  
> 
> As far as I know the s_active lockdep annotations were introduced in 2007
> through commit 0ab66088c855 ("sysfs: implement sysfs_dirent active reference
> and immediate disconnect"). Today these annotations exist in kernfs:
> $ git grep -nHw dep_map fs/kernfs
> fs/kernfs/dir.c:421:		rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
> fs/kernfs/dir.c:441:		rwsem_release(&kn->dep_map, 1, _RET_IP_);
> fs/kernfs/dir.c:468:		rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
> fs/kernfs/dir.c:470:			lock_contended(&kn->dep_map, _RET_IP_);
> fs/kernfs/dir.c:478:		lock_acquired(&kn->dep_map, _RET_IP_);
> fs/kernfs/dir.c:479:		rwsem_release(&kn->dep_map, 1, _RET_IP_);
> fs/kernfs/dir.c:1385:		rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
> fs/kernfs/file.c:1000:		lockdep_init_map(&kn->dep_map, "s_active", key, 0);
> 

Ah thanks. I was searching for the wrong thing. :-/

-- Steve
Steven Rostedt Aug. 17, 2017, 9:10 p.m. UTC | #6
On Thu, 17 Aug 2017 12:24:39 -0400
Waiman Long <longman@redhat.com> wrote:

> >> + * sysfs file and then acquiring the bd_mutex. Deleting a block device
> >> + * requires acquiring the bd_mutex and then waiting for all the sysfs
> >> + * references to be gone. This can lead to deadlock if both operations
> >> + * happen simultaneously. To avoid this problem, read/write to the
> >> + * the tracing sysfs files can now fail if the bd_mutex cannot be
> >> + * acquired while a deletion operation is in progress.
> >> + *
> >> + * A mutex trylock loop is used assuming that tracing sysfs operations  
> > A mutex trylock loop is not enough to stop a deadlock. But I'm guessing
> > the undocumented bd_deleting may prevent that.  
> 
> Yes, that is what the bd_deleting flag is for.
> 
> I was thinking about failing the sysfs operation after a certain number
> of trylock attempts, but then it will require changes to user space code
> to handle the occasional failures. Finally I decided to fail it only
> when a delete operation is in progress as all hopes are lost in this case.

Actually, why not just fail the attr read on deletion? If it is being
deleted, and one is reading the attribute, perhaps -ENODEV is the
proper return value?

> 
> The root cause is the lock inversion under this circumstance. I think
> modifying the blk_trace code has the least impact overall. I agree that
> the code is ugly. If you have a better suggestion, I will certainly like
> to hear it.

Instead of playing games with taking the lock, the only way this race
is hit, is if the partition is being deleted and the sysfs attribute is
being read at the same time, correct? In that case, just return
-ENODEV, and be done with it.

-- Steve
Waiman Long Aug. 17, 2017, 9:27 p.m. UTC | #7
On 08/17/2017 05:10 PM, Steven Rostedt wrote:
> On Thu, 17 Aug 2017 12:24:39 -0400
> Waiman Long <longman@redhat.com> wrote:
>
>>>> + * sysfs file and then acquiring the bd_mutex. Deleting a block device
>>>> + * requires acquiring the bd_mutex and then waiting for all the sysfs
>>>> + * references to be gone. This can lead to deadlock if both operations
>>>> + * happen simultaneously. To avoid this problem, read/write to the
>>>> + * the tracing sysfs files can now fail if the bd_mutex cannot be
>>>> + * acquired while a deletion operation is in progress.
>>>> + *
>>>> + * A mutex trylock loop is used assuming that tracing sysfs operations  
>>> A mutex trylock loop is not enough to stop a deadlock. But I'm guessing
>>> the undocumented bd_deleting may prevent that.  
>> Yes, that is what the bd_deleting flag is for.
>>
>> I was thinking about failing the sysfs operation after a certain number
>> of trylock attempts, but then it will require changes to user space code
>> to handle the occasional failures. Finally I decided to fail it only
>> when a delete operation is in progress as all hopes are lost in this case.
> Actually, why not just fail the attr read on deletion? If it is being
> deleted, and one is reading the attribute, perhaps -ENODEV is the
> proper return value?
>
>> The root cause is the lock inversion under this circumstance. I think
>> modifying the blk_trace code has the least impact overall. I agree that
>> the code is ugly. If you have a better suggestion, I will certainly like
>> to hear it.
> Instead of playing games with taking the lock, the only way this race
> is hit, is if the partition is being deleted and the sysfs attribute is
> being read at the same time, correct? In that case, just return
> -ENODEV, and be done with it.
>
> -- Steve

That is actually what the patch is trying to do by checking for the
deletion flag in the mutex_trylock loop. Please note that a mutex does
not guarantee FIFO ordering of lock acquisition. As a result, CPU1 may
call mutex_lock() and wait for it while CPU2 sets the deletion flag
later and still gets the mutex first, ahead of CPU1. So checking for
the deletion flag before taking the mutex isn't enough.

Cheers,
Longman
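
To make the window concrete, the interleaving being described looks
roughly like this (illustrative sketch; the function names are from the
patch above):

   CPU1 (sysfs attr read)          CPU2 (BLKPG_DEL_PARTITION)
   ----------------------          --------------------------
   holds s_active reference
   sees bd_deleting == 0
   mutex_lock(&bdev->bd_mutex)
     ... blocks behind other waiters
                                   smp_store_mb(bd_deleting, 1)
                                   mutex_lock(&bdev->bd_mutex)  <- acquired first
                                   delete_partition()
                                     waits for s_active refs    -> deadlock

Since CPU1 is already committed to sleeping in mutex_lock(), only a
trylock loop lets it re-check the flag after each failed attempt.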
Steven Rostedt Aug. 17, 2017, 9:30 p.m. UTC | #8
On Thu, 17 Aug 2017 17:10:07 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:


> Instead of playing games with taking the lock, the only way this race
> is hit, is if the partition is being deleted and the sysfs attribute is
> being read at the same time, correct? In that case, just return
> -ENODEV, and be done with it.

Never mind, that won't work. Too bad there's no mutex_lock_timeout()
that we could use in a loop. It would solve the issue of forward
progress with RT tasks, and would break out after a timeout in case of
deadlock.

-- Steve
Steven Rostedt Aug. 17, 2017, 10:13 p.m. UTC | #9
On Thu, 17 Aug 2017 17:27:22 -0400
Waiman Long <longman@redhat.com> wrote:


> It is actually what the patch is trying to do by checking for the
> deletion flag in the mutex_trylock loop. Please note that mutex does not
> guarantee FIFO ordering of lock acquisition. As a result, cpu1 may call
> mutex_lock() and wait for it while cpu2 can set the deletion flag later
> and get the mutex first before cpu1. So checking for the deletion flag
> before taking the mutex isn't enough.

Yeah, I figured that out already (crossed emails). BTW, how did you
trigger this warning? I'm playing around with adding loop devices,
volume groups, and logical volumes, reading the trace files created
in the sysfs directory, and then removing those items, but it's not
triggering the "delete" path. What operation deletes the partition?

Thanks

-- Steve
Steven Rostedt Aug. 17, 2017, 10:18 p.m. UTC | #10
On Thu, 17 Aug 2017 18:13:20 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Thu, 17 Aug 2017 17:27:22 -0400
> Waiman Long <longman@redhat.com> wrote:
> 
> 
> > It is actually what the patch is trying to do by checking for the
> > deletion flag in the mutex_trylock loop. Please note that mutex does not
> > guarantee FIFO ordering of lock acquisition. As a result, cpu1 may call
> > mutex_lock() and wait for it while cpu2 can set the deletion flag later
> > and get the mutex first before cpu1. So checking for the deletion flag
> > before taking the mutex isn't enough.  
> 
> Yeah, I figured that out already (crossed emails). BTW, how did you
> trigger this warning. I'm playing around with adding loop devices,
> volume groups, and logical volumes, and reading the trace files
> created in the sysfs directory, then removing those items, but it's
> not triggering the "delete" path. What operation deletes the partition?

I'm guessing that deleting an actual partition may work (unfortunately,
my test box has no partition to delete ;-) I'll find another box to
test on.

-- Steve
Steven Rostedt Aug. 17, 2017, 11:23 p.m. UTC | #11
On Thu, 17 Aug 2017 18:18:18 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Thu, 17 Aug 2017 18:13:20 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Thu, 17 Aug 2017 17:27:22 -0400
> > Waiman Long <longman@redhat.com> wrote:
> > 
> >   
> > > It is actually what the patch is trying to do by checking for the
> > > deletion flag in the mutex_trylock loop. Please note that mutex does not
> > > guarantee FIFO ordering of lock acquisition. As a result, cpu1 may call
> > > mutex_lock() and wait for it while cpu2 can set the deletion flag later
> > > and get the mutex first before cpu1. So checking for the deletion flag
> > > before taking the mutex isn't enough.    
> > 
> > Yeah, I figured that out already (crossed emails). BTW, how did you
> > trigger this warning. I'm playing around with adding loop devices,
> > volume groups, and logical volumes, and reading the trace files
> > created in the sysfs directory, then removing those items, but it's
> > not triggering the "delete" path. What operation deletes the partition?  
> 
> I'm guessing that deleting an actual partition may work (unfortunately,
> my test box has no partition to delete ;-) I'll find another box to
> test on.
>

OK, deleting a partition doesn't trigger the lockdep splat. But I also
added a printk in the BLKPG_DEL_PARTITION case switch, and it doesn't
print either. What command do I need to run to trigger that path?

Thanks,

-- Steve
Waiman Long Aug. 18, 2017, 1:42 p.m. UTC | #12
On 08/17/2017 07:23 PM, Steven Rostedt wrote:
> On Thu, 17 Aug 2017 18:18:18 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> On Thu, 17 Aug 2017 18:13:20 -0400
>> Steven Rostedt <rostedt@goodmis.org> wrote:
>>
>>> On Thu, 17 Aug 2017 17:27:22 -0400
>>> Waiman Long <longman@redhat.com> wrote:
>>>
>>>   
>>>> It is actually what the patch is trying to do by checking for the
>>>> deletion flag in the mutex_trylock loop. Please note that mutex does not
>>>> guarantee FIFO ordering of lock acquisition. As a result, cpu1 may call
>>>> mutex_lock() and wait for it while cpu2 can set the deletion flag later
>>>> and get the mutex first before cpu1. So checking for the deletion flag
>>>> before taking the mutex isn't enough.    
>>> Yeah, I figured that out already (crossed emails). BTW, how did you
>>> trigger this warning. I'm playing around with adding loop devices,
>>> volume groups, and logical volumes, and reading the trace files
>>> created in the sysfs directory, then removing those items, but it's
>>> not triggering the "delete" path. What operation deletes the partition?  
>> I'm guessing that deleting an actual partition may work (unfortunately,
>> my test box has no partition to delete ;-) I'll find another box to
>> test on.
>>
> OK, deleting a partition doesn't trigger the lockdep splat. But I also
> added a printk in the BLKPG_DEL_PARTITION case switch, which also
> doesn't print. What command do I need to do to trigger that path?
>
> Thanks,
>
> -- Steve

Attached is a reproducer that was used to trigger the warning. Some
tuning may be needed depending on the actual configuration of the test
machine.

Cheers,
Longman
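
The attachment itself is not carried in this archive. For orientation,
the delete path in question is driven by the BLKPG ioctl with
BLKPG_DEL_PARTITION (util-linux's delpart and partx -d go through the
same ioctl); a minimal userspace sketch, with the device path and
partition number as placeholders:

#include <fcntl.h>
#include <linux/blkpg.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct blkpg_partition part;
	struct blkpg_ioctl_arg arg;
	/* Open the whole-disk device, not the partition node. */
	int fd = open("/dev/loop0", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&part, 0, sizeof(part));
	part.pno = 1;			/* partition number to delete */
	memset(&arg, 0, sizeof(arg));
	arg.op = BLKPG_DEL_PARTITION;
	arg.datalen = sizeof(part);
	arg.data = &part;
	if (ioctl(fd, BLKPG, &arg) < 0)
		perror("BLKPG_DEL_PARTITION");
	close(fd);
	return 0;
}

Racing this in a loop against concurrent reads of the partition's
trace attributes (e.g. /sys/block/loop0/loop0p1/trace/enable) is
presumably the kind of load the reproducer generates.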
Waiman Long Aug. 18, 2017, 1:55 p.m. UTC | #13
On 08/17/2017 05:30 PM, Steven Rostedt wrote:
> On Thu, 17 Aug 2017 17:10:07 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
>
>> Instead of playing games with taking the lock, the only way this race
>> is hit, is if the partition is being deleted and the sysfs attribute is
>> being read at the same time, correct? In that case, just return
>> -ENODEV, and be done with it.
> Nevermind that wont work. Too bad there's not a mutex_lock_timeout()
> that we could use in a loop. It would solve the issue of forward
> progress with RT tasks, and will break after a timeout in case of
> deadlock.
>
> -- Steve

I think it would be useful to have mutex_timed_lock(). The RT-mutex does
have a timed version, so I guess it shouldn't be hard to implement one
for the regular mutex. I can take a shot at doing that.

Thanks,
Longman
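
Mainline has no mutex_timed_lock()/mutex_lock_timeout() at this point.
A rough sketch of an open-coded approximation, purely as an assumption
of the intended semantics (the helper name and return values are
invented):

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/mutex.h>
#include <linux/sched/signal.h>

/* Hypothetical helper, not an existing API: poll mutex_trylock() until
 * the lock is taken, a signal arrives, or the timeout expires. */
static int mutex_trylock_timeout(struct mutex *lock, unsigned int timeout_ms)
{
	unsigned long deadline = jiffies + msecs_to_jiffies(timeout_ms);

	while (!mutex_trylock(lock)) {
		if (time_after(jiffies, deadline))
			return -ETIMEDOUT;
		if (signal_pending(current))
			return -EINTR;
		usleep_range(10, 20);	/* really sleep so lower-prio tasks run */
	}
	return 0;
}

As Bart points out below, even such a helper would only paper over the
lock inversion rather than remove it.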
Bart Van Assche Aug. 18, 2017, 4:21 p.m. UTC | #14
On Fri, 2017-08-18 at 09:55 -0400, Waiman Long wrote:
> On 08/17/2017 05:30 PM, Steven Rostedt wrote:
> > On Thu, 17 Aug 2017 17:10:07 -0400
> > Steven Rostedt <rostedt@goodmis.org> wrote:
> > > Instead of playing games with taking the lock, the only way this race
> > > is hit, is if the partition is being deleted and the sysfs attribute is
> > > being read at the same time, correct? In that case, just return
> > > -ENODEV, and be done with it.
> > 
> > Nevermind that wont work. Too bad there's not a mutex_lock_timeout()
> > that we could use in a loop. It would solve the issue of forward
> > progress with RT tasks, and will break after a timeout in case of
> > deadlock.
> 
> I think it will be useful to have mutex_timed_lock(). RT-mutex does have
> a timed version, so I guess it shouldn't be hard to implement one for
> mutex. I can take a shot at trying to do that.


(just caught up with the entire e-mail thread)

Sorry Waiman, but personally I thoroughly detest loops around mutex_trylock() or
mutex_timed_lock(), because such loops are usually used to paper over a problem
instead of fixing the root cause. What I understood from the comment in v1 of your
patch is that bd_mutex is not only held during block device creation and removal
but is additionally obtained inside sysfs attribute callback methods? That pattern
is guaranteed to lead to deadlocks. Since the block device removal code waits until
all sysfs callback methods have finished, there is no need to protect against block
device removal inside the sysfs callback methods. My proposal is to split bd_mutex:
one global mutex that serializes block device creation and removal, and one mutex
per block device that serializes changes to a single block device. Obtaining the
global mutex from inside a block device sysfs callback function is not safe, but
obtaining the per-block-device mutex from inside a sysfs callback function is safe.

Bart.
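
A minimal sketch of the split being proposed (all names invented for
illustration; this is not an actual patch):

#include <linux/mutex.h>

/* Global: serializes block device creation and removal. Must never be
 * taken from inside a sysfs attribute callback. */
static DEFINE_MUTEX(bdev_add_remove_mutex);

/* Per-device: serializes state changes on one block device. Safe to
 * take from a sysfs callback, because removal waits for callbacks to
 * finish without holding this lock. */
struct block_device_sketch {
	struct mutex	bd_state_mutex;
	unsigned	bd_openers;	/* example of per-device state */
};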
Waiman Long Aug. 18, 2017, 5:22 p.m. UTC | #15
On 08/18/2017 12:21 PM, Bart Van Assche wrote:
> On Fri, 2017-08-18 at 09:55 -0400, Waiman Long wrote:
>> On 08/17/2017 05:30 PM, Steven Rostedt wrote:
>>> On Thu, 17 Aug 2017 17:10:07 -0400
>>> Steven Rostedt <rostedt@goodmis.org> wrote:
>>>> Instead of playing games with taking the lock, the only way this race
>>>> is hit, is if the partition is being deleted and the sysfs attribute is
>>>> being read at the same time, correct? In that case, just return
>>>> -ENODEV, and be done with it.
>>> Nevermind that wont work. Too bad there's not a mutex_lock_timeout()
>>> that we could use in a loop. It would solve the issue of forward
>>> progress with RT tasks, and will break after a timeout in case of
>>> deadlock.
>> I think it will be useful to have mutex_timed_lock(). RT-mutex does have
>> a timed version, so I guess it shouldn't be hard to implement one for
>> mutex. I can take a shot at trying to do that.
> (just caught up with the entire e-mail thread)
>
> Sorry Waiman but personally I thoroughly detest loops around mutex_trylock() or
> mutex_timed_lock() because such loops are usually used to paper over a problem
> instead of fixing the root cause. What I understood from the comment in v1 of your
> patch is that bd_mutex is not only held during block device creation and removal
> but additionally that bd_mutex is obtained inside sysfs attribute callback methods?
> That pattern is guaranteed to lead to deadlocks. Since the block device removal
> code waits until all sysfs callback methods have finished there is no need to
> protect against block device removal inside the sysfs callback methods. My proposal

You are right. We don't really need to take the bd_mutex, as the fact
that we are inside a sysfs callback method already guarantees that the
block device won't go away.

> is to split bd_mutex: one global mutex that serializes block device creation and
> removal and one mutex per block device that serializes changes to a single block
> device. Obtaining the global mutex from inside a block device sysfs callback
> function is not safe but obtaining the per-block-device mutex from inside a sysfs
> callback function is safe.
>
> Bart.

The bd_mutex we are talking about here is already per block device. I am
thinking about having a global blktrace mutex that is used to serialize
reads and writes of the blktrace attributes. Since the blktrace sysfs
files are not supposed to be frequently accessed, having a global lock
shouldn't cause any problem.

Thanks,
Longman
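
A sketch of that idea: a single global mutex taken by the blktrace
attribute handlers in place of bdev->bd_mutex, so the s_active ->
bd_mutex ordering never forms in this path (sketch only; that the
handlers can safely drop bd_mutex is the assumption discussed above):

#include <linux/mutex.h>

/* Serializes all blktrace sysfs reads/writes; these are rare, so one
 * global lock should not be a contention problem. */
static DEFINE_MUTEX(blk_trace_mutex);

/* In sysfs_blk_trace_attr_show()/_store(), the change would be:
 *
 *	-	mutex_lock(&bdev->bd_mutex);
 *	+	mutex_lock(&blk_trace_mutex);
 *	...
 *	-	mutex_unlock(&bdev->bd_mutex);
 *	+	mutex_unlock(&blk_trace_mutex);
 *
 * so the handlers never wait on bd_mutex while the delete path waits
 * for their s_active references to drain. */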
Steven Rostedt Aug. 18, 2017, 5:26 p.m. UTC | #16
On Fri, 18 Aug 2017 16:21:46 +0000
Bart Van Assche <Bart.VanAssche@wdc.com> wrote:

> Sorry Waiman but personally I thoroughly detest loops around mutex_trylock() or
> mutex_timed_lock() because such loops are usually used to paper over a problem
> instead of fixing the root cause.

100% agree.

-- Steve

Patch

diff --git a/block/ioctl.c b/block/ioctl.c
index 0de02ee..b920329 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -86,12 +86,15 @@  static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 				return -EBUSY;
 			}
 			/* all seems OK */
+			smp_store_mb(bdev->bd_deleting, 1);
 			fsync_bdev(bdevp);
 			invalidate_bdev(bdevp);
 
 			mutex_lock_nested(&bdev->bd_mutex, 1);
 			delete_partition(disk, partno);
 			mutex_unlock(&bdev->bd_mutex);
+			smp_store_mb(bdev->bd_deleting, 0);
+
 			mutex_unlock(&bdevp->bd_mutex);
 			bdput(bdevp);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6e1fd5d..c2ba35e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -427,6 +427,7 @@  struct block_device {
 #endif
 	struct block_device *	bd_contains;
 	unsigned		bd_block_size;
+	int			bd_deleting;
 	struct hd_struct *	bd_part;
 	/* number of times partitions within this device have been opened. */
 	unsigned		bd_part_count;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f8..b2dffa9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -27,6 +27,8 @@ 
 #include <linux/time.h>
 #include <linux/uaccess.h>
 #include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/sched/rt.h>
 
 #include "../../block/blk.h"
 
@@ -1605,6 +1607,23 @@  static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
 	return bdev_get_queue(bdev);
 }
 
+/*
+ * Read/write to the tracing sysfs file requires taking references to the
+ * sysfs file and then acquiring the bd_mutex. Deleting a block device
+ * requires acquiring the bd_mutex and then waiting for all the sysfs
+ * references to be gone. This can lead to deadlock if both operations
+ * happen simultaneously. To avoid this problem, read/write to the
+ * tracing sysfs files can now fail if the bd_mutex cannot be
+ * acquired while a deletion operation is in progress.
+ *
+ * A mutex trylock loop is used, assuming that tracing sysfs operations
+ * aren't frequent enough to cause any contention problem.
+ *
+ * For RT tasks, a running high priority task will prevent any lower
+ * priority RT tasks from being run. So they do need to actually sleep
+ * when the trylock fails to allow lower priority tasks to make forward
+ * progress.
+ */
 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -1622,7 +1641,15 @@  static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	while (!mutex_trylock(&bdev->bd_mutex)) {
+		if (READ_ONCE(bdev->bd_deleting))
+			goto out_bdput;
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			goto out_bdput;
+		}
+		rt_task(current) ? usleep_range(10, 10) : schedule();
+	}
 
 	if (attr == &dev_attr_enable) {
 		ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1683,7 +1710,15 @@  static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	while (!mutex_trylock(&bdev->bd_mutex)) {
+		if (READ_ONCE(bdev->bd_deleting))
+			goto out_bdput;
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			goto out_bdput;
+		}
+		rt_task(current) ? usleep_range(10, 10) : schedule();
+	}
 
 	if (attr == &dev_attr_enable) {
 		if (value)