diff mbox series

[RFC] blk-mq: Clean up references when freeing rqs

Message ID 1606827738-238646-1-git-send-email-john.garry@huawei.com (mailing list archive)
State New, archived
Headers show
Series [RFC] blk-mq: Clean up references when freeing rqs | expand

Commit Message

John Garry Dec. 1, 2020, 1:02 p.m. UTC
It has been reported many times that a use-after-free can be intermittently
found when iterating busy requests:

- https://lore.kernel.org/linux-block/8376443a-ec1b-0cef-8244-ed584b96fa96@huawei.com/
- https://lore.kernel.org/linux-block/5c3ac5af-ed81-11e4-fee3-f92175f14daf@acm.org/T/#m6c1ac11540522716f645d004e2a5a13c9f218908
- https://lore.kernel.org/linux-block/04e2f9e8-79fa-f1cb-ab23-4a15bf3f64cc@kernel.dk/

The issue is that when we switch scheduler or change queue nr_requests,
the driver tagset may keep references to the stale requests.

As a solution, clean up any references to those requests in the driver
tagset when freeing. This is done with a cmpxchg to make safe any race
with setting the driver tagset request from another queue.

Signed-off-by: John Garry <john.garry@huawei.com>
--
Set as RFC as I need to test more. And not sure on solution method, as
Bart had another idea.

Comments

Ming Lei Dec. 2, 2020, 3:31 a.m. UTC | #1
On Tue, Dec 01, 2020 at 09:02:18PM +0800, John Garry wrote:
> It has been reported many times that a use-after-free can be intermittently
> found when iterating busy requests:
> 
> - https://lore.kernel.org/linux-block/8376443a-ec1b-0cef-8244-ed584b96fa96@huawei.com/
> - https://lore.kernel.org/linux-block/5c3ac5af-ed81-11e4-fee3-f92175f14daf@acm.org/T/#m6c1ac11540522716f645d004e2a5a13c9f218908
> - https://lore.kernel.org/linux-block/04e2f9e8-79fa-f1cb-ab23-4a15bf3f64cc@kernel.dk/
> 
> The issue is that when we switch scheduler or change queue nr_requests,
> the driver tagset may keep references to the stale requests.
> 
> As a solution, clean up any references to those requests in the driver
> tagset when freeing. This is done with a cmpxchg to make safe any race
> with setting the driver tagset request from another queue.
> 
> Signed-off-by: John Garry <john.garry@huawei.com>
> --
> Set as RFC as I need to test more. And not sure on solution method, as
> Bart had another idea.
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index d1eafe2c045c..9b042c7036b3 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -621,7 +621,7 @@ void blk_mq_sched_free_requests(struct request_queue *q)
>  
>  	queue_for_each_hw_ctx(q, hctx, i) {
>  		if (hctx->sched_tags)
> -			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
> +			blk_mq_free_rqs_ext(q->tag_set, hctx->sched_tags, i, hctx->tags);
>  	}
>  }
>  
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 9c92053e704d..562db72e7d79 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -576,7 +576,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
>  			return -ENOMEM;
>  		}
>  
> -		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
> +		blk_mq_free_rqs_ext(set, *tagsptr, hctx->queue_num, hctx->tags);
>  		blk_mq_free_rq_map(*tagsptr, flags);
>  		*tagsptr = new;
>  	} else {
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 55bcee5dc032..f3aad695cd25 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2271,8 +2271,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
>  	return BLK_QC_T_NONE;
>  }
>  
> -void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> -		     unsigned int hctx_idx)
> +void blk_mq_free_rqs_ext(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> +		     unsigned int hctx_idx, struct blk_mq_tags *references)
>  {
>  	struct page *page;
>  
> @@ -2281,10 +2281,13 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>  
>  		for (i = 0; i < tags->nr_tags; i++) {
>  			struct request *rq = tags->static_rqs[i];
> +			int j;
>  
>  			if (!rq)
>  				continue;
>  			set->ops->exit_request(set, rq, hctx_idx);
> +			for (j = 0; references && j < references->nr_tags; j++)
> +				cmpxchg(&references->rqs[j], rq, 0);

Seems you didn't address the comment in the following link:

	https://lore.kernel.org/linux-block/10331543-9e45-ae63-8cdb-17e5a2a3b7ef@huawei.com/

The request to be freed may still be refered in another path, such as blk_mq_queue_tag_busy_iter
or blk_mq_tagset_busy_iter(), and cmpxchg() doesn't drain/wait for other refers.

Thanks,
Ming
John Garry Dec. 2, 2020, 11:18 a.m. UTC | #2
On 02/12/2020 03:31, Ming Lei wrote:
> On Tue, Dec 01, 2020 at 09:02:18PM +0800, John Garry wrote:
>> It has been reported many times that a use-after-free can be intermittently
>> found when iterating busy requests:
>>
>> - https://lore.kernel.org/linux-block/8376443a-ec1b-0cef-8244-ed584b96fa96@huawei.com/
>> - https://lore.kernel.org/linux-block/5c3ac5af-ed81-11e4-fee3-f92175f14daf@acm.org/T/#m6c1ac11540522716f645d004e2a5a13c9f218908
>> - https://lore.kernel.org/linux-block/04e2f9e8-79fa-f1cb-ab23-4a15bf3f64cc@kernel.dk/
>>
>> The issue is that when we switch scheduler or change queue nr_requests,
>> the driver tagset may keep references to the stale requests.
>>
>> As a solution, clean up any references to those requests in the driver
>> tagset when freeing. This is done with a cmpxchg to make safe any race
>> with setting the driver tagset request from another queue.
>>
>> Signed-off-by: John Garry <john.garry@huawei.com>
>> --
>> Set as RFC as I need to test more. And not sure on solution method, as
>> Bart had another idea.
>>
>> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
>> index d1eafe2c045c..9b042c7036b3 100644
>> --- a/block/blk-mq-sched.c
>> +++ b/block/blk-mq-sched.c
>> @@ -621,7 +621,7 @@ void blk_mq_sched_free_requests(struct request_queue *q)
>>   
>>   	queue_for_each_hw_ctx(q, hctx, i) {
>>   		if (hctx->sched_tags)
>> -			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
>> +			blk_mq_free_rqs_ext(q->tag_set, hctx->sched_tags, i, hctx->tags);
>>   	}
>>   }
>>   
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index 9c92053e704d..562db72e7d79 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -576,7 +576,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
>>   			return -ENOMEM;
>>   		}
>>   
>> -		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
>> +		blk_mq_free_rqs_ext(set, *tagsptr, hctx->queue_num, hctx->tags);
>>   		blk_mq_free_rq_map(*tagsptr, flags);
>>   		*tagsptr = new;
>>   	} else {
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 55bcee5dc032..f3aad695cd25 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -2271,8 +2271,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
>>   	return BLK_QC_T_NONE;
>>   }
>>   
>> -void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>> -		     unsigned int hctx_idx)
>> +void blk_mq_free_rqs_ext(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>> +		     unsigned int hctx_idx, struct blk_mq_tags *references)
>>   {
>>   	struct page *page;
>>   
>> @@ -2281,10 +2281,13 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>>   
>>   		for (i = 0; i < tags->nr_tags; i++) {
>>   			struct request *rq = tags->static_rqs[i];
>> +			int j;
>>   
>>   			if (!rq)
>>   				continue;
>>   			set->ops->exit_request(set, rq, hctx_idx);
>> +			for (j = 0; references && j < references->nr_tags; j++)
>> +				cmpxchg(&references->rqs[j], rq, 0);
> 
> Seems you didn't address the comment in the following link:
> 
> 	https://lore.kernel.org/linux-block/10331543-9e45-ae63-8cdb-17e5a2a3b7ef@huawei.com/
> 
> The request to be freed may still be refered in another path, such as blk_mq_queue_tag_busy_iter
> or blk_mq_tagset_busy_iter(), and cmpxchg() doesn't drain/wait for other refers.
> 

Hi Ming,

Yeah, so I said that was another problem which you mentioned there, 
which I'm not addressing, but I don't think that I'm making thing worse 
here.

So AFAICS, the blk-mq/sched code doesn't wait for any "readers" to be 
finished, such as those running blk_mq_queue_tag_busy_iter or 
blk_mq_tagset_busy_iter() in another context.

So how about the idea of introducing some synchronization primitive, 
such as semaphore, which those "readers" must grab and release at start 
and end (of iter), to ensure the requests are not freed during the 
iteration?

Thanks,
John
Ming Lei Dec. 3, 2020, 12:55 a.m. UTC | #3
On Wed, Dec 02, 2020 at 11:18:31AM +0000, John Garry wrote:
> On 02/12/2020 03:31, Ming Lei wrote:
> > On Tue, Dec 01, 2020 at 09:02:18PM +0800, John Garry wrote:
> > > It has been reported many times that a use-after-free can be intermittently
> > > found when iterating busy requests:
> > > 
> > > - https://lore.kernel.org/linux-block/8376443a-ec1b-0cef-8244-ed584b96fa96@huawei.com/
> > > - https://lore.kernel.org/linux-block/5c3ac5af-ed81-11e4-fee3-f92175f14daf@acm.org/T/#m6c1ac11540522716f645d004e2a5a13c9f218908
> > > - https://lore.kernel.org/linux-block/04e2f9e8-79fa-f1cb-ab23-4a15bf3f64cc@kernel.dk/
> > > 
> > > The issue is that when we switch scheduler or change queue nr_requests,
> > > the driver tagset may keep references to the stale requests.
> > > 
> > > As a solution, clean up any references to those requests in the driver
> > > tagset when freeing. This is done with a cmpxchg to make safe any race
> > > with setting the driver tagset request from another queue.
> > > 
> > > Signed-off-by: John Garry <john.garry@huawei.com>
> > > --
> > > Set as RFC as I need to test more. And not sure on solution method, as
> > > Bart had another idea.
> > > 
> > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > index d1eafe2c045c..9b042c7036b3 100644
> > > --- a/block/blk-mq-sched.c
> > > +++ b/block/blk-mq-sched.c
> > > @@ -621,7 +621,7 @@ void blk_mq_sched_free_requests(struct request_queue *q)
> > >   	queue_for_each_hw_ctx(q, hctx, i) {
> > >   		if (hctx->sched_tags)
> > > -			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
> > > +			blk_mq_free_rqs_ext(q->tag_set, hctx->sched_tags, i, hctx->tags);
> > >   	}
> > >   }
> > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > > index 9c92053e704d..562db72e7d79 100644
> > > --- a/block/blk-mq-tag.c
> > > +++ b/block/blk-mq-tag.c
> > > @@ -576,7 +576,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
> > >   			return -ENOMEM;
> > >   		}
> > > -		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
> > > +		blk_mq_free_rqs_ext(set, *tagsptr, hctx->queue_num, hctx->tags);
> > >   		blk_mq_free_rq_map(*tagsptr, flags);
> > >   		*tagsptr = new;
> > >   	} else {
> > > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > > index 55bcee5dc032..f3aad695cd25 100644
> > > --- a/block/blk-mq.c
> > > +++ b/block/blk-mq.c
> > > @@ -2271,8 +2271,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
> > >   	return BLK_QC_T_NONE;
> > >   }
> > > -void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> > > -		     unsigned int hctx_idx)
> > > +void blk_mq_free_rqs_ext(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> > > +		     unsigned int hctx_idx, struct blk_mq_tags *references)
> > >   {
> > >   	struct page *page;
> > > @@ -2281,10 +2281,13 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> > >   		for (i = 0; i < tags->nr_tags; i++) {
> > >   			struct request *rq = tags->static_rqs[i];
> > > +			int j;
> > >   			if (!rq)
> > >   				continue;
> > >   			set->ops->exit_request(set, rq, hctx_idx);
> > > +			for (j = 0; references && j < references->nr_tags; j++)
> > > +				cmpxchg(&references->rqs[j], rq, 0);
> > 
> > Seems you didn't address the comment in the following link:
> > 
> > 	https://lore.kernel.org/linux-block/10331543-9e45-ae63-8cdb-17e5a2a3b7ef@huawei.com/
> > 
> > The request to be freed may still be refered in another path, such as blk_mq_queue_tag_busy_iter
> > or blk_mq_tagset_busy_iter(), and cmpxchg() doesn't drain/wait for other refers.
> > 
> 
> Hi Ming,
> 
> Yeah, so I said that was another problem which you mentioned there, which
> I'm not addressing, but I don't think that I'm making thing worse here.

The thing is that this patch does not fix the issue completely.

> 
> So AFAICS, the blk-mq/sched code doesn't wait for any "readers" to be
> finished, such as those running blk_mq_queue_tag_busy_iter or
> blk_mq_tagset_busy_iter() in another context.
> 
> So how about the idea of introducing some synchronization primitive, such as
> semaphore, which those "readers" must grab and release at start and end (of
> iter), to ensure the requests are not freed during the iteration?

It looks good, however devil is in details, please make into patch for
review.


thanks,
Ming
John Garry Dec. 3, 2020, 9:26 a.m. UTC | #4
On 03/12/2020 00:55, Ming Lei wrote:

Hi Ming,

>> Yeah, so I said that was another problem which you mentioned there, which
>> I'm not addressing, but I don't think that I'm making thing worse here.
> The thing is that this patch does not fix the issue completely.
> 
>> So AFAICS, the blk-mq/sched code doesn't wait for any "readers" to be
>> finished, such as those running blk_mq_queue_tag_busy_iter or
>> blk_mq_tagset_busy_iter() in another context.
>>
>> So how about the idea of introducing some synchronization primitive, such as
>> semaphore, which those "readers" must grab and release at start and end (of
>> iter), to ensure the requests are not freed during the iteration?
> It looks good, however devil is in details, please make into patch for
> review.

OK, but another thing to say is that I need to find a somewhat reliable 
reproducer for the potential problem you mention. So far this patch 
solves the issue I see (in that kasan stops warning). Let me analyze 
this a bit further.

Thanks,
John
John Garry Dec. 8, 2020, 11:36 a.m. UTC | #5
On 03/12/2020 09:26, John Garry wrote:
> On 03/12/2020 00:55, Ming Lei wrote:
> 
> Hi Ming,
> 
>>> Yeah, so I said that was another problem which you mentioned there, 
>>> which
>>> I'm not addressing, but I don't think that I'm making thing worse here.
>> The thing is that this patch does not fix the issue completely.
>>
>>> So AFAICS, the blk-mq/sched code doesn't wait for any "readers" to be
>>> finished, such as those running blk_mq_queue_tag_busy_iter or
>>> blk_mq_tagset_busy_iter() in another context.
>>>
>>> So how about the idea of introducing some synchronization primitive, 
>>> such as
>>> semaphore, which those "readers" must grab and release at start and 
>>> end (of
>>> iter), to ensure the requests are not freed during the iteration?
>> It looks good, however devil is in details, please make into patch for
>> review.
> 
> OK, but another thing to say is that I need to find a somewhat reliable 
> reproducer for the potential problem you mention. So far this patch 
> solves the issue I see (in that kasan stops warning). Let me analyze 
> this a bit further.
> 

Hi Ming,

I am just looking at this again, and have some doubt on your concern [0].

 From checking blk_mq_queue_tag_busy_iter() specifically, don't we 
actually guard against this with the q->q_usage_counter mechanism? That 
is, an agent needs to grab a q counter ref when attempting the iter. 
This will fail when the queue IO sched is being changed, as we freeze 
the queue during this time, which is when the requests are freed, so no 
agent can hold a reference to a freed request then. And same goes for 
blk_mq_update_nr_requests(), where we freeze the queue.

But I didn't see such a guard for blk_mq_tagset_busy_iter().

Thanks,
John

[0] https://lore.kernel.org/linux-block/20200826123453.GA126923@T590/

Ps. sorry for sending twice
John Garry Dec. 8, 2020, 5:36 p.m. UTC | #6
On 08/12/2020 11:36, John Garry wrote:
>>
>> OK, but another thing to say is that I need to find a somewhat 
>> reliable reproducer for the potential problem you mention. So far this 
>> patch solves the issue I see (in that kasan stops warning). Let me 
>> analyze this a bit further.
>>
> 
> Hi Ming,
> 
> I am just looking at this again, and have some doubt on your concern [0].
> 
>  From checking blk_mq_queue_tag_busy_iter() specifically, don't we 
> actually guard against this with the q->q_usage_counter mechanism? That 
> is, an agent needs to grab a q counter ref when attempting the iter. 
> This will fail when the queue IO sched is being changed, as we freeze 
> the queue during this time, which is when the requests are freed, so no 
> agent can hold a reference to a freed request then. And same goes for 
> blk_mq_update_nr_requests(), where we freeze the queue.
> 
> But I didn't see such a guard for blk_mq_tagset_busy_iter().
> 

So I was able to trigger a use-after-free BUG in 
blk_mq_tagset_busy_iter() even with my change, but I needed to add a 
delay, as follows:

--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -278,6 +278,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, 
unsigned int bitnr, void *data)
                 rq = tags->rqs[bitnr];
         if (!rq)
                 return true;
+       msleep(50);
         if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
             !blk_mq_request_started(rq))
                 return true;


And here's the splat:

[  319.771745] BUG: KASAN: use-after-free in bt_tags_iter+0xe0/0x128 

[  319.777832] Read of size 4 at addr ffff0010b6bd27cc by task more/1866 

[  319.784262]
[  319.785753] CPU: 61 PID: 1866 Comm: more Tainted: G        W 
5.10.0-rc4-18118-gaa7b9c30d8ff #1070
[  319.795312] Hardware name: Huawei Taishan 2280 /D05, BIOS Hisilicon 
D05 IT21 Nemo 2.0 RC0 04/18/2018
[  319.804437] Call trace:
[  319.806892]  dump_backtrace+0x0/0x2d0
[  319.810552]  show_stack+0x18/0x68
[  319.813865]  dump_stack+0x100/0x16c
[  319.817348]  print_address_description.constprop.12+0x6c/0x4e8
[  319.823176]  kasan_report+0x130/0x200
[  319.826831]  __asan_load4+0x9c/0xd8
[  319.830315]  bt_tags_iter+0xe0/0x128
[  319.833884]  __blk_mq_all_tag_iter+0x320/0x3a8
[  319.838320]  blk_mq_tagset_busy_iter+0x8c/0xd8
[  319.842760]  scsi_host_busy+0x88/0xb8
[  319.846418]  show_host_busy+0x1c/0x48
[  319.850079]  dev_attr_show+0x44/0x90
[  319.853655]  sysfs_kf_seq_show+0x128/0x1c8
[  319.857744]  kernfs_seq_show+0xa0/0xb8
[  319.861489]  seq_read_iter+0x1ec/0x6a0
[  319.865230]  seq_read+0x1d0/0x250
[  319.868539]  kernfs_fop_read+0x70/0x330
[  319.872369]  vfs_read+0xe4/0x250
[  319.875590]  ksys_read+0xc8/0x178
[  319.878898]  __arm64_sys_read+0x44/0x58
[  319.882730]  el0_svc_common.constprop.2+0xc4/0x1e8
[  319.887515]  do_el0_svc+0x90/0xa0
[  319.890824]  el0_sync_handler+0x128/0x178
[  319.894825]  el0_sync+0x158/0x180
[  319.898131]
[  319.899614] The buggy address belongs to the page:
[  319.904403] page:000000004e9e6864 refcount:0 mapcount:0 
mapping:0000000000000000 index:0x0 pfn:0x10b6bd2
[  319.913876] flags: 0xbfffc0000000000()
[  319.917626] raw: 0bfffc0000000000 0000000000000000 fffffe0000000000 
0000000000000000
[  319.925363] raw: 0000000000000000 0000000000000000 00000000ffffffff 
0000000000000000
[  319.933096] page dumped because: kasan: bad access detected
[  319.938658]
[  319.940141] Memory state around the buggy address:
[  319.944925]  ffff0010b6bd2680: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[  319.952139]  ffff0010b6bd2700: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[  319.959354] >ffff0010b6bd2780: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[  319.966566] ^
[  319.972131]  ffff0010b6bd2800: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[  319.979344]  ffff0010b6bd2880: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[  319.986557] 
==================================================================
[  319.993770] Disabling lock debugging due to kernel taint

So to trigger this, I start fio on a disk, and then have one script 
which constantly enables and disables an IO scheduler for that disk, and 
another which constantly reads /sys/class/scsi_host/host0/host_busy.

I need the delay to make triggering the issue quick, as the window is so 
small between the tag bit being cleared at the point the queue is being 
frozen, and clearing the reference in the tagset.

Anyway, solving this doesn't look trivial...

Thanks,
john
Ming Lei Dec. 9, 2020, 1:01 a.m. UTC | #7
On Tue, Dec 08, 2020 at 11:36:58AM +0000, John Garry wrote:
> On 03/12/2020 09:26, John Garry wrote:
> > On 03/12/2020 00:55, Ming Lei wrote:
> > 
> > Hi Ming,
> > 
> > > > Yeah, so I said that was another problem which you mentioned
> > > > there, which
> > > > I'm not addressing, but I don't think that I'm making thing worse here.
> > > The thing is that this patch does not fix the issue completely.
> > > 
> > > > So AFAICS, the blk-mq/sched code doesn't wait for any "readers" to be
> > > > finished, such as those running blk_mq_queue_tag_busy_iter or
> > > > blk_mq_tagset_busy_iter() in another context.
> > > > 
> > > > So how about the idea of introducing some synchronization
> > > > primitive, such as
> > > > semaphore, which those "readers" must grab and release at start
> > > > and end (of
> > > > iter), to ensure the requests are not freed during the iteration?
> > > It looks good, however devil is in details, please make into patch for
> > > review.
> > 
> > OK, but another thing to say is that I need to find a somewhat reliable
> > reproducer for the potential problem you mention. So far this patch
> > solves the issue I see (in that kasan stops warning). Let me analyze
> > this a bit further.
> > 
> 
> Hi Ming,
> 
> I am just looking at this again, and have some doubt on your concern [0].
> 
> From checking blk_mq_queue_tag_busy_iter() specifically, don't we actually
> guard against this with the q->q_usage_counter mechanism? That is, an agent
> needs to grab a q counter ref when attempting the iter. This will fail when
> the queue IO sched is being changed, as we freeze the queue during this
> time, which is when the requests are freed, so no agent can hold a reference
> to a freed request then. And same goes for blk_mq_update_nr_requests(),
> where we freeze the queue.

blk_mq_queue_tag_busy_iter() can be run on another request queue just
between one driver tag is allocated and updating the request map, so one
extra request reference still can be grabbed.

So looks only holding one queue's usage_counter doesn't help this issue, since
bt_for_each() always iterates on driver tags wide.

> 
> But I didn't see such a guard for blk_mq_tagset_busy_iter().

IMO there isn't real difference between the two iteration.


Thanks, 
Ming
John Garry Dec. 9, 2020, 9:55 a.m. UTC | #8
On 09/12/2020 01:01, Ming Lei wrote:
> blk_mq_queue_tag_busy_iter() can be run on another request queue just
> between one driver tag is allocated and updating the request map, so one
> extra request reference still can be grabbed.
> 
> So looks only holding one queue's usage_counter doesn't help this issue, since
> bt_for_each() always iterates on driver tags wide.
> 
>> But I didn't see such a guard for blk_mq_tagset_busy_iter().
> IMO there isn't real difference between the two iteration.

ok, I see. Let me try to recreate that one, which will prob again 
require artificial delays added.

Apart from this, my concern is that we come with for a solution, but 
it's a complicated solution and may not be accepted as this issue is not 
seen as a problem in practice.

Thanks,
John
Ming Lei Dec. 10, 2020, 2:07 a.m. UTC | #9
On Wed, Dec 09, 2020 at 09:55:30AM +0000, John Garry wrote:
> On 09/12/2020 01:01, Ming Lei wrote:
> > blk_mq_queue_tag_busy_iter() can be run on another request queue just
> > between one driver tag is allocated and updating the request map, so one
> > extra request reference still can be grabbed.
> > 
> > So looks only holding one queue's usage_counter doesn't help this issue, since
> > bt_for_each() always iterates on driver tags wide.
> > 
> > > But I didn't see such a guard for blk_mq_tagset_busy_iter().
> > IMO there isn't real difference between the two iteration.
> 
> ok, I see. Let me try to recreate that one, which will prob again require
> artificial delays added.
> 
> Apart from this, my concern is that we come with for a solution, but it's a
> complicated solution and may not be accepted as this issue is not seen as a
> problem in practice.

If that is the case, I'd suggest to consider the solution in the
following link:

https://lore.kernel.org/linux-block/20200820180335.3109216-1-ming.lei@redhat.com/

At least, the idea is simple, which can be extended to support allocate driver tags
request pool dynamically.


Thanks, 
Ming
John Garry Dec. 10, 2020, 10:44 a.m. UTC | #10
Hi Ming,

On 10/12/2020 02:07, Ming Lei wrote:
>> Apart from this, my concern is that we come with for a solution, but it's a
>> complicated solution and may not be accepted as this issue is not seen as a
>> problem in practice.
> If that is the case, I'd suggest to consider the solution in the
> following link:
> 
> https://lore.kernel.org/linux-block/20200820180335.3109216-1-ming.lei@redhat.com/
> 
> At least, the idea is simple, which can be extended to support allocate driver tags
> request pool dynamically.

As I see with your approach, we may still iterate a stale request, but 
it just has not been freed, so just no use-after-free BUG, right? Rather 
it is cached until all references dropped. It may be best solution.

So I'll try an experiment today to prove your concern about 
blk_mq_queue_tag_busy_iter(). Then look at possible solution which 
builds on patch in $subject, and compare.

Thanks,
John
John Garry Dec. 10, 2020, 12:22 p.m. UTC | #11
Hi Ming,

> 
> So I'll try an experiment today to prove your concern about 
> blk_mq_queue_tag_busy_iter(). Then look at possible solution which 
> builds on patch in $subject, and compare.

JFYI, I was able to trigger this:

[ 1651.324150] 
==================================================================
[ 1651.331400] BUG: KASAN: use-after-free in bt_iter+0xa0/0x120
[ 1651.337054] Read of size 8 at addr ffff00108d589300 by task fio/3052

[ 1651.344891] CPU: 32 PID: 3052 Comm: fio Tainted: GW 
5.10.0-rc4-64839-g2dcf1ee5054f #693
[ 1651.354281] Hardware name: Huawei Taishan 2280 /D05, BIOS Hisilicon 
D05 IT21 Nemo 2.0 RC0 04/18/2018
[ 1651.363407] Call trace:
[ 1651.365861]  dump_backtrace+0x0/0x2d0
[ 1651.369519]  show_stack+0x18/0x68
[ 1651.372833]  dump_stack+0x100/0x16c
[ 1651.376316]  print_address_description.constprop.12+0x6c/0x4e8
[ 1651.382146]  kasan_report+0x130/0x200
[ 1651.385801]  __asan_load8+0x9c/0xd8
[ 1651.389284]  bt_iter+0xa0/0x120
[ 1651.392419]  blk_mq_queue_tag_busy_iter+0x2d8/0x540
[ 1651.397293]  blk_mq_in_flight+0x80/0xb8
[ 1651.401121]  part_stat_show+0xd8/0x238
[ 1651.404867]  dev_attr_show+0x44/0x90
[ 1651.408439]  sysfs_kf_seq_show+0x128/0x1c8
[ 1651.412530]  kernfs_seq_show+0xa0/0xb8
[ 1651.416274]  seq_read_iter+0x1ec/0x6a0
[ 1651.420019]  seq_read+0x1d0/0x250
[ 1651.423331]  kernfs_fop_read+0x70/0x330
[ 1651.427161]  vfs_read+0xe4/0x250
[ 1651.430382]  ksys_read+0xc8/0x178
[ 1651.433690]  __arm64_sys_read+0x44/0x58
[ 1651.437524]  el0_svc_common.constprop.2+0xc4/0x1e8
[ 1651.442309]  do_el0_svc+0x90/0xa0
[ 1651.445619]  el0_sync_handler+0x128/0x178
[ 1651.449623]  el0_sync+0x158/0x180

[ 1651.454418] The buggy address belongs to the page:
[ 1651.459208] page:00000000140c0813 refcount:0 mapcount:0 
mapping:0000000000000000 index:0x0 pfn:0x108d589
[ 1651.468683] flags: 0xbfffc0000000000()
[ 1651.472432] raw: 0bfffc0000000000 0000000000000000 ffffffff42150201 
0000000000000000
[ 1651.480173] raw: 0000000000000000 0000000000000000 00000000ffffffff 
0000000000000000
[ 1651.487909] page dumped because: kasan: bad access detected

[ 1651.494956] Memory state around the buggy address:
[ 1651.499744]  ffff00108d589200: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[ 1651.506960]  ffff00108d589280: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[ 1651.514176] >ffff00108d589300: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[ 1651.521389]       ^
[ 1651.524611]  ffff00108d589380: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[ 1651.531827]  ffff00108d589400: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[ 1651.539044] 
==================================================================
[ 1651.546258] Disabling lock debugging due to kernel taint
john@ubuntu:~$

So I run fio on disk providing root partition and another disk, and 
constantly change IO scheduler on other disk.

I did also add this delay to trigger in reasonable timeframe:

+++ b/block/blk-mq-tag.c
@@ -215,8 +215,11 @@ static bool bt_iter(struct sbitmap *bitmap, 
unsigned int bitnr, void *data)
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
-  if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
-     return iter_data->fn(hctx, rq, iter_data->data, reserved);
+  if (rq) {
+     mdelay(50);
+     if (rq->q == hctx->queue && rq->mq_hctx == hctx)
+        return iter_data->fn(hctx, rq, iter_data->data, reserved);
+       }
        return true;
}


Thanks,
John
Ming Lei Dec. 11, 2020, 12:21 a.m. UTC | #12
On Thu, Dec 10, 2020 at 10:44:54AM +0000, John Garry wrote:
> Hi Ming,
> 
> On 10/12/2020 02:07, Ming Lei wrote:
> > > Apart from this, my concern is that we come with for a solution, but it's a
> > > complicated solution and may not be accepted as this issue is not seen as a
> > > problem in practice.
> > If that is the case, I'd suggest to consider the solution in the
> > following link:
> > 
> > https://lore.kernel.org/linux-block/20200820180335.3109216-1-ming.lei@redhat.com/
> > 
> > At least, the idea is simple, which can be extended to support allocate driver tags
> > request pool dynamically.
> 
> As I see with your approach, we may still iterate a stale request, but it
> just has not been freed, so just no use-after-free BUG, right? Rather it is
> cached until all references dropped. It may be best solution.

The approach just need to read the stale request pointer, and won't
access any field of that request, so no UAF. Yeah, the stale request
won't be freed until when the reference from request pool is dropped.



Thanks,
Ming
diff mbox series

Patch

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index d1eafe2c045c..9b042c7036b3 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -621,7 +621,7 @@  void blk_mq_sched_free_requests(struct request_queue *q)
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->sched_tags)
-			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+			blk_mq_free_rqs_ext(q->tag_set, hctx->sched_tags, i, hctx->tags);
 	}
 }
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9c92053e704d..562db72e7d79 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -576,7 +576,7 @@  int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 			return -ENOMEM;
 		}
 
-		blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
+		blk_mq_free_rqs_ext(set, *tagsptr, hctx->queue_num, hctx->tags);
 		blk_mq_free_rq_map(*tagsptr, flags);
 		*tagsptr = new;
 	} else {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 55bcee5dc032..f3aad695cd25 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2271,8 +2271,8 @@  blk_qc_t blk_mq_submit_bio(struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
-void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-		     unsigned int hctx_idx)
+void blk_mq_free_rqs_ext(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx, struct blk_mq_tags *references)
 {
 	struct page *page;
 
@@ -2281,10 +2281,13 @@  void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 
 		for (i = 0; i < tags->nr_tags; i++) {
 			struct request *rq = tags->static_rqs[i];
+			int j;
 
 			if (!rq)
 				continue;
 			set->ops->exit_request(set, rq, hctx_idx);
+			for (j = 0; references && j < references->nr_tags; j++)
+				cmpxchg(&references->rqs[j], rq, 0);
 			tags->static_rqs[i] = NULL;
 		}
 	}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index a52703c98b77..53074844e733 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -51,8 +51,10 @@  struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
 /*
  * Internal helpers for allocating/freeing the request map
  */
-void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-		     unsigned int hctx_idx);
+#define blk_mq_free_rqs(set, tags, hctx_idx) \
+	blk_mq_free_rqs_ext(set, tags, hctx_idx, NULL)
+void blk_mq_free_rqs_ext(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx, struct blk_mq_tags *references);
 void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 					unsigned int hctx_idx,