diff mbox

[RFC] target/file: add support of direct and async I/O

Message ID 20180321184702.GA11249@infradead.org (mailing list archive)
State New, archived
Headers show

Commit Message

Christoph Hellwig March 21, 2018, 6:47 p.m. UTC
On Wed, Mar 21, 2018 at 11:16:09AM -0700, Andrei Vagin wrote:
> If we look at lo_rw_aio, we can find that bvec can be allocated or got
> from bio. I think the problem with the second case.
> 
> static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>                      loff_t pos, bool rw)
> {
> ...
>         if (rq->bio != rq->biotail) {
> 		...
>                 bvec = kmalloc(sizeof(struct bio_vec) * segments,
> GFP_NOIO);
> 		...
> 	} else {
> 		...
>                 bvec = __bvec_iter_bvec(bio->bi_io_vec,
> bio->bi_iter);

That is the local 'bvec' variable, not cmd->bvec which is only
assigned to the kmalloc return value.

> BTW: I tried your patch and I still get the same kasan warning.

Strange, as the trace can't really happen with the patch applied.
In fact the ->bvec field isn't even needed, e.g. we should do something
like this on top of the previous patch:

--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Andrey Vagin March 21, 2018, 8:25 p.m. UTC | #1
On Wed, Mar 21, 2018 at 11:47:02AM -0700, Christoph Hellwig wrote:
> On Wed, Mar 21, 2018 at 11:16:09AM -0700, Andrei Vagin wrote:
> > If we look at lo_rw_aio, we can find that bvec can be allocated or got
> > from bio. I think the problem with the second case.
> > 
> > static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
> >                      loff_t pos, bool rw)
> > {
> > ...
> >         if (rq->bio != rq->biotail) {
> > 		...
> >                 bvec = kmalloc(sizeof(struct bio_vec) * segments,
> > GFP_NOIO);
> > 		...
> > 	} else {
> > 		...
> >                 bvec = __bvec_iter_bvec(bio->bi_io_vec,
> > bio->bi_iter);
> 
> That is the local 'bvec' variable, not cmd->bvec which is only
> assigned to the kmalloc return value.
> 
> > BTW: I tried your patch and I still get the same kasan warning.
> 
> Strange, as the trace can't really happen with the patch applied.

In the trace, we see that bvec is accessed from iov_iter_revert(), but it
has been already released. This means than bvec was not allocated in
lo_rw_aio(). In the third trace, we can see that this bvec was released
in lo_rw_aio_complete()->bio_endio(). This means that we used bvec from
bio.

 [  101.436741] Freed by task 693:
[  101.437187]  __kasan_slab_free+0x136/0x180
[  101.437790]  kmem_cache_free+0xc6/0x310
[  101.438360]  bio_put+0xdb/0xf0
[  101.438844]  bio_endio+0x23b/0x400
[  101.439385]  blk_update_request+0x12a/0x660
[  101.440139]  blk_mq_end_request+0x26/0x80
[  101.440724]  __blk_mq_complete_request+0x30a/0x400
[  101.441406]  blk_mq_complete_request+0x110/0x160
[  101.442071]  lo_rw_aio_complete+0xbe/0x240
[  101.442664]  fuse_aio_complete+0x177/0x220


Bellow you can find traces from the kernel with two last patches.

[  101.335363] ==================================================================
[  101.339372] BUG: KASAN: use-after-free in iov_iter_revert+0xa7/0x440
[  101.340612] Read of size 4 at addr ffff8800b95003d0 by task loop0/697

[  101.342050] CPU: 0 PID: 697 Comm: loop0 Not tainted 4.16.0-rc6-00035-g5b7b2f4a8aba #510
[  101.343481] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1.fc26 04/01/2014
[  101.344762] Call Trace:
[  101.345140]  dump_stack+0xda/0x16f
[  101.345669]  ? _atomic_dec_and_lock+0x101/0x101
[  101.346394]  ? del_timer_sync+0xac/0xd0
[  101.347157]  print_address_description+0x6a/0x270
[  101.348014]  kasan_report+0x258/0x380
[  101.348671]  ? iov_iter_revert+0xa7/0x440
[  101.349511]  iov_iter_revert+0xa7/0x440
[  101.350362]  generic_file_read_iter+0xfd0/0x12f0
[  101.351224]  ? match_held_lock+0x7f/0x340
[  101.351958]  ? __save_stack_trace+0x82/0x100
[  101.352883]  ? filemap_range_has_page+0x1b0/0x1b0
[  101.353788]  ? ret_from_fork+0x3a/0x50
[  101.354716]  ? fuse_simple_request+0x1a7/0x280
[  101.355966]  ? save_stack+0x89/0xb0
[  101.356918]  ? find_held_lock+0x6d/0xd0
[  101.357735]  ? fuse_change_attributes+0x243/0x350
[  101.358676]  ? lock_downgrade+0x320/0x320
[  101.360030]  ? map_id_range_down+0x19a/0x1c0
[  101.361880]  ? do_raw_spin_unlock+0x147/0x220
[  101.363089]  ? do_raw_spin_trylock+0x100/0x100
[  101.363960]  ? do_raw_write_trylock+0x100/0x100
[  101.364816]  ? _raw_spin_unlock+0x24/0x30
[  101.365644]  ? fuse_change_attributes+0x32e/0x350
[  101.366567]  ? fuse_change_attributes_common+0x300/0x300
[  101.367539]  ? fuse_simple_request+0x1a7/0x280
[  101.368339]  ? fuse_simple_request+0x1a7/0x280
[  101.369173]  ? fuse_do_getattr+0x27b/0x6e0
[  101.369950]  ? fuse_dentry_revalidate+0x640/0x640
[  101.370822]  ? find_held_lock+0x6d/0xd0
[  101.371638]  ? lock_release+0x4f0/0x4f0
[  101.372351]  ? match_held_lock+0x7f/0x340
[  101.373130]  lo_rw_aio+0x942/0xd10
[  101.373841]  ? lo_rw_aio_complete+0x240/0x240
[  101.374665]  ? match_held_lock+0x7f/0x340
[  101.375413]  ? clear_buddies+0x42/0x210
[  101.376330]  ? debug_check_no_locks_freed+0x1b0/0x1b0
[  101.377266]  ? finish_task_switch+0x181/0x500
[  101.378099]  ? do_raw_spin_unlock+0x147/0x220
[  101.378872]  loop_queue_work+0x10aa/0x1900
[  101.379635]  ? _raw_spin_unlock_irq+0x29/0x40
[  101.380268]  ? _raw_spin_unlock_irq+0x29/0x40
[  101.380902]  ? finish_task_switch+0x181/0x500
[  101.381534]  ? finish_task_switch+0x10d/0x500
[  101.382165]  ? lo_compat_ioctl+0xe0/0xe0
[  101.382754]  ? set_load_weight+0x110/0x110
[  101.383503]  ? __switch_to_asm+0x34/0x70
[  101.384211]  ? __switch_to_asm+0x34/0x70
[  101.384920]  ? __switch_to_asm+0x40/0x70
[  101.385628]  ? __switch_to_asm+0x34/0x70
[  101.386713]  ? __switch_to_asm+0x40/0x70
[  101.387564]  ? __switch_to_asm+0x34/0x70
[  101.389006]  ? __switch_to_asm+0x40/0x70
[  101.389927]  ? __switch_to_asm+0x34/0x70
[  101.390752]  ? __switch_to_asm+0x40/0x70
[  101.391522]  ? __switch_to_asm+0x34/0x70
[  101.392687]  ? __switch_to_asm+0x40/0x70
[  101.394136]  ? __switch_to_asm+0x34/0x70
[  101.394955]  ? __switch_to_asm+0x40/0x70
[  101.395532]  ? __switch_to_asm+0x34/0x70
[  101.396092]  ? __switch_to_asm+0x40/0x70
[  101.396673]  ? __schedule+0x50e/0x11b0
[  101.397241]  ? mark_held_locks+0x1c/0x90
[  101.397896]  ? _raw_spin_unlock_irq+0x29/0x40
[  101.398839]  ? match_held_lock+0x7f/0x340
[  101.399564]  ? save_trace+0x1e0/0x1e0
[  101.400233]  ? finish_task_switch+0x181/0x500
[  101.401026]  ? finish_task_switch+0x10d/0x500
[  101.401818]  ? __switch_to_asm+0x34/0x70
[  101.402536]  ? __switch_to_asm+0x34/0x70
[  101.403255]  ? set_load_weight+0x110/0x110
[  101.403996]  ? __switch_to_asm+0x34/0x70
[  101.404730]  ? __switch_to_asm+0x34/0x70
[  101.405424]  ? __switch_to_asm+0x40/0x70
[  101.406150]  ? find_held_lock+0x6d/0xd0
[  101.406869]  ? kthread_worker_fn+0x229/0x520
[  101.407635]  ? lock_downgrade+0x320/0x320
[  101.408356]  ? do_raw_spin_unlock+0x147/0x220
[  101.409316]  ? do_raw_spin_trylock+0x100/0x100
[  101.409969]  ? rcu_note_context_switch+0x4e0/0x4e0
[  101.410729]  ? match_held_lock+0x7f/0x340
[  101.411440]  ? mark_held_locks+0x1c/0x90
[  101.412150]  ? _raw_spin_unlock_irq+0x29/0x40
[  101.412959]  kthread_worker_fn+0x272/0x520
[  101.413648]  ? kthread+0x1e0/0x1e0
[  101.414482]  ? find_held_lock+0x6d/0xd0
[  101.415400]  ? lock_downgrade+0x320/0x320
[  101.416319]  ? __schedule+0x11b0/0x11b0
[  101.417130]  ? do_raw_spin_unlock+0x147/0x220
[  101.417847]  ? do_raw_spin_trylock+0x100/0x100
[  101.418560]  ? __lockdep_init_map+0x98/0x2a0
[  101.419446]  ? __init_waitqueue_head+0xbe/0xf0
[  101.420333]  ? mark_held_locks+0x1c/0x90
[  101.421444]  ? _raw_spin_unlock_irqrestore+0x32/0x60
[  101.422726]  ? loop_get_status64+0x100/0x100
[  101.423629]  kthread+0x1b7/0x1e0
[  101.424212]  ? kthread_create_worker_on_cpu+0xc0/0xc0
[  101.425137]  ret_from_fork+0x3a/0x50

[  101.426481] Allocated by task 699:
[  101.427260]  kasan_kmalloc+0xa0/0xd0
[  101.427986]  kmem_cache_alloc+0xca/0x2d0
[  101.428659]  mempool_alloc+0x117/0x2f0
[  101.429330]  bio_alloc_bioset+0x34d/0x4a0
[  101.430147]  mpage_alloc.isra.8+0x45/0x110
[  101.430652]  do_mpage_readpage+0xa70/0xc60
[  101.431158]  mpage_readpages+0x2d1/0x400
[  101.431688]  __do_page_cache_readahead+0x528/0x740
[  101.432381]  force_page_cache_readahead+0x10b/0x170
[  101.433091]  generic_file_read_iter+0xeb2/0x12f0
[  101.433763]  __vfs_read+0x2a9/0x3c0
[  101.434288]  vfs_read+0xb7/0x1b0
[  101.434759]  SyS_read+0xb6/0x140
[  101.435231]  do_syscall_64+0x193/0x470
[  101.435786]  entry_SYSCALL_64_after_hwframe+0x42/0xb7

[  101.436741] Freed by task 693:
[  101.437187]  __kasan_slab_free+0x136/0x180
[  101.437790]  kmem_cache_free+0xc6/0x310
[  101.438360]  bio_put+0xdb/0xf0
[  101.438844]  bio_endio+0x23b/0x400
[  101.439385]  blk_update_request+0x12a/0x660
[  101.440139]  blk_mq_end_request+0x26/0x80
[  101.440724]  __blk_mq_complete_request+0x30a/0x400
[  101.441406]  blk_mq_complete_request+0x110/0x160
[  101.442071]  lo_rw_aio_complete+0xbe/0x240
[  101.442664]  fuse_aio_complete+0x177/0x220
[  101.443241]  request_end+0x237/0x470
[  101.443814]  fuse_dev_do_write+0x1325/0x1910
[  101.444477]  fuse_dev_write+0x11d/0x180
[  101.445037]  do_iter_readv_writev+0x2a5/0x370
[  101.445663]  do_iter_write+0xd1/0x280
[  101.446187]  vfs_writev+0x145/0x230
[  101.446690]  do_writev+0xc0/0x1b0
[  101.447170]  do_syscall_64+0x193/0x470
[  101.447711]  entry_SYSCALL_64_after_hwframe+0x42/0xb7

[  101.448672] The buggy address belongs to the object at ffff8800b9500340
                which belongs to the cache bio-0 of size 200
[  101.450534] The buggy address is located 144 bytes inside of
                200-byte region [ffff8800b9500340, ffff8800b9500408)
[  101.452229] The buggy address belongs to the page:
[  101.452939] page:ffffea0002e54000 count:1 mapcount:0 mapping:0000000000000000 index:0xffff8800b95004c0 compound_mapcount: 0
[  101.454468] flags: 0x1fffc000008100(slab|head)
[  101.455149] raw: 001fffc000008100 0000000000000000 ffff8800b95004c0 0000000100150006
[  101.456525] raw: ffffea00045105a0 ffff8801198e2580 ffff880119a58700 0000000000000000
[  101.457560] page dumped because: kasan: bad access detected

[  101.458526] Memory state around the buggy address:
[  101.459174]  ffff8800b9500280: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  101.460319]  ffff8800b9500300: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
[  101.461412] >ffff8800b9500380: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  101.462647]                                                  ^
[  101.463898]  ffff8800b9500400: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  101.465217]  ffff8800b9500480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  101.466250] ==================================================================
[  101.467443] Disabling lock debugging due to kernel taint


> In fact the ->bvec field isn't even needed, e.g. we should do something
> like this on top of the previous patch:

I said this in a previous mail ;)

> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 9cb6078cd4f0..0401fade3d60 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -477,7 +477,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  		     loff_t pos, bool rw)
>  {
>  	struct iov_iter iter;
> -	struct bio_vec *bvec;
> +	struct bio_vec *bvec, *alloc_bvec = NULL;
>  	struct request *rq = cmd->rq;
>  	struct bio *bio = rq->bio;
>  	struct file *file = lo->lo_backing_file;
> @@ -491,10 +491,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  
>  		__rq_for_each_bio(bio, rq)
>  			segments += bio_segments(bio);
> -		bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO);
> -		if (!bvec)
> +		alloc_bvec = kmalloc_array(segments,
> +				sizeof(struct bio_vec), GFP_NOIO);
> +		if (!alloc_bvec)
>  			return -EIO;
> -		cmd->bvec = bvec;
>  
>  		/*
>  		 * The bios of the request may be started from the middle of
> @@ -502,11 +502,12 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
>  		 * API will take care of all details for us.
>  		 */
> +		bvec = alloc_bvec;
>  		rq_for_each_segment(tmp, rq, iter) {
>  			*bvec = tmp;
>  			bvec++;
>  		}
> -		bvec = cmd->bvec;
> +		bvec = alloc_bvec;
>  		offset = 0;
>  	} else {
>  		/*
> @@ -535,12 +536,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  	else
>  		ret = call_read_iter(file, &cmd->iocb, &iter);
>  
> -	/*
> -	 * Clear bvec to NULL as lo_rw_aio only allocates and sets it for
> -	 * the multiple bios per request case and we want a clean slate here.
> -	 */
> -	kfree(cmd->bvec);
> -	cmd->bvec = NULL;
> +	kfree(alloc_bvec);
>  	kthread_associate_blkcg(NULL);
>  
>  	if (ret != -EIOCBQUEUED)
> diff --git a/drivers/block/loop.h b/drivers/block/loop.h
> index ba65848c7c33..b8eef1977f64 100644
> --- a/drivers/block/loop.h
> +++ b/drivers/block/loop.h
> @@ -70,7 +70,6 @@ struct loop_cmd {
>  	bool use_aio; /* use AIO interface to handle I/O */
>  	long ret;
>  	struct kiocb iocb;
> -	struct bio_vec *bvec;
>  	struct cgroup_subsys_state *css;
>  };
>  
--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9cb6078cd4f0..0401fade3d60 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -477,7 +477,7 @@  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		     loff_t pos, bool rw)
 {
 	struct iov_iter iter;
-	struct bio_vec *bvec;
+	struct bio_vec *bvec, *alloc_bvec = NULL;
 	struct request *rq = cmd->rq;
 	struct bio *bio = rq->bio;
 	struct file *file = lo->lo_backing_file;
@@ -491,10 +491,10 @@  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 
 		__rq_for_each_bio(bio, rq)
 			segments += bio_segments(bio);
-		bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO);
-		if (!bvec)
+		alloc_bvec = kmalloc_array(segments,
+				sizeof(struct bio_vec), GFP_NOIO);
+		if (!alloc_bvec)
 			return -EIO;
-		cmd->bvec = bvec;
 
 		/*
 		 * The bios of the request may be started from the middle of
@@ -502,11 +502,12 @@  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
 		 * API will take care of all details for us.
 		 */
+		bvec = alloc_bvec;
 		rq_for_each_segment(tmp, rq, iter) {
 			*bvec = tmp;
 			bvec++;
 		}
-		bvec = cmd->bvec;
+		bvec = alloc_bvec;
 		offset = 0;
 	} else {
 		/*
@@ -535,12 +536,7 @@  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	else
 		ret = call_read_iter(file, &cmd->iocb, &iter);
 
-	/*
-	 * Clear bvec to NULL as lo_rw_aio only allocates and sets it for
-	 * the multiple bios per request case and we want a clean slate here.
-	 */
-	kfree(cmd->bvec);
-	cmd->bvec = NULL;
+	kfree(alloc_bvec);
 	kthread_associate_blkcg(NULL);
 
 	if (ret != -EIOCBQUEUED)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index ba65848c7c33..b8eef1977f64 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -70,7 +70,6 @@  struct loop_cmd {
 	bool use_aio; /* use AIO interface to handle I/O */
 	long ret;
 	struct kiocb iocb;
-	struct bio_vec *bvec;
 	struct cgroup_subsys_state *css;
 };