
[PATCHSET] Add support for simplified async direct-io

Message ID f8c29e81-7323-7362-b507-1b7c61e8a19a@fb.com (mailing list archive)
State New, archived

Commit Message

Jens Axboe Nov. 16, 2016, 4:09 p.m. UTC
On 11/14/2016 11:11 AM, Christoph Hellwig wrote:
> On Mon, Nov 14, 2016 at 11:08:46AM -0700, Jens Axboe wrote:
>> It'd be cleaner to loop one level out, and avoid all that 'dio' stuff
>> instead. And then still retain the separate parts of the sync and async.
>> There's nothing to share there imho, and it just makes the code harder
>> to read.
>
> How do you avoid it for the async case?  We can only call ki_complete
> once all bios have finished, which means we need a tracking structure
> for it.  For the synchronous case we could in theory wait for the
> previous bio before sending the next, but there are plenty of RAID
> arrays that would prefer > 1MB I/O.  And we can pretty much reuse the
> async case for this anyway.

Another idea - just limit the max we can support in the simplified
version as a single bio. If the user is requesting more, fall back to
the old slow path. Doesn't really matter, since the transfer size will
be large enough to hopefully overcome any deficiencies in the old path.

That avoids having the atomic inc/dec in the fast path, and having to
loop for both aio/sync O_DIRECT.

Example below.

  	struct bio bio;
@@ -204,13 +262,21 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
  	if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
  		return -EINVAL;

+	if (nr_pages <= DIO_INLINE_BIO_VECS)
+		vecs = inline_vecs;
+	else {
+		vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL);
+		if (!vecs)
+			return -ENOMEM;
+	}
+
  	bio_init(&bio);
  	bio.bi_max_vecs = nr_pages;
-	bio.bi_io_vec = inline_vecs;
+	bio.bi_io_vec = vecs;
  	bio.bi_bdev = bdev;
  	bio.bi_iter.bi_sector = pos >> blkbits;
  	bio.bi_private = current;
-	bio.bi_end_io = blkdev_bio_end_io_simple;
+	bio.bi_end_io = blkdev_bio_end_io_sync;

  	ret = bio_iov_iter_get_pages(&bio, iter);
  	if (unlikely(ret))
@@ -243,6 +309,9 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
  		put_page(bvec->bv_page);
  	}

+	if (vecs != inline_vecs)
+		kfree(vecs);
+
  	if (unlikely(bio.bi_error))
  		return bio.bi_error;
  	iocb->ki_pos += ret;
@@ -252,18 +321,25 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
  static ssize_t
  blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
  {
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = bdev_file_inode(file);
  	int nr_pages;

-	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+	nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
  	if (!nr_pages)
  		return 0;
-	if (is_sync_kiocb(iocb) && nr_pages <= DIO_INLINE_BIO_VECS)
-		return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
-				    blkdev_get_block, NULL, NULL,
-				    DIO_SKIP_DIO_COUNT);
+
+	if (nr_pages > BIO_MAX_PAGES) {
+		struct file *file = iocb->ki_filp;
+		struct inode *inode = bdev_file_inode(file);
+
+		return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
+						blkdev_get_block, NULL, NULL,
+						DIO_SKIP_DIO_COUNT);
+	}
+
+	if (is_sync_kiocb(iocb))
+		return __blkdev_direct_IO_sync(iocb, iter, nr_pages);
+
+	return __blkdev_direct_IO_async(iocb, iter, nr_pages);
  }

  int __sync_blockdev(struct block_device *bdev, int wait)

Comments

Christoph Hellwig Nov. 16, 2016, 4:31 p.m. UTC | #1
On Wed, Nov 16, 2016 at 09:09:17AM -0700, Jens Axboe wrote:
> Another idea - just limit the max we can support in the simplified
> version as a single bio. If the user is requesting more, fall back to
> the old slow path. Doesn't really matter, since the transfer size will
> be large enough to hopefully overcome any deficiencies in the old path.

I really want to kill off the old code :)  Maybe we could use my
iomap-based code instead of another block version if we delegate it
far enough away from the hot path with a patch like yours, but I really
wonder if avoiding the atomic inc/dec is that important once we are
above a minimal threshold.  That being said I think we could avoid
the atomic op for the single-bio case even with code that handles
multiple bios.  Kent actually noted that on my full blown iomap
version.
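
For illustration, a minimal sketch of that single-bio optimization (struct
and field names are assumed here, not taken from either posted tree): the
refcount is only armed once a second bio is actually needed, so the common
single-bio case never touches an atomic.

	/* Sketch only: hypothetical dio tracking structure. */
	struct blkdev_dio {
		struct kiocb	*iocb;
		atomic_t	ref;
		bool		multi_bio;
		struct bio	bio;
	};

	/* Submission loop: no atomic op until a second bio exists. */
	for (;;) {
		ret = bio_iov_iter_get_pages(bio, iter);
		if (unlikely(ret))
			break;			/* error handling elided */

		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
		if (!nr_pages) {
			submit_bio(bio);	/* last (often the only) bio */
			break;
		}

		if (!dio->multi_bio) {
			dio->multi_bio = true;
			atomic_set(&dio->ref, 2);  /* this bio plus the next */
		} else
			atomic_inc(&dio->ref);

		submit_bio(bio);
		bio = bio_alloc(GFP_KERNEL, nr_pages);
	}

	/* Completion side: the single-bio case never touches dio->ref. */
	if (dio->multi_bio && !atomic_dec_and_test(&dio->ref))
		return;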

But if we need another fast, but not super fast, case your patch below
looks reasonable.  Although we really need to implement O_SYNC/O_DSYNC
support for it to avoid losing data.

> +	/*
> +	 * Overload bio size in error. If it gets set, we lose the
> +	 * size, but we don't need the size for that case. IO is limited
> +	 * to BIO_MAX_PAGES, so we can't overflow.
> +	 */
> +	ret = bio->bi_error = bio->bi_iter.bi_size;

Oh, that's a neat trick. 

> +	if (nr_pages <= DIO_INLINE_BIO_VECS)
> +		vecs = inline_vecs;
> +	else {
> +		vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL);

kmalloc_array?
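
A sketch of the kmalloc_array() variant being suggested; same allocation,
but with the multiply-overflow check kmalloc_array() provides:

	if (nr_pages <= DIO_INLINE_BIO_VECS)
		vecs = inline_vecs;
	else {
		vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
				     GFP_KERNEL);
		if (!vecs)
			return -ENOMEM;
	}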
Jens Axboe Nov. 16, 2016, 4:57 p.m. UTC | #2
On 11/16/2016 09:31 AM, Christoph Hellwig wrote:
> On Wed, Nov 16, 2016 at 09:09:17AM -0700, Jens Axboe wrote:
>> Another idea - just limit the max we can support in the simplified
>> version as a single bio. If the user is requesting more, fall back to
>> the old slow path. Doesn't really matter, since the transfer size will
>> be large enough to hopefully overcome any deficiencies in the old path.
>
> I really want to kill off the old code :)  Maybe we could use my
> iomap-based code instead of another block version if we delegate it
> far enough away from the hot path with a patch like yours, but I really
> wonder if avoiding the atomic inc/dec is that important once we are
> above a minimal threshold.  That being said I think we could avoid
> the atomic op for the single-bio case even with code that handles
> multiple bios.  Kent actually noted that on my full blown iomap
> version.

I'd love to kill it off too, but that might take a bit longer. At least
the bdev versions are nice and simple.

> But if we need another fast, but not super fast, case your patch below
> looks reasonable.  Although we really need to implement O_SYNC/O_DSYNC
> support for it to avoid losing data.

I can add the flush support.
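
One possible shape for that, as a sketch: issue a cache flush after a
completed O_SYNC/O_DSYNC write. The helper name here is made up;
blkdev_issue_flush() is the existing block-layer primitive.

	/* Hypothetical helper, not from the posted patches. */
	static ssize_t blkdev_dsync_finish(struct kiocb *iocb,
					   struct block_device *bdev,
					   ssize_t ret)
	{
		/* O_SYNC/O_DSYNC must not return before data is stable */
		if (ret > 0 && (iocb->ki_flags & IOCB_DSYNC)) {
			int err = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);

			if (err)
				ret = err;
		}
		return ret;
	}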

>> +	/*
>> +	 * Overload bio size in error. If it gets set, we lose the
>> +	 * size, but we don't need the size for that case. IO is limited
>> +	 * to BIO_MAX_PAGES, so we can't overflow.
>> +	 */
>> +	ret = bio->bi_error = bio->bi_iter.bi_size;
>
> Oh, that's a neat trick.

It's a bit iffy, but should be safe. And with that, we don't need an
extra allocation outside of the bio.

>> +	if (nr_pages <= DIO_INLINE_BIO_VECS)
>> +		vecs = inline_vecs;
>> +	else {
>> +		vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL);
>
> kmalloc_array?

Good point, I'll make that change.
Christoph Hellwig Nov. 16, 2016, 5:16 p.m. UTC | #3
I've pushed out a new version of my code that avoids atomic ops
for the single bio case:

http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/new-dio

I think we should take your extension of the existing
__blkdev_direct_IO_simple to kmalloc up to BIO_MAX_PAGES and then use
the above implementation for AIO and large synchronous I/O.

I think I can also kill off blkdev_dio_pool by simply using bio_kmalloc
as we're only doing the allocation at the beginning of the call,
but I'd like to agree on an approach before spending too much time on
it.
Jens Axboe Nov. 16, 2016, 8:02 p.m. UTC | #4
On 11/16/2016 10:16 AM, Christoph Hellwig wrote:
> I've pushed out a new version of my code that avoids atomic ops
> for the single bio case:
>
> http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/new-dio
>
> I think we should take your extension of the existing
> __blkdev_direct_IO_simple to kmalloc up to BIO_MAX_PAGES and then use
> the above implementation for AIO and large synchronous I/O.
>
> I think I can also kill off blkdev_dio_pool by simply using bio_kmalloc
> as we're only doing the allocation at the beginning of the call,
> but I'd like to agree on an approach before spending too much time on
> it.
>

I'm fine with this approach, but I would still REALLY like to see
blkdev_bio_end_io() split in two, one for sync and one for async. That
would be a lot cleaner, imho.

Let me know how you want to do it. I added SYNC support for mine here,
the branch is here:

http://git.kernel.dk/cgit/linux-block/log/?h=for-4.10/dio
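
For context, the split being asked for would look roughly like this sketch
(field names assumed, error propagation and ki_pos update elided; the
posted trees keep a single combined handler):

	static void blkdev_bio_end_io_multi_async(struct bio *bio)
	{
		struct blkdev_dio *dio = bio->bi_private;

		if (dio->multi_bio && !atomic_dec_and_test(&dio->ref))
			return;

		dio->iocb->ki_complete(dio->iocb,
				       dio->bio.bi_error ?: dio->size, 0);
		bio_put(&dio->bio);
	}

	static void blkdev_bio_end_io_multi_sync(struct bio *bio)
	{
		struct blkdev_dio *dio = bio->bi_private;

		if (dio->multi_bio && !atomic_dec_and_test(&dio->ref))
			return;

		wake_up_process(dio->waiter);
	}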
Jens Axboe Nov. 17, 2016, 4:43 a.m. UTC | #5
On 11/16/2016 01:02 PM, Jens Axboe wrote:
> On 11/16/2016 10:16 AM, Christoph Hellwig wrote:
>> I've pushed out a new version of my code that avoids atomic ops
>> for the single bio case:
>>
>> http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/new-dio
>>
>> I think we should take your extension of the existing
>> __blkdev_direct_IO_simple to kmalloc up to BIO_MAX_PAGES and then use
>> the above implementation for AIO and large synchronous I/O.
>>
>> I think I can also kill off blkdev_dio_pool by simply using bio_kmalloc
>> as we're only doing the allocation at the beginning of the call,
>> but I'd like to agree on an approach before spending too much time on
>> it.
>>
>
> I'm fine with this approach, but I would still REALLY like to see
> blkdev_bio_end_io() split in two, one for sync and one for async. That
> would be a lot cleaner, imho.

OK, I'm getting reasonably happy with it now:

http://git.kernel.dk/cgit/linux-block/log/?h=for-4.10/hch-dio

Sharing the FUA setting to avoid the manual flush between the two, and
removing the remnants of SYNC support from __blkdev_direct_IO().

I'll fire off some overnight testing now and leave it for the day.
Christoph Hellwig Nov. 17, 2016, 12:46 p.m. UTC | #6
On Wed, Nov 16, 2016 at 01:02:05PM -0700, Jens Axboe wrote:
> I'm fine with this approach, but I would still REALLY like to see
> blkdev_bio_end_io() split in two, once for sync and once for async. That
> would be a lot cleaner, imho.

Do you still want that?  It's not in your latest tree even with all
the other changes.
Christoph Hellwig Nov. 17, 2016, 12:50 p.m. UTC | #7
On Wed, Nov 16, 2016 at 09:43:57PM -0700, Jens Axboe wrote:
> OK, I'm getting reasonably happy with it now:
> 
> http://git.kernel.dk/cgit/linux-block/log/?h=for-4.10/hch-dio
> 
> Sharing the FUA setting to avoid the manual flush between the two, and
> removing the remnants of SYNC support from __blkdev_direct_IO().
> 
> I'll fire off some overnight testing now and leave it for the day.

This looks reasonable, a couple questions and comments:

 - why the 8 * BIO_MAX_PAGES in the call to iov_iter_npages,
   that probably needs a comment
 - I'll probably need to write up a proper changelog and fold
   in at least the multi_bio patch.
 - I'd just return the op from dio_bio_set_write_op instead of
   passing the bio, that reads a lot easier.
 - I should probably get rid of blkdev_dio_pool in favour
   of bio_kmalloc

I can look into doing these fixups this afternoon and prepare a tree
for you.
Jens Axboe Nov. 17, 2016, 1:25 p.m. UTC | #8
On 11/17/2016 05:50 AM, Christoph Hellwig wrote:
> On Wed, Nov 16, 2016 at 09:43:57PM -0700, Jens Axboe wrote:
>> OK, I'm getting reasonably happy with it now:
>>
>> http://git.kernel.dk/cgit/linux-block/log/?h=for-4.10/hch-dio
>>
>> Sharing the FUA setting to avoid the manual flush between the two, and
>> removing the remnants of SYNC support from __blkdev_direct_IO().
>>
>> I'll fire off some overnight testing now and leave it for the day.
>
> This looks reasonable, a couple questions and comments:
>
>  - why the 8 * BIO_MAX_PAGES in the call to iov_iter_npages,
>    that probably needs a comment

Looks like I forgot to push out the latest changes, that is gone. We're
just using + 1 now to see if we should use the multibio function or not.

>  - I'll probably need to write up a proper changelog and fold
>    in at least the multi_bio patch.

That's fine.

>  - I'd just return the op from dio_bio_set_write_op instead of
>    passing the bio, that reads a lot easier.

Fine with me.

>  - I should probably get rid of blkdev_dio_pool in favour
>    of bio_kmalloc

Agree, though not a requirement imho.

> I can look into doing these fixups this afternoon and prepare a tree
> for you.

Sounds good.
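
On the dio_bio_set_write_op point, a sketch of the return-the-op style
(helper name and FUA policy assumed for illustration, folding in the FUA
idea from #5):

	/*
	 * Sketch: the caller would do bio->bi_opf = dio_bio_write_op(iocb)
	 * instead of passing the bio in to have it set.
	 */
	static inline unsigned int dio_bio_write_op(struct kiocb *iocb)
	{
		unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;

		/* FUA makes O_DSYNC writes stable without a separate flush */
		if (iocb->ki_flags & IOCB_DSYNC)
			op |= REQ_FUA;
		return op;
	}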
Jens Axboe Nov. 17, 2016, 1:26 p.m. UTC | #9
On 11/17/2016 05:46 AM, Christoph Hellwig wrote:
> On Wed, Nov 16, 2016 at 01:02:05PM -0700, Jens Axboe wrote:
>> I'm fine with this approach, but I would still REALLY like to see
>> blkdev_bio_end_io() split in two, one for sync and one for async. That
>> would be a lot cleaner, imho.
>
> Do you still want that?  It's not in your latest tree even with all
> the other changes.

I still think it'd clean it up, but not a merge stopper.
Jens Axboe Nov. 17, 2016, 3:03 p.m. UTC | #10
On 11/17/2016 06:25 AM, Jens Axboe wrote:
> On 11/17/2016 05:50 AM, Christoph Hellwig wrote:
>> On Wed, Nov 16, 2016 at 09:43:57PM -0700, Jens Axboe wrote:
>>> OK, I'm getting reasonably happy with it now:
>>>
>>> http://git.kernel.dk/cgit/linux-block/log/?h=for-4.10/hch-dio
>>>
>>> Sharing the FUA setting to avoid the manual flush between the two, and
>>> removing the remnants of SYNC support from __blkdev_direct_IO().
>>>
>>> I'll fire off some overnight testing now and leave it for the day.
>>
>> This looks reasonable, a couple questions and comments:
>>
>>  - why the 8 * BIO_MAX_PAGES in the call to iov_iter_npages,
>>    that probably needs a comment
>
> Looks like I forgot to push out the latest changes, that is gone. We're
> just using + 1 now to see if we should use the multibio function or not.

Just to clarify, the correct tip is:

commit 18eb8e94477813c190a2dfcd790c6707ab168377
Author: Jens Axboe <axboe@fb.com>
Date:   Wed Nov 16 23:20:41 2016 -0700

     block: save 8 bytes of space in struct blkdev_dio

That's what I ran in my overnight testing, and it looks fine from my
end.
Christoph Hellwig Nov. 17, 2016, 3:16 p.m. UTC | #11
On Thu, Nov 17, 2016 at 08:03:46AM -0700, Jens Axboe wrote:
> > Looks like I forgot to push out the latest changes, that is gone. We're
> > just using + 1 now to see if we should use the multibio function or not.
> 
> Just to clarify, the correct tip is:

Re-checking for-4.10/dio now everything seems to be fine.  Not sure
how I managed to pick up an old head.
Christoph Hellwig Nov. 17, 2016, 8:17 p.m. UTC | #12
Ok, the updated tree is here:

http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/new-dio

This moves your O_SYNC patch earlier and makes it not directly
assign bi_opf.

Then comes the main direct I/O path with the various folds, and
a use-after-free fix ported from the iomap code in the completion
handler.

Last comes the I/O completion handler split - I don't think it's
worth it, but if you prefer it that way go ahead and fold it in.

I couldn't do the bio_kmalloc conversion as bio_kmalloc seems
to not like the front pad.
Jens Axboe Nov. 17, 2016, 8:30 p.m. UTC | #13
On 11/17/2016 01:17 PM, Christoph Hellwig wrote:
> Ok, the updated tree is here:
>
> http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/new-dio
>
> This moves your O_SYNC patch earlier and makes it not directly
> assign bi_opf.
>
> Then comes the main direct I/O path with the various folds, and
> a use-after-free fix ported from the iomap code in the completion
> handler.
>
> Last comes the I/O completion handler split - I don't think it's
> worth it, but if you prefer it that way go ahead and fold it in.

The last patch seems to have some issues:

+       bool is_sync = is_sync = is_sync_kiocb(iocb);

and it's reverting part of the dio->is_sync change that was part of my
waiter/iocb change. So I'll leave that out for now, for those reasons.

> I couldn't do the bio_kmalloc conversion as bio_kmalloc seems
> to not like the front pad.

Yeah, !bs/bio_kmalloc() doesn't use the front pad...
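
The front pad in question is what lets the dio state share one allocation
with the bio itself; roughly, as a sketch with assumed fields:

	/*
	 * The bio must be the last member: bioset_create()'s front_pad
	 * reserves offsetof(struct blkdev_dio, bio) bytes in front of
	 * every bio it hands out, all in a single allocation.
	 */
	struct blkdev_dio {
		struct kiocb	*iocb;		/* fields assumed */
		size_t		size;
		struct bio	bio;		/* must come last */
	};

	/* setup, once: */
	blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));

	/* per I/O: */
	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);
	dio = container_of(bio, struct blkdev_dio, bio);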
Christoph Hellwig Nov. 17, 2016, 9:04 p.m. UTC | #14
On Thu, Nov 17, 2016 at 01:30:12PM -0700, Jens Axboe wrote:
> The last patch seems to have some issues:
> 
> +       bool is_sync = is_sync = is_sync_kiocb(iocb);
> 
>> and it's reverting part of the dio->is_sync change that was part of my
> waiter/iocb change. So I'll leave that out for now, for those reasons.

dio->is_sync isn't needed once the completion handlers are split.
That being said I wonder how the above statement even compiled,
there should just be a single assignment obviously.

Did I mention that I don't like the split anyway? :)
Jens Axboe Nov. 17, 2016, 9:06 p.m. UTC | #15
On 11/17/2016 02:04 PM, Christoph Hellwig wrote:
> On Thu, Nov 17, 2016 at 01:30:12PM -0700, Jens Axboe wrote:
>> The last patch seems to have some issues:
>>
>> +       bool is_sync = is_sync = is_sync_kiocb(iocb);
>>
>> and it's reverting part of the dio->is_sync that was part of my union
>> waiter/iocb change. So I'll leave that out for now, for those reasons.
>
> dio->is_sync isn't needed once the completion handlers are split.
> That beeing said I wonder how the above statement even compiled,
> there should just be a single assignment obviously.
>
> Did I mention that I don't like the split anyway? :)

Alright alright, let's drop the split ;-)

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7c3ec60..becc78e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -176,9 +176,68 @@  static struct inode *bdev_file_inode(struct file *file)
  	return file->f_mapping->host;
  }

+static void blkdev_bio_end_io_async(struct bio *bio)
+{
+	struct kiocb *iocb = bio->bi_private;
+
+	iocb->ki_complete(iocb, bio->bi_error, 0);
+
+	if (bio_op(bio) == REQ_OP_READ)
+		bio_check_pages_dirty(bio);
+	else
+		bio_put(bio);
+}
+
+static ssize_t
+__blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter,
+			 int nr_pages)
+{
+	struct file *file = iocb->ki_filp;
+	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+	unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
+	loff_t pos = iocb->ki_pos;
+	struct bio *bio;
+	ssize_t ret;
+
+	if ((pos | iov_iter_alignment(iter)) & ((1 << blkbits) - 1))
+		return -EINVAL;
+
+	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_bdev = bdev;
+	bio->bi_iter.bi_sector = pos >> blkbits;
+	bio->bi_private = iocb;
+	bio->bi_end_io = blkdev_bio_end_io_async;
+
+	ret = bio_iov_iter_get_pages(bio, iter);
+	if (unlikely(ret))
+		return ret;
+
+	/*
+	 * Overload bio size in error. If it gets set, we lose the
+	 * size, but we don't need the size for that case. IO is limited
+	 * to BIO_MAX_PAGES, so we can't overflow.
+	 */
+	ret = bio->bi_error = bio->bi_iter.bi_size;
+
+	if (iov_iter_rw(iter) == READ) {
+		bio_set_op_attrs(bio, REQ_OP_READ, 0);
+		bio_set_pages_dirty(bio);
+	} else {
+		bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+		task_io_account_write(ret);
+	}
+
+	submit_bio(bio);
+	iocb->ki_pos += ret;
+	return -EIOCBQUEUED;
+}
+
  #define DIO_INLINE_BIO_VECS 4

-static void blkdev_bio_end_io_simple(struct bio *bio)
+static void blkdev_bio_end_io_sync(struct bio *bio)
  {
  	struct task_struct *waiter = bio->bi_private;

@@ -187,13 +246,12 @@  static void blkdev_bio_end_io_simple(struct bio *bio)
  }

  static ssize_t
-__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
-		int nr_pages)
+__blkdev_direct_IO_sync(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
  {
  	struct file *file = iocb->ki_filp;
  	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
  	unsigned blkbits = blksize_bits(bdev_logical_block_size(bdev));
-	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *bvec;
+	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
  	loff_t pos = iocb->ki_pos;
  	bool should_dirty = false;