Message ID: 1427802746-30432-1-git-send-email-ming.lei@canonical.com (mailing list archive)
State: New, archived
Ming Lei <ming.lei@canonical.com> writes:

> Each bio is submitted to the block device one by one, so it
> isn't necessary to increase the dio refcount by one for each
> bio while holding dio->bio_lock.

This patch opens up a race where a completion event can come in before
the refcount for the dio is incremented, resulting in the refcount going
negative. I don't think that will actually cause problems, but it
certainly is ugly, and I doubt it was the intended design.

Before I dig into this any further, would you care to comment on why you
went down this path? Did you see spinlock contention here? And was
there a resultant performance improvement for some benchmark with the
patch applied?

Cheers,
Jeff
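To make the window Jeff describes concrete, here is a minimal sequential model of the accounting (my sketch in userspace C, not kernel code; the variable names mirror the patch, everything else is illustrative):

#include <stdio.h>

int main(void)
{
	long refcount = 0;	/* dio->refcount: no longer pre-set to 1 */
	long submitted_bio = 0;	/* sdio->submitted_bio */

	/* dio_bio_submit() runs three times without touching refcount */
	for (int i = 0; i < 3; i++)
		submitted_bio++;

	/* a completion fires before the commit: the count dips negative */
	refcount--;
	printf("refcount after early completion: %ld\n", refcount); /* -1 */

	/* dio_cleanup(..., true) then commits the submitted bios plus
	 * the submitter's own reference in one locked update */
	refcount += submitted_bio + 1;

	/* the remaining completions and the submitter's drop drain it */
	refcount -= 2;	/* two outstanding bio completions */
	refcount -= 1;	/* the submitter's final drop */
	printf("final refcount: %ld\n", refcount); /* 0: dio torn down */
	return 0;
}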
Hi,

On Tue, Mar 31, 2015 at 10:02 PM, Jeff Moyer <jmoyer@redhat.com> wrote:
> Ming Lei <ming.lei@canonical.com> writes:
>
>> Each bio is submitted to the block device one by one, so it
>> isn't necessary to increase the dio refcount by one for each
>> bio while holding dio->bio_lock.
>
> This patch opens up a race where a completion event can come in before
> the refcount for the dio is incremented, resulting in the refcount going
> negative. I don't think that will actually cause problems, but it
> certainly is ugly, and I doubt it was the intended design.

Could you explain why you think it is a race and a bug? When
dio->refcount is negative, dio_bio_end_*() only completes the
current BIO, which is just what the function should do, isn't it?

> Before I dig into this any further, would you care to comment on why you
> went down this path? Did you see spinlock contention here? And was
> there a resultant performance improvement for some benchmark with the
> patch applied?

It is just a minor optimization in theory, especially in the case
of many BIOs in one dio.

Thanks,
Ming Lei
Ming Lei <ming.lei@canonical.com> writes:

> Hi,
>
> On Tue, Mar 31, 2015 at 10:02 PM, Jeff Moyer <jmoyer@redhat.com> wrote:
>> Ming Lei <ming.lei@canonical.com> writes:
>>
>>> Each bio is submitted to the block device one by one, so it
>>> isn't necessary to increase the dio refcount by one for each
>>> bio while holding dio->bio_lock.
>>
>> This patch opens up a race where a completion event can come in before
>> the refcount for the dio is incremented, resulting in the refcount going
>> negative. I don't think that will actually cause problems, but it
>> certainly is ugly, and I doubt it was the intended design.
>
> Could you explain why you think it is a race and a bug? When
> dio->refcount is negative, dio_bio_end_*() only completes the
> current BIO, which is just what the function should do, isn't it?

I didn't say it was a bug. :) Refcounts going negative isn't something
that seems clean, though. If you're going to push this patch through,
at least add a comment saying that this can happen by design, and is
safe.

>> Before I dig into this any further, would you care to comment on why you
>> went down this path? Did you see spinlock contention here? And was
>> there a resultant performance improvement for some benchmark with the
>> patch applied?
>
> It is just a minor optimization in theory, especially in the case
> of many BIOs in one dio.

It seems plausible that it would be a win. It sure would be nice to
have some numbers, though.

Cheers,
Jeff
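The comment Jeff asks for is easy to picture. The following userspace model (my sketch only — POSIX threads stand in for bio completions and a mutex for dio->bio_lock; it is not the kernel implementation) carries such a comment on the decrement, and shows that whatever the interleaving, the count still drains to exactly zero:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long refcount;		/* models dio->refcount, starts at 0 */

static void *completion(void *arg)
{
	long remaining;

	pthread_mutex_lock(&lock);	/* models dio->bio_lock */
	/*
	 * The count may dip below zero here when a completion runs
	 * before the submitter's single commit. This is by design
	 * and safe: teardown keys off the committed total reaching
	 * zero, not off any intermediate value.
	 */
	remaining = --refcount;
	pthread_mutex_unlock(&lock);
	printf("completion saw remaining=%ld\n", remaining);
	return NULL;
}

int main(void)
{
	pthread_t t[3];
	long submitted_bio = 0;

	/* "submit" three bios lock-free, as the patched path does */
	for (int i = 0; i < 3; i++) {
		submitted_bio++;
		pthread_create(&t[i], NULL, completion, NULL);
	}

	/* the one locked update, as in the patched dio_cleanup() */
	pthread_mutex_lock(&lock);
	refcount += submitted_bio + 1;
	pthread_mutex_unlock(&lock);

	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);

	pthread_mutex_lock(&lock);
	refcount--;			/* the submitter's own reference */
	printf("final refcount=%ld\n", refcount);	/* always 0 */
	pthread_mutex_unlock(&lock);
	return 0;
}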
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 6fb00e3..57b8e73 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -79,6 +79,8 @@ struct dio_submit {
 	get_block_t *get_block;		/* block mapping function */
 	dio_submit_t *submit_io;	/* IO submition function */
 
+	long submitted_bio;
+
 	loff_t logical_offset_in_bio;	/* current first logical block in bio */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
 	sector_t next_block_for_io;	/* next block to be put under IO,
@@ -121,7 +123,7 @@ struct dio {
 	int is_async;			/* is IO async ? */
 	bool defer_completion;		/* defer AIO completion to workqueue? */
 	int io_error;			/* IO error in completion path */
-	unsigned long refcount;		/* direct_io_worker() and bios */
+	long refcount;			/* direct_io_worker() and bios */
 	struct bio *bio_list;		/* singly linked via bi_private */
 	struct task_struct *waiter;	/* waiting task (NULL if none) */
 
@@ -383,14 +385,9 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 {
 	struct bio *bio = sdio->bio;
-	unsigned long flags;
 
 	bio->bi_private = dio;
 
-	spin_lock_irqsave(&dio->bio_lock, flags);
-	dio->refcount++;
-	spin_unlock_irqrestore(&dio->bio_lock, flags);
-
 	if (dio->is_async && dio->rw == READ)
 		bio_set_pages_dirty(bio);
 
@@ -403,15 +400,26 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	sdio->bio = NULL;
 	sdio->boundary = 0;
 	sdio->logical_offset_in_bio = 0;
+	sdio->submitted_bio++;
 }
 
 /*
  * Release any resources in case of a failure
  */
-static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
+static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio,
+		bool commit_refcount)
 {
+	unsigned long flags;
+
 	while (sdio->head < sdio->tail)
 		page_cache_release(dio->pages[sdio->head++]);
+
+	if (!commit_refcount)
+		return;
+
+	spin_lock_irqsave(&dio->bio_lock, flags);
+	dio->refcount += (sdio->submitted_bio + 1);
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
 }
 
 /*
@@ -1215,7 +1223,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->i_size = i_size_read(inode);
 
 	spin_lock_init(&dio->bio_lock);
-	dio->refcount = 1;
 
 	sdio.iter = iter;
 	sdio.final_block_in_request =
@@ -1234,7 +1241,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
 	retval = do_direct_IO(dio, &sdio, &map_bh);
 	if (retval)
-		dio_cleanup(dio, &sdio);
+		dio_cleanup(dio, &sdio, false);
 
 	if (retval == -ENOTBLK) {
 		/*
@@ -1267,7 +1274,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
 	 */
-	dio_cleanup(dio, &sdio);
+	dio_cleanup(dio, &sdio, true);
 
 	/*
 	 * All block lookups have been performed. For READ requests
Each bio is submitted to the block device one by one, so it
isn't necessary to increase the dio refcount by one for each
bio while holding dio->bio_lock.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
---
 fs/direct-io.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)
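Reduced to arithmetic, the correctness of the batched commit is an accounting identity: the single locked update produces the same total that the old per-bio increments did, so the final zero test in the drop path is unaffected. A trivial check (illustrative only; the drop_refcount() name follows my reading of fs/direct-io.c):

#include <assert.h>

int main(void)
{
	long nr_bios = 4;	/* any number of bios in one dio */

	/* old scheme: refcount starts at 1, ++ under the lock per bio */
	long old_total = 1 + nr_bios;

	/* new scheme: starts at 0, one commit of submitted_bio + 1 */
	long new_total = 0 + (nr_bios + 1);

	/* same total, so drop_refcount()'s zero test sees no change */
	assert(old_total == new_total);
	return 0;
}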