@@ -685,6 +685,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_data);
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_WRITE_FUA (1 << 28)
#define IOMAP_DIO_WRITE_SYNC (1 << 29)
#define IOMAP_DIO_WRITE (1 << 30)
#define IOMAP_DIO_DIRTY (1 << 31)
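The new flag joins the existing private flags in the top bits of dio->flags, which is what keeps them clear of the public IOMAP_DIO_* completion flags from iomap.h. A minimal standalone check of that invariant; the public flag values below are assumptions based on iomap.h of this era, and the U suffixes are only there to keep the userspace model warning-free (the kernel source uses plain int shifts):

/* Standalone model; the public flag values are assumptions, not quoted
 * from this patch. */
#include <stdio.h>

#define IOMAP_DIO_UNWRITTEN	(1u << 0)	/* public (assumed value) */
#define IOMAP_DIO_COW		(1u << 1)	/* public (assumed value) */

#define IOMAP_DIO_WRITE_FUA	(1u << 28)	/* private, added above */
#define IOMAP_DIO_WRITE_SYNC	(1u << 29)
#define IOMAP_DIO_WRITE		(1u << 30)
#define IOMAP_DIO_DIRTY		(1u << 31)

_Static_assert(((IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW) &
		(IOMAP_DIO_WRITE_FUA | IOMAP_DIO_WRITE_SYNC |
		 IOMAP_DIO_WRITE | IOMAP_DIO_DIRTY)) == 0,
	       "private dio flags must not overlap the public ones");

int main(void)
{
	printf("private and public dio flag bits do not overlap\n");
	return 0;
}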
@@ -863,6 +864,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
struct iov_iter iter;
struct bio *bio;
bool need_zeroout = false;
+ bool use_fua = false;
int nr_pages, ret;
size_t copied = 0;
@@ -888,6 +890,18 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
dio->flags |= IOMAP_DIO_COW;
- if (iomap->flags & IOMAP_F_NEW)
+ if (iomap->flags & IOMAP_F_NEW) {
need_zeroout = true;
+ } else {
+ /*
+ * Use a FUA write if we need datasync semantics, this
+ * is a pure data IO that doesn't require any metadata
+ * updates, and the underlying device supports FUA. This
+ * allows us to avoid cache flushes on IO completion.
+ */
+ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+ (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+ blk_queue_fua(bdev_get_queue(iomap->bdev)))
+ use_fua = true;
+ }
break;
default:
WARN_ON_ONCE(1);
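The use_fua decision above is three independent checks AND-ed together. A small userspace model of that predicate; the helper name and its boolean parameters are invented for illustration, only the three conditions come from the patch:

/* Standalone model of the use_fua decision. */
#include <stdbool.h>
#include <stdio.h>

static bool can_use_fua(bool shared_or_dirty, bool fua_still_allowed,
			bool device_has_fua)
{
	/*
	 * 1. No IOMAP_F_SHARED/IOMAP_F_DIRTY: nothing but data needs to be
	 *    made stable for this extent.
	 * 2. IOMAP_DIO_WRITE_FUA still set: O_DSYNC was requested and no
	 *    earlier sub-IO has already fallen back to a plain write.
	 * 3. blk_queue_fua(): the underlying request queue advertises FUA.
	 */
	return !shared_or_dirty && fua_still_allowed && device_has_fua;
}

int main(void)
{
	/* Dirty inode metadata forces the flush path even on FUA hardware. */
	printf("dirty metadata:  %d\n", can_use_fua(true, true, true));
	/* The pure data O_DSYNC overwrite on FUA-capable storage. */
	printf("pure data write: %d\n", can_use_fua(false, true, true));
	return 0;
}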
@@ -935,7 +949,13 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
n = bio->bi_iter.bi_size;
if (dio->flags & IOMAP_DIO_WRITE) {
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+ int op_flags = REQ_SYNC | REQ_IDLE;
+
+ if (use_fua)
+ op_flags |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags);
task_io_account_write(n);
} else {
bio_set_op_attrs(bio, REQ_OP_READ, 0);
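On the write side the only per-bio change is which op flags get OR-ed together; the else branch that strips IOMAP_DIO_WRITE_FUA is what records, for the whole dio, that at least one bio went out without FUA. A standalone model of the flag composition, with stand-in values for the REQ_* bits (a fuller model of the multi-bio case follows the next hunk):

/* Userspace model of the op flag composition above; the REQ_* values are
 * stand-ins, not the block layer's definitions. */
#include <stdio.h>

#define REQ_SYNC	(1u << 0)
#define REQ_IDLE	(1u << 1)
#define REQ_FUA		(1u << 2)

static unsigned int dio_write_op_flags(int use_fua)
{
	unsigned int op_flags = REQ_SYNC | REQ_IDLE;

	/* REQ_FUA is simply OR-ed in; nothing else about the write changes. */
	if (use_fua)
		op_flags |= REQ_FUA;
	return op_flags;
}

int main(void)
{
	printf("plain O_DSYNC bio: %#x\n", dio_write_op_flags(0));
	printf("FUA O_DSYNC bio:   %#x\n", dio_write_op_flags(1));
	return 0;
}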
@@ -968,7 +988,12 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
/*
* iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
- * is being issued as AIO or not.
+ * is being issued as AIO or not. This allows us to optimise pure data writes
+ * to use REQ_FUA rather than requiring generic_write_sync() to issue a
+ * REQ_FLUSH post write. This is slightly tricky because a single request here
+ * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
*/
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
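The comment above is the core of the optimisation: one O_DSYNC request may be split into several sub-IOs, and a single sub-IO that is not a pure data overwrite must force the flush at completion. A userspace model of that protocol with an invented three-extent layout; only the two flag names mirror the patch:

/* Userspace model of the optimistic FUA protocol described above. */
#include <stdbool.h>
#include <stdio.h>

#define DIO_WRITE_FUA	(1u << 28)	/* models IOMAP_DIO_WRITE_FUA */
#define DIO_WRITE_SYNC	(1u << 29)	/* models IOMAP_DIO_WRITE_SYNC */

struct sub_io {
	const char *desc;
	bool pure_data;		/* no COW, no dirty metadata, no allocation */
};

int main(void)
{
	/* One O_DSYNC request mapped into three disjoint sub-IOs. */
	struct sub_io ios[] = {
		{ "overwrite of allocated blocks", true },
		{ "write into freshly allocated blocks", false },
		{ "overwrite of allocated blocks", true },
	};
	unsigned int flags = DIO_WRITE_SYNC | DIO_WRITE_FUA;	/* IOCB_DSYNC case */
	bool device_has_fua = true;

	for (unsigned int i = 0; i < sizeof(ios) / sizeof(ios[0]); i++) {
		bool use_fua = ios[i].pure_data && (flags & DIO_WRITE_FUA) &&
			       device_has_fua;

		printf("sub-IO %u (%s): %s\n", i, ios[i].desc,
		       use_fua ? "REQ_FUA write" : "plain write");
		if (!use_fua)
			flags &= ~DIO_WRITE_FUA;	/* demote the whole request */
	}

	/* Completion: only an all-FUA request may skip the cache flush. */
	if (flags & DIO_WRITE_FUA)
		flags &= ~DIO_WRITE_SYNC;
	printf("cache flush at completion: %s\n",
	       (flags & DIO_WRITE_SYNC) ? "required" : "skipped");
	return 0;
}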
@@ -1015,8 +1040,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->flags |= IOMAP_DIO_DIRTY;
} else {
dio->flags |= IOMAP_DIO_WRITE;
- if (iocb->ki_flags & IOCB_DSYNC)
+ if (iocb->ki_flags & IOCB_DSYNC) {
dio->flags |= IOMAP_DIO_WRITE_SYNC;
+ /*
+ * Any non-FUA write will clear this flag, hence we know
+ * before completion whether a cache flush is necessary.
+ */
+ dio->flags |= IOMAP_DIO_WRITE_FUA;
+ }
flags |= IOMAP_WRITE;
}
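For reference, the IO pattern this path targets is an O_DIRECT | O_DSYNC overwrite of already-allocated blocks. A userspace example of such a write; the file name, offset, size and 4096-byte alignment are arbitrary example values, and the file must already exist on a filesystem that supports O_DIRECT:

/* Userspace illustration of the IO pattern this path targets. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <err.h>

int main(void)
{
	void *buf;
	int fd;

	fd = open("datafile", O_WRONLY | O_DIRECT | O_DSYNC);
	if (fd < 0)
		err(1, "open");

	/* O_DIRECT needs aligned memory and an aligned, block-multiple size. */
	if (posix_memalign(&buf, 4096, 4096))
		err(1, "posix_memalign");
	memset(buf, 0xab, 4096);

	/*
	 * O_DSYNC means this write must be stable before pwrite() returns.
	 * If the blocks are already allocated and no metadata is dirty,
	 * iomap can issue it as a single REQ_FUA write instead of a write
	 * followed by a cache flush.
	 */
	if (pwrite(fd, buf, 4096, 0) != 4096)
		err(1, "pwrite");

	free(buf);
	close(fd);
	return 0;
}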
@@ -1073,6 +1104,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret < 0)
iomap_dio_set_error(dio, ret);
+ /*
+ * If all the writes we issued were FUA, we don't need to flush the
+ * cache on IO completion. Clear the sync flag for this case.
+ */
+ if (dio->flags & IOMAP_DIO_WRITE_FUA)
+ dio->flags &= ~IOMAP_DIO_WRITE_SYNC;
+
if (!atomic_dec_and_test(&dio->ref)) {
if (!is_sync_kiocb(iocb))
return -EIOCBQUEUED;
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -800,6 +800,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
#define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
#define blk_queue_preempt_only(q) \
test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags)
+#define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
extern int blk_set_preempt_only(struct request_queue *q);
extern void blk_clear_preempt_only(struct request_queue *q);
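blk_queue_fua() only reports what the driver advertised on its request queue. A sketch (not from this patch) of the producer side, assuming the existing blk_queue_write_cache() helper, which sets or clears QUEUE_FLAG_WC and QUEUE_FLAG_FUA from its two booleans; the function and parameter names here are placeholders:

/*
 * Sketch only: a block driver advertising write cache and FUA support at
 * queue setup time, so that the blk_queue_fua() check added above succeeds.
 */
#include <linux/blkdev.h>

static void example_setup_write_cache(struct request_queue *q,
				      bool volatile_write_cache,
				      bool supports_fua)
{
	/*
	 * Sets or clears QUEUE_FLAG_WC and QUEUE_FLAG_FUA according to the
	 * two booleans; once QUEUE_FLAG_FUA is set, iomap can issue pure
	 * data O_DSYNC writes as REQ_FUA instead of write-plus-flush.
	 */
	blk_queue_write_cache(q, volatile_write_cache, supports_fua);
}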