@@ -20,6 +20,7 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_CALLER_COMP (1 << 26)
#define IOMAP_DIO_INLINE_COMP (1 << 27)
#define IOMAP_DIO_WRITE_THROUGH (1 << 28)
#define IOMAP_DIO_NEED_SYNC (1 << 29)
@@ -132,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
+static ssize_t iomap_dio_deferred_complete(void *data)
+{
+ return iomap_dio_complete(data);
+}
+
static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -192,6 +198,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
goto release_bio;
}
+ /*
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
+ * our completion that way to avoid an async punt to a workqueue.
+ */
+ if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ /* only polled IO cares about private cleared */
+ iocb->private = dio;
+ iocb->dio_complete = iomap_dio_deferred_complete;
+
+ /*
+ * Invoke ->ki_complete() directly. We've assigned our
+ * dio_complete callback handler, and since the issuer set
+ * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
+ * notice ->dio_complete being set and will defer calling that
+ * handler until it can be done from a safe task context.
+ *
+ * Note that the 'res' being passed in here is not important
+ * for this case. The actual completion value of the request
+ * will be gotten from dio_complete when that is run by the
+ * issuer.
+ */
+ iocb->ki_complete(iocb, 0);
+ goto release_bio;
+ }
+
/*
* Async DIO completion that requires filesystem level completion work
* gets punted to a work queue to complete as the operation may require
@@ -288,12 +319,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* after IO completion such as unwritten extent conversion) and
* the underlying device either supports FUA or doesn't have
* a volatile write cache. This allows us to avoid cache flushes
- * on IO completion.
+ * on IO completion. If we can't use writethrough and need to
+ * sync, disable in-task completions as dio completion will
+ * need to call generic_write_sync() which will do a blocking
+ * fsync / cache flush call.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
(dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
(bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
use_fua = true;
+ else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
}
/*
@@ -319,6 +355,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
pad = pos & (fs_block_size - 1);
if (pad)
iomap_dio_zero(iter, dio, pos - pad, pad);
+
+ /*
+ * If need_zeroout is set, then this is a new or unwritten
+ * extent, or dirty file metadata have not been persisted to
+ * disk. These need extra handling at completion time, so
+ * disable in-task deferred completion for those.
+ */
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
}
/*
@@ -557,6 +601,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
+ /*
+ * Flag as supporting deferred completions, if the issuer
+ * groks it. This can avoid a workqueue punt for writes.
+ * We may later clear this flag if we need to do other IO
+ * as part of this IO completion.
+ */
+ if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
+ dio->flags |= IOMAP_DIO_CALLER_COMP;
+
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size ||