
[3/3] apple-nvme: defer cache flushes by a specified amount

Message ID 20250211-nvme-fixes-v1-3-6958b3aa49fe@rosenzweig.io
State New
Series apple-nvme: bug and perf fixes

Commit Message

Alyssa Rosenzweig Feb. 11, 2025, 6:25 p.m. UTC
From: Jens Axboe <axboe@kernel.dk>

Cache flushes on the M1 NVMe controller are really slow, taking 17-18 msec
to complete. This can slow down workloads considerably: pure random writes
end up being bound by the flush latency and hence run at 55-60 IOPS.

Add a deferred flush workaround to provide better performance at
minimal risk. By default, flushes are delayed by at most 1 second, but this
is configurable.

With this workaround, a pure random write workload runs at ~12K IOPS
rather than 56 IOPS.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
---
 drivers/nvme/host/apple.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
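
For context on the numbers: if every random write is followed by a synchronous
cache flush, a 17-18 msec flush caps throughput at roughly 1000 / 17.5 ≈ 57
operations per second, which is the 55-60 IOPS figure above. The following is a
minimal userspace sketch of the deferral policy the patch implements (all
identifiers here are made up for illustration; the actual driver changes are in
the diff at the bottom of this page). A flush arriving within the grace window
of the last completed flush is completed immediately and a single real flush is
issued later from delayed work; a flush arriving outside the window is passed
through and restarts the window.

/*
 * Userspace sketch of the deferred-flush policy (illustration only, not
 * driver code).  Times are in milliseconds instead of jiffies, and the
 * delayed work is modelled by run_flush_work_if_due().
 */
#include <stdbool.h>
#include <stdio.h>

#define FLUSH_INTERVAL_MS 1000	/* mirrors the flush_interval default */

static long last_flush_ms = -FLUSH_INTERVAL_MS;	/* "long ago" at start */
static bool flush_work_pending;
static long flush_work_due_ms;

/* The delayed work: issues one real flush on behalf of all absorbed ones. */
static void run_flush_work_if_due(long now_ms)
{
	if (flush_work_pending && now_ms >= flush_work_due_ms) {
		flush_work_pending = false;
		last_flush_ms = flush_work_due_ms;	/* real flush went out */
	}
}

/* Returns true if the flush is absorbed (deferred), false if issued now. */
static bool delayed_flush(long now_ms)
{
	if (flush_work_pending)
		return true;			/* a real flush is already queued */
	if (now_ms < last_flush_ms + FLUSH_INTERVAL_MS) {
		flush_work_pending = true;	/* queue one flush for later */
		flush_work_due_ms = now_ms + FLUSH_INTERVAL_MS;
		return true;
	}
	last_flush_ms = now_ms;			/* outside the window: flush now */
	return false;
}

int main(void)
{
	const long t[] = { 0, 200, 700, 1500, 2600 };

	for (unsigned int i = 0; i < sizeof(t) / sizeof(t[0]); i++) {
		run_flush_work_if_due(t[i]);
		printf("t=%4ld ms: flush %s\n", t[i],
		       delayed_flush(t[i]) ? "deferred" : "issued");
	}
	return 0;
}

If the patch were applied, the grace window would be tunable through the
flush_interval module parameter it adds (default 1000 msec); setting it to 0
presumably disables the deferral entirely, since apple_nvme_delayed_flush()
bails out when anv->flush_interval is zero.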

Comments

Jens Axboe Feb. 11, 2025, 7:50 p.m. UTC | #1
On 2/11/25 11:25 AM, Alyssa Rosenzweig wrote:
> From: Jens Axboe <axboe@kernel.dk>
> 
> Cache flushes on the M1 NVMe controller are really slow, taking 17-18 msec
> to complete. This can slow down workloads considerably: pure random writes
> end up being bound by the flush latency and hence run at 55-60 IOPS.
> 
> Add a deferred flush workaround to provide better performance at
> minimal risk. By default, flushes are delayed by at most 1 second, but this
> is configurable.
> 
> With this workaround, a pure random write workload runs at ~12K IOPS
> rather than 56 IOPS.

I knew this one would bite us in the ass at some point down the line ;-)

I do think the feature is sane, and to my knowledge we haven't had any
issues with it since I originally wrote it 3 years ago. But I also
think it should probably go in the block layer proper, as other
devices might benefit from it.

That said, I'm fine with parking this in the apple nvme driver for
now, as we don't have to deal with multiple namespaces etc. Can
always get migrated to a core feature later, if desired.
Christoph Hellwig Feb. 13, 2025, 6:20 a.m. UTC | #2
On Tue, Feb 11, 2025 at 01:25:59PM -0500, Alyssa Rosenzweig wrote:
> From: Jens Axboe <axboe@kernel.dk>
> 
> Cache flushes on the M1 NVMe controller are really slow, taking 17-18 msec
> to complete. This can slow down workloads considerably: pure random writes
> end up being bound by the flush latency and hence run at 55-60 IOPS.
> 
> Add a deferred flush workaround to provide better performance at
> minimal risk. By default, flushes are delayed by at most 1 second, but this
> is configurable.
> 
> With this workaround, a pure random write workload runs at ~12K IOPS
> rather than 56 IOPS.

Just as last time, this really is not a driver feature.  Cache flushes
are slow on consumer hardware; it's just that Apple is worse than usual.
Breaking file system transactional guarantees by ignoring a data integrity
command in the driver is a no-go.

If we want to allow an opt-in policy for those who feel adventurous,
it belongs in the core flush state machine.  Fortunately the patch
author seems qualified to touch that :)
Alyssa Rosenzweig Feb. 13, 2025, 4:09 p.m. UTC | #3
> > Cache flushes on the M1 NVMe controller are really slow, taking 17-18 msec
> > to complete. This can slow down workloads considerably: pure random writes
> > end up being bound by the flush latency and hence run at 55-60 IOPS.
> > 
> > Add a deferred flush workaround to provide better performance at
> > minimal risk. By default, flushes are delayed by at most 1 second, but this
> > is configurable.
> > 
> > With this workaround, a pure random write workload runs at ~12K IOPS
> > rather than 56 IOPS.
> 
> Just as last time, this really is not a driver feature.  Cache flushes
> are slow on consumer hardware; it's just that Apple is worse than usual.
> Breaking file system transactional guarantees by ignoring a data integrity
> command in the driver is a no-go.
> 
> If we want to allow an opt-in policy for those who feel adventurous,
> it belongs in the core flush state machine.  Fortunately the patch
> author seems qualified to touch that :)

Fair enough. I didn't realize this patch had been discussed before; my
apologies. I'll drop this change in v2, and hopefully somebody will be
inspired later to take on that 'adventure'.

Patch

diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index a060f69558e76970bfba046cca5127243e8a51b7..2dfb0442d56195756df91e0fbc913b751c74d0ea 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -195,8 +195,20 @@  struct apple_nvme {
 
 	int irq;
 	spinlock_t lock;
+
+	/*
+	 * Delayed cache flush handling state
+	 */
+	struct nvme_ns *flush_ns;
+	unsigned long flush_interval;
+	unsigned long last_flush;
+	struct delayed_work flush_dwork;
 };
 
+unsigned int flush_interval = 1000;
+module_param(flush_interval, uint, 0644);
+MODULE_PARM_DESC(flush_interval, "Grace period in msecs between flushes");
+
 static_assert(sizeof(struct nvme_command) == 64);
 static_assert(sizeof(struct apple_nvmmu_tcb) == 128);
 
@@ -729,6 +741,26 @@  static int apple_nvme_remove_sq(struct apple_nvme *anv)
 	return nvme_submit_sync_cmd(anv->ctrl.admin_q, &c, NULL, 0);
 }
 
+static bool apple_nvme_delayed_flush(struct apple_nvme *anv, struct nvme_ns *ns,
+				     struct request *req)
+{
+	if (!anv->flush_interval || req_op(req) != REQ_OP_FLUSH)
+		return false;
+	if (delayed_work_pending(&anv->flush_dwork))
+		return true;
+	if (time_before(jiffies, anv->last_flush + anv->flush_interval)) {
+		kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &anv->flush_dwork,
+						anv->flush_interval);
+		if (WARN_ON_ONCE(anv->flush_ns && anv->flush_ns != ns))
+			goto out;
+		anv->flush_ns = ns;
+		return true;
+	}
+out:
+	anv->last_flush = jiffies;
+	return false;
+}
+
 static blk_status_t apple_nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 					const struct blk_mq_queue_data *bd)
 {
@@ -764,6 +796,12 @@  static blk_status_t apple_nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}
 
 	nvme_start_request(req);
+
+	if (apple_nvme_delayed_flush(anv, ns, req)) {
+		blk_mq_complete_request(req);
+		return BLK_STS_OK;
+	}
+
 	apple_nvme_submit_cmd(q, cmnd);
 	return BLK_STS_OK;
 
@@ -1398,6 +1436,28 @@  static void devm_apple_nvme_mempool_destroy(void *data)
 	mempool_destroy(data);
 }
 
+static void apple_nvme_flush_work(struct work_struct *work)
+{
+	struct nvme_command c = { };
+	struct apple_nvme *anv;
+	struct nvme_ns *ns;
+	int err;
+
+	anv = container_of(work, struct apple_nvme, flush_dwork.work);
+	ns = anv->flush_ns;
+	if (WARN_ON_ONCE(!ns))
+		return;
+
+	c.common.opcode = nvme_cmd_flush;
+	c.common.nsid = cpu_to_le32(anv->flush_ns->head->ns_id);
+	err = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0);
+	if (err) {
+		dev_err(anv->dev, "Deferred flush failed: %d\n", err);
+	} else {
+		anv->last_flush = jiffies;
+	}
+}
+
 static struct apple_nvme *apple_nvme_alloc(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -1553,6 +1613,14 @@  static int apple_nvme_probe(struct platform_device *pdev)
 		goto out_uninit_ctrl;
 	}
 
+	if (flush_interval) {
+		anv->flush_interval = msecs_to_jiffies(flush_interval);
+		anv->flush_ns = NULL;
+		anv->last_flush = jiffies - anv->flush_interval;
+	}
+
+	INIT_DELAYED_WORK(&anv->flush_dwork, apple_nvme_flush_work);
+
 	nvme_reset_ctrl(&anv->ctrl);
 	async_schedule(apple_nvme_async_probe, anv);
 
@@ -1590,6 +1658,7 @@  static void apple_nvme_shutdown(struct platform_device *pdev)
 {
 	struct apple_nvme *anv = platform_get_drvdata(pdev);
 
+	flush_delayed_work(&anv->flush_dwork);
 	apple_nvme_disable(anv, true);
 	if (apple_rtkit_is_running(anv->rtk)) {
 		apple_rtkit_shutdown(anv->rtk);