
[RFC] scsi: ufs-core: Do not use clk_scaling_lock in ufshcd_queuecommand()

Message ID: 20211029133751.420015-1-adrian.hunter@intel.com (mailing list archive)
State: Changes Requested
Series: [RFC] scsi: ufs-core: Do not use clk_scaling_lock in ufshcd_queuecommand()

Commit Message

Adrian Hunter Oct. 29, 2021, 1:37 p.m. UTC
Use flags and memory barriers instead of the clk_scaling_lock in
ufshcd_queuecommand().  This is done to prepare for potentially faster IOPS
in the future.

On an x86 test system (Samsung Galaxy Book S), for random 4k reads this cut
the time of ufshcd_queuecommand() from about 690 ns to 460 ns.  With larger
I/O sizes, the increased duration of DMA mapping made the difference
increasingly negligible. Random 4k read IOPS was not affected, remaining at
about 97 kIOPS.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
 drivers/scsi/ufs/ufshcd.c | 110 +++++++++++++++++++++++++++++++++++---
 drivers/scsi/ufs/ufshcd.h |   4 ++
 2 files changed, 106 insertions(+), 8 deletions(-)
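
As a reading aid, a minimal sketch of the flag-and-barrier handshake the
commit message describes follows. It is illustrative only and is not the
patch itself (the full diff is quoted in reply #7 below); queue_path() and
drain_path() are simplified stand-ins for ufshcd_queuecommand() and the
patch's ufshcd_flush_queuecommand(), and the fields mirror lrbp->issuing
and clk_scaling.in_progress.

	/* Illustrative sketch, not the actual patch. */
	struct cmd_slot {
		bool issuing;			/* mirrors lrbp->issuing */
	};

	static bool scaling_in_progress;	/* mirrors clk_scaling.in_progress */

	static int queue_path(struct cmd_slot *slot)
	{
		slot->issuing = true;
		smp_mb();			/* pairs with smp_mb() in drain_path() */
		if (READ_ONCE(scaling_in_progress)) {
			slot->issuing = false;
			return -EBUSY;		/* SCSI_MLQUEUE_HOST_BUSY in the driver */
		}
		/* ... build the request and ring the doorbell (includes wmb()) ... */
		slot->issuing = false;
		return 0;
	}

	static void drain_path(void)
	{
		WRITE_ONCE(scaling_in_progress, true);
		smp_mb();			/* pairs with smp_mb() in queue_path() */
		/*
		 * Poll until no slot has ->issuing set. Because both sides
		 * store a flag and then execute a full barrier before reading
		 * the other flag, at least one side is guaranteed to observe
		 * the other's store.
		 */
	}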

Comments

Bart Van Assche Oct. 29, 2021, 4:31 p.m. UTC | #1
On 10/29/21 6:37 AM, Adrian Hunter wrote:
> Use flags and memory barriers instead of the clk_scaling_lock in
> ufshcd_queuecommand().  This is done to prepare for potentially faster IOPS
> in the future.
> 
> On an x86 test system (Samsung Galaxy Book S), for random 4k reads this cut
> the time of ufshcd_queuecommand() from about 690 ns to 460 ns.  With larger
> I/O sizes, the increased duration of DMA mapping made the difference
> increasingly negligible. Random 4k read IOPS was not affected, remaining at
> about 97 kIOPS.

Hi Adrian,

That's an excellent performance improvement!

Earlier this week I looked into this myself and came up with a different
approach. My patches add less new code. Please take a look at the patches
below.

Thanks,

Bart.


[PATCH 1/3] scsi: ufs: Do not block SCSI requests from the EE handler

It is not necessary to prevent ufshcd_queuecommand() calls while the EE
handler is in progress. Additionally, calling ufshcd_scsi_block_requests()
only prevents new ufshcd_queuecommand() calls and does not wait until
existing calls have finished. Hence remove the ufshcd_scsi_block_requests()
and ufshcd_scsi_unblock_requests() calls from the EE handler.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
  drivers/scsi/ufs/ufshcd.c | 5 +----
  1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index bd0178b284f1..903750c836be 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -5815,12 +5815,11 @@ static void ufshcd_exception_event_handler(struct work_struct *work)
  	u32 status = 0;
  	hba = container_of(work, struct ufs_hba, eeh_work);

-	ufshcd_scsi_block_requests(hba);
  	err = ufshcd_get_ee_status(hba, &status);
  	if (err) {
  		dev_err(hba->dev, "%s: failed to get exception status %d\n",
  				__func__, err);
-		goto out;
+		return;
  	}

  	trace_ufshcd_exception_event(dev_name(hba->dev), status);
@@ -5832,8 +5831,6 @@ static void ufshcd_exception_event_handler(struct work_struct *work)
  		ufshcd_temp_exception_event_handler(hba, status);

  	ufs_debugfs_exception_event(hba, status);
-out:
-	ufshcd_scsi_unblock_requests(hba);
  }

  /* Complete requests that have door-bell cleared */



Subject: [PATCH 2/3] scsi: ufs: Fix a race condition related to clock scaling

Calling ufshcd_scsi_block_requests() after ungate_work has been queued
does not guarantee that SCSI requests are blocked before the clock ungating
work starts. Fix this race condition by moving the
ufshcd_scsi_block_requests() call into ufshcd_ungate_work().

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
  drivers/scsi/ufs/ufshcd.c | 7 ++++---
  1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 903750c836be..f35e7ab9054e 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -1624,6 +1624,8 @@ static void ufshcd_ungate_work(struct work_struct *work)

  	cancel_delayed_work_sync(&hba->clk_gating.gate_work);

+	ufshcd_scsi_block_requests(hba);
+
  	spin_lock_irqsave(hba->host->host_lock, flags);
  	if (hba->clk_gating.state == CLKS_ON) {
  		spin_unlock_irqrestore(hba->host->host_lock, flags);
@@ -1714,9 +1716,8 @@ int ufshcd_hold(struct ufs_hba *hba, bool async)
  		hba->clk_gating.state = REQ_CLKS_ON;
  		trace_ufshcd_clk_gating(dev_name(hba->dev),
  					hba->clk_gating.state);
-		if (queue_work(hba->clk_gating.clk_gating_workq,
-			       &hba->clk_gating.ungate_work))
-			ufshcd_scsi_block_requests(hba);
+		queue_work(hba->clk_gating.clk_gating_workq,
+			   &hba->clk_gating.ungate_work);
  		/*
  		 * fall through to check if we should wait for this
  		 * work to be done or not.



Subject: [PATCH 3/3] scsi: ufs: Remove the clock scaling lock

Remove the clock scaling lock since it is a performance bottleneck for the
hot path. Freeze request queues instead of calling scsi_block_requests()
from inside ufshcd_clock_scaling_prepare(). Use synchronize_rcu() to
wait for ongoing ufshcd_queuecommand() calls.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
  drivers/scsi/ufs/ufshcd.c | 146 ++++++++++----------------------------
  drivers/scsi/ufs/ufshcd.h |   1 -
  2 files changed, 38 insertions(+), 109 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index f35e7ab9054e..1b694be807bd 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -1069,65 +1069,6 @@ static bool ufshcd_is_devfreq_scaling_required(struct ufs_hba *hba,
  	return false;
  }

-static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba,
-					u64 wait_timeout_us)
-{
-	unsigned long flags;
-	int ret = 0;
-	u32 tm_doorbell;
-	u32 tr_doorbell;
-	bool timeout = false, do_last_check = false;
-	ktime_t start;
-
-	ufshcd_hold(hba, false);
-	spin_lock_irqsave(hba->host->host_lock, flags);
-	/*
-	 * Wait for all the outstanding tasks/transfer requests.
-	 * Verify by checking the doorbell registers are clear.
-	 */
-	start = ktime_get();
-	do {
-		if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) {
-			ret = -EBUSY;
-			goto out;
-		}
-
-		tm_doorbell = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL);
-		tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
-		if (!tm_doorbell && !tr_doorbell) {
-			timeout = false;
-			break;
-		} else if (do_last_check) {
-			break;
-		}
-
-		spin_unlock_irqrestore(hba->host->host_lock, flags);
-		schedule();
-		if (ktime_to_us(ktime_sub(ktime_get(), start)) >
-		    wait_timeout_us) {
-			timeout = true;
-			/*
-			 * We might have scheduled out for long time so make
-			 * sure to check if doorbells are cleared by this time
-			 * or not.
-			 */
-			do_last_check = true;
-		}
-		spin_lock_irqsave(hba->host->host_lock, flags);
-	} while (tm_doorbell || tr_doorbell);
-
-	if (timeout) {
-		dev_err(hba->dev,
-			"%s: timedout waiting for doorbell to clear (tm=0x%x, tr=0x%x)\n",
-			__func__, tm_doorbell, tr_doorbell);
-		ret = -EBUSY;
-	}
-out:
-	spin_unlock_irqrestore(hba->host->host_lock, flags);
-	ufshcd_release(hba);
-	return ret;
-}
-
  /**
   * ufshcd_scale_gear - scale up/down UFS gear
   * @hba: per adapter instance
@@ -1175,20 +1116,22 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)

  static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
  {
-	#define DOORBELL_CLR_TOUT_US		(1000 * 1000) /* 1 sec */
  	int ret = 0;
+
  	/*
-	 * make sure that there are no outstanding requests when
-	 * clock scaling is in progress
+	 * Make sure that there are no outstanding requests while clock scaling
+	 * is in progress. Since the error handler may submit TMFs, limit the
+	 * time during which to wait for the doorbell registers to clear in
+	 * order not to block the UFS error handler.
  	 */
-	ufshcd_scsi_block_requests(hba);
-	down_write(&hba->clk_scaling_lock);
-
-	if (!hba->clk_scaling.is_allowed ||
-	    ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) {
+	blk_freeze_queue_start(hba->cmd_queue);
+	blk_freeze_queue_start(hba->tmf_queue);
+	if (blk_mq_freeze_queue_wait_timeout(hba->cmd_queue, HZ) <= 0 ||
+	    blk_mq_freeze_queue_wait_timeout(hba->tmf_queue, HZ / 10) <= 0 ||
+	    !READ_ONCE(hba->clk_scaling.is_allowed)) {
  		ret = -EBUSY;
-		up_write(&hba->clk_scaling_lock);
-		ufshcd_scsi_unblock_requests(hba);
+		blk_mq_unfreeze_queue(hba->tmf_queue);
+		blk_mq_unfreeze_queue(hba->cmd_queue);
  		goto out;
  	}

@@ -1199,13 +1142,10 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
  	return ret;
  }

-static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
+static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba)
  {
-	if (writelock)
-		up_write(&hba->clk_scaling_lock);
-	else
-		up_read(&hba->clk_scaling_lock);
-	ufshcd_scsi_unblock_requests(hba);
+	blk_mq_unfreeze_queue(hba->tmf_queue);
+	blk_mq_unfreeze_queue(hba->cmd_queue);
  	ufshcd_release(hba);
  }

@@ -1220,8 +1160,7 @@ static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
   */
  static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up)
  {
-	int ret = 0;
-	bool is_writelock = true;
+	int ret;

  	ret = ufshcd_clock_scaling_prepare(hba);
  	if (ret)
@@ -1249,14 +1188,19 @@ static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up)
  			goto out_unprepare;
  		}
  	}
-
-	/* Enable Write Booster if we have scaled up else disable it */
-	downgrade_write(&hba->clk_scaling_lock);
-	is_writelock = false;
+	ufshcd_scsi_block_requests(hba);
+	ufshcd_clock_scaling_unprepare(hba);
+	/*
+	 * Enable the Write Booster if we have scaled up else disable it.
+	 * ufshcd_wb_toggle() calls ufshcd_exec_dev_cmd() indirectly and hence
+	 * must be called after ufshcd_clock_scaling_unprepare().
+	 */
  	ufshcd_wb_toggle(hba, scale_up);
+	ufshcd_scsi_unblock_requests(hba);
+	return ret;

  out_unprepare:
-	ufshcd_clock_scaling_unprepare(hba, is_writelock);
+	ufshcd_clock_scaling_unprepare(hba);
  	return ret;
  }

@@ -2696,9 +2640,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)

  	WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);

-	if (!down_read_trylock(&hba->clk_scaling_lock))
-		return SCSI_MLQUEUE_HOST_BUSY;
-
  	switch (hba->ufshcd_state) {
  	case UFSHCD_STATE_OPERATIONAL:
  		break;
@@ -2782,9 +2723,8 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
  	}

  	ufshcd_send_command(hba, tag);
-out:
-	up_read(&hba->clk_scaling_lock);

+out:
  	if (ufs_trigger_eh()) {
  		unsigned long flags;

@@ -2946,8 +2886,6 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
  	int err;
  	int tag;

-	down_read(&hba->clk_scaling_lock);
-
  	/*
  	 * Get free slot, sleep if slots are unavailable.
  	 * Even though we use wait_event() which sleeps indefinitely,
@@ -2982,7 +2920,6 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
  out:
  	blk_put_request(req);
  out_unlock:
-	up_read(&hba->clk_scaling_lock);
  	return err;
  }

@@ -5936,9 +5873,7 @@ void ufshcd_schedule_eh_work(struct ufs_hba *hba)

  static void ufshcd_clk_scaling_allow(struct ufs_hba *hba, bool allow)
  {
-	down_write(&hba->clk_scaling_lock);
-	hba->clk_scaling.is_allowed = allow;
-	up_write(&hba->clk_scaling_lock);
+	WRITE_ONCE(hba->clk_scaling.is_allowed, allow);
  }

  static void ufshcd_clk_scaling_suspend(struct ufs_hba *hba, bool suspend)
@@ -5985,9 +5920,12 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
  		ufshcd_clk_scaling_allow(hba, false);
  	}
  	ufshcd_scsi_block_requests(hba);
-	/* Drain ufshcd_queuecommand() */
-	down_write(&hba->clk_scaling_lock);
-	up_write(&hba->clk_scaling_lock);
+	/*
+	 * Drain ufshcd_queuecommand(). Since ufshcd_queuecommand() does not
+	 * sleep, calling synchronize_rcu() is sufficient to wait for ongoing
+	 * ufshcd_queuecommand calls.
+	 */
+	synchronize_rcu();
  	cancel_work_sync(&hba->eeh_work);
  }

@@ -6212,11 +6150,8 @@ static void ufshcd_err_handler(struct work_struct *work)
  	 */
  	if (needs_restore) {
  		spin_unlock_irqrestore(hba->host->host_lock, flags);
-		/*
-		 * Hold the scaling lock just in case dev cmds
-		 * are sent via bsg and/or sysfs.
-		 */
-		down_write(&hba->clk_scaling_lock);
+		/* Block TMF submission, e.g. through BSG or sysfs. */
+		blk_mq_freeze_queue(hba->tmf_queue);
  		hba->force_pmc = true;
  		pmc_err = ufshcd_config_pwr_mode(hba, &(hba->pwr_info));
  		if (pmc_err) {
@@ -6226,7 +6161,7 @@ static void ufshcd_err_handler(struct work_struct *work)
  		}
  		hba->force_pmc = false;
  		ufshcd_print_pwr_info(hba);
-		up_write(&hba->clk_scaling_lock);
+		blk_mq_unfreeze_queue(hba->tmf_queue);
  		spin_lock_irqsave(hba->host->host_lock, flags);
  	}

@@ -6725,8 +6660,6 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
  	int tag;
  	u8 upiu_flags;

-	down_read(&hba->clk_scaling_lock);
-
  	req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
  	if (IS_ERR(req)) {
  		err = PTR_ERR(req);
@@ -6804,7 +6737,6 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,

  	blk_put_request(req);
  out_unlock:
-	up_read(&hba->clk_scaling_lock);
  	return err;
  }

@@ -7982,7 +7914,7 @@ static int ufshcd_add_lus(struct ufs_hba *hba)
  			&hba->pwr_info,
  			sizeof(struct ufs_pa_layer_attr));
  		hba->clk_scaling.saved_pwr_info.is_valid = true;
-		hba->clk_scaling.is_allowed = true;
+		WRITE_ONCE(hba->clk_scaling.is_allowed, true);

  		ret = ufshcd_devfreq_init(hba);
  		if (ret)
@@ -9558,8 +9490,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
  	/* Initialize mutex for exception event control */
  	mutex_init(&hba->ee_ctrl_mutex);

-	init_rwsem(&hba->clk_scaling_lock);
-
  	ufshcd_init_clk_gating(hba);

  	ufshcd_init_clk_scaling(hba);
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 0820d409585a..5514528cca58 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -902,7 +902,6 @@ struct ufs_hba {
  	enum bkops_status urgent_bkops_lvl;
  	bool is_urgent_bkops_lvl_checked;

-	struct rw_semaphore clk_scaling_lock;
  	unsigned char desc_size[QUERY_DESC_IDN_MAX];
  	atomic_t scsi_block_reqs_cnt;
Adrian Hunter Nov. 1, 2021, 9:16 a.m. UTC | #2
On 29/10/2021 19:31, Bart Van Assche wrote:
> On 10/29/21 6:37 AM, Adrian Hunter wrote:
>> Use flags and memory barriers instead of the clk_scaling_lock in
>> ufshcd_queuecommand().  This is done to prepare for potentially faster IOPS
>> in the future.
>>
>> On an x86 test system (Samsung Galaxy Book S), for random 4k reads this cut
>> the time of ufshcd_queuecommand() from about 690 ns to 460 ns.  With larger
>> I/O sizes, the increased duration of DMA mapping made the difference
>> increasingly negligible. Random 4k read IOPS was not affected, remaining at
>> about 97 kIOPS.
> 
> Hi Adrian,
> 
> That's an excellent performance improvement!
> 
> Earlier this week I looked into this myself and came up with a different
> approach. My patches add less new code. Please take a look at the patches
> below.
> 
> Thanks,
> 
> Bart.
> 
> 
> [PATCH 1/3] scsi: ufs: Do not block SCSI requests from the EE handler
> 
> It is not necessary to prevent ufshcd_queuecommand() calls while the EE
> handler is in progress. Additionally, calling ufshcd_scsi_block_requests()
> only prevents new ufshcd_queuecommand() calls and does not wait until
> existing calls have finished. Hence remove the ufshcd_scsi_block_requests()
> and ufshcd_scsi_unblock_requests() calls from the EE handler.
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  drivers/scsi/ufs/ufshcd.c | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index bd0178b284f1..903750c836be 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -5815,12 +5815,11 @@ static void ufshcd_exception_event_handler(struct work_struct *work)
>      u32 status = 0;
>      hba = container_of(work, struct ufs_hba, eeh_work);
> 
> -    ufshcd_scsi_block_requests(hba);

I suspect that this was to facilitate the quick handling of urgent background ops,
so I am not sure it should be removed.

>      err = ufshcd_get_ee_status(hba, &status);
>      if (err) {
>          dev_err(hba->dev, "%s: failed to get exception status %d\n",
>                  __func__, err);
> -        goto out;
> +        return;
>      }
> 
>      trace_ufshcd_exception_event(dev_name(hba->dev), status);
> @@ -5832,8 +5831,6 @@ static void ufshcd_exception_event_handler(struct work_struct *work)
>          ufshcd_temp_exception_event_handler(hba, status);
> 
>      ufs_debugfs_exception_event(hba, status);
> -out:
> -    ufshcd_scsi_unblock_requests(hba);
>  }
> 
>  /* Complete requests that have door-bell cleared */
> 
> 
> 
> Subject: [PATCH 2/3] scsi: ufs: Fix a race condition related to clock scaling
> 
> Calling ufshcd_scsi_block_requests() after ungate_work has been queued
> does not guarantee that SCSI requests are blocked before the clock ungating
> work starts. Fix this race condition by moving the
> ufshcd_scsi_block_requests() call into ufshcd_ungate_work().
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  drivers/scsi/ufs/ufshcd.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 903750c836be..f35e7ab9054e 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -1624,6 +1624,8 @@ static void ufshcd_ungate_work(struct work_struct *work)
> 
>      cancel_delayed_work_sync(&hba->clk_gating.gate_work);
> 
> +    ufshcd_scsi_block_requests(hba);
> +
>      spin_lock_irqsave(hba->host->host_lock, flags);
>      if (hba->clk_gating.state == CLKS_ON) {
>          spin_unlock_irqrestore(hba->host->host_lock, flags);
> @@ -1714,9 +1716,8 @@ int ufshcd_hold(struct ufs_hba *hba, bool async)
>          hba->clk_gating.state = REQ_CLKS_ON;
>          trace_ufshcd_clk_gating(dev_name(hba->dev),
>                      hba->clk_gating.state);
> -        if (queue_work(hba->clk_gating.clk_gating_workq,
> -                   &hba->clk_gating.ungate_work))
> -            ufshcd_scsi_block_requests(hba);
> +        queue_work(hba->clk_gating.clk_gating_workq,
> +               &hba->clk_gating.ungate_work);
>          /*
>           * fall through to check if we should wait for this
>           * work to be done or not.
> 
> 
> 
> Subject: [PATCH 3/3] scsi: ufs: Remove the clock scaling lock
> 
> Remove the clock scaling lock since it is a performance bottleneck for the
> hot path. Freeze request queues instead of calling scsi_block_requests()
> from inside ufshcd_clock_scaling_prepare(). Use synchronize_rcu() to
> wait for ongoing ufshcd_queuecommand() calls.
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  drivers/scsi/ufs/ufshcd.c | 146 ++++++++++----------------------------
>  drivers/scsi/ufs/ufshcd.h |   1 -
>  2 files changed, 38 insertions(+), 109 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index f35e7ab9054e..1b694be807bd 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -1069,65 +1069,6 @@ static bool ufshcd_is_devfreq_scaling_required(struct ufs_hba *hba,
>      return false;
>  }
> 
> -static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba,
> -                    u64 wait_timeout_us)
> -{
> -    unsigned long flags;
> -    int ret = 0;
> -    u32 tm_doorbell;
> -    u32 tr_doorbell;
> -    bool timeout = false, do_last_check = false;
> -    ktime_t start;
> -
> -    ufshcd_hold(hba, false);
> -    spin_lock_irqsave(hba->host->host_lock, flags);
> -    /*
> -     * Wait for all the outstanding tasks/transfer requests.
> -     * Verify by checking the doorbell registers are clear.
> -     */
> -    start = ktime_get();
> -    do {
> -        if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) {
> -            ret = -EBUSY;
> -            goto out;
> -        }
> -
> -        tm_doorbell = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL);
> -        tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
> -        if (!tm_doorbell && !tr_doorbell) {
> -            timeout = false;
> -            break;
> -        } else if (do_last_check) {
> -            break;
> -        }
> -
> -        spin_unlock_irqrestore(hba->host->host_lock, flags);
> -        schedule();
> -        if (ktime_to_us(ktime_sub(ktime_get(), start)) >
> -            wait_timeout_us) {
> -            timeout = true;
> -            /*
> -             * We might have scheduled out for long time so make
> -             * sure to check if doorbells are cleared by this time
> -             * or not.
> -             */
> -            do_last_check = true;
> -        }
> -        spin_lock_irqsave(hba->host->host_lock, flags);
> -    } while (tm_doorbell || tr_doorbell);
> -
> -    if (timeout) {
> -        dev_err(hba->dev,
> -            "%s: timedout waiting for doorbell to clear (tm=0x%x, tr=0x%x)\n",
> -            __func__, tm_doorbell, tr_doorbell);
> -        ret = -EBUSY;
> -    }
> -out:
> -    spin_unlock_irqrestore(hba->host->host_lock, flags);
> -    ufshcd_release(hba);
> -    return ret;
> -}
> -
>  /**
>   * ufshcd_scale_gear - scale up/down UFS gear
>   * @hba: per adapter instance
> @@ -1175,20 +1116,22 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)
> 
>  static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
>  {
> -    #define DOORBELL_CLR_TOUT_US        (1000 * 1000) /* 1 sec */
>      int ret = 0;
> +
>      /*
> -     * make sure that there are no outstanding requests when
> -     * clock scaling is in progress
> +     * Make sure that there are no outstanding requests while clock scaling
> +     * is in progress. Since the error handler may submit TMFs, limit the
> +     * time during which to wait for the doorbell registers to clear in
> +     * order not to block the UFS error handler.
>       */
> -    ufshcd_scsi_block_requests(hba);
> -    down_write(&hba->clk_scaling_lock);
> -
> -    if (!hba->clk_scaling.is_allowed ||
> -        ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) {
> +    blk_freeze_queue_start(hba->cmd_queue);
> +    blk_freeze_queue_start(hba->tmf_queue);
> +    if (blk_mq_freeze_queue_wait_timeout(hba->cmd_queue, HZ) <= 0 ||
> +        blk_mq_freeze_queue_wait_timeout(hba->tmf_queue, HZ / 10) <= 0 ||
> +        !READ_ONCE(hba->clk_scaling.is_allowed)) {
>          ret = -EBUSY;
> -        up_write(&hba->clk_scaling_lock);
> -        ufshcd_scsi_unblock_requests(hba);
> +        blk_mq_unfreeze_queue(hba->tmf_queue);
> +        blk_mq_unfreeze_queue(hba->cmd_queue);
>          goto out;
>      }
> 
> @@ -1199,13 +1142,10 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
>      return ret;
>  }
> 
> -static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
> +static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba)
>  {
> -    if (writelock)
> -        up_write(&hba->clk_scaling_lock);
> -    else
> -        up_read(&hba->clk_scaling_lock);
> -    ufshcd_scsi_unblock_requests(hba);
> +    blk_mq_unfreeze_queue(hba->tmf_queue);
> +    blk_mq_unfreeze_queue(hba->cmd_queue);
>      ufshcd_release(hba);
>  }
> 
> @@ -1220,8 +1160,7 @@ static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
>   */
>  static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up)
>  {
> -    int ret = 0;
> -    bool is_writelock = true;
> +    int ret;
> 
>      ret = ufshcd_clock_scaling_prepare(hba);
>      if (ret)
> @@ -1249,14 +1188,19 @@ static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up)
>              goto out_unprepare;
>          }
>      }
> -
> -    /* Enable Write Booster if we have scaled up else disable it */
> -    downgrade_write(&hba->clk_scaling_lock);
> -    is_writelock = false;
> +    ufshcd_scsi_block_requests(hba);
> +    ufshcd_clock_scaling_unprepare(hba);
> +    /*
> +     * Enable the Write Booster if we have scaled up else disable it.
> +     * ufshcd_wb_toggle() calls ufshcd_exec_dev_cmd() indirectly and hence
> +     * must be called after ufshcd_clock_scaling_unprepare().
> +     */
>      ufshcd_wb_toggle(hba, scale_up);
> +    ufshcd_scsi_unblock_requests(hba);
> +    return ret;
> 
>  out_unprepare:
> -    ufshcd_clock_scaling_unprepare(hba, is_writelock);
> +    ufshcd_clock_scaling_unprepare(hba);
>      return ret;
>  }
> 
> @@ -2696,9 +2640,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
> 
>      WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);
> 
> -    if (!down_read_trylock(&hba->clk_scaling_lock))
> -        return SCSI_MLQUEUE_HOST_BUSY;
> -
>      switch (hba->ufshcd_state) {
>      case UFSHCD_STATE_OPERATIONAL:
>          break;
> @@ -2782,9 +2723,8 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>      }
> 
>      ufshcd_send_command(hba, tag);
> -out:
> -    up_read(&hba->clk_scaling_lock);
> 
> +out:
>      if (ufs_trigger_eh()) {
>          unsigned long flags;
> 
> @@ -2946,8 +2886,6 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
>      int err;
>      int tag;
> 
> -    down_read(&hba->clk_scaling_lock);
> -
>      /*
>       * Get free slot, sleep if slots are unavailable.
>       * Even though we use wait_event() which sleeps indefinitely,
> @@ -2982,7 +2920,6 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba *hba,
>  out:
>      blk_put_request(req);
>  out_unlock:
> -    up_read(&hba->clk_scaling_lock);
>      return err;
>  }
> 
> @@ -5936,9 +5873,7 @@ void ufshcd_schedule_eh_work(struct ufs_hba *hba)
> 
>  static void ufshcd_clk_scaling_allow(struct ufs_hba *hba, bool allow)
>  {
> -    down_write(&hba->clk_scaling_lock);
> -    hba->clk_scaling.is_allowed = allow;
> -    up_write(&hba->clk_scaling_lock);
> +    WRITE_ONCE(hba->clk_scaling.is_allowed, allow);
>  }
> 
>  static void ufshcd_clk_scaling_suspend(struct ufs_hba *hba, bool suspend)
> @@ -5985,9 +5920,12 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>          ufshcd_clk_scaling_allow(hba, false);
>      }
>      ufshcd_scsi_block_requests(hba);
> -    /* Drain ufshcd_queuecommand() */
> -    down_write(&hba->clk_scaling_lock);
> -    up_write(&hba->clk_scaling_lock);
> +    /*
> +     * Drain ufshcd_queuecommand(). Since ufshcd_queuecommand() does not
> +     * sleep, calling synchronize_rcu() is sufficient to wait for ongoing
> +     * ufshcd_queuecommand calls.
> +     */
> +    synchronize_rcu();

This depends upon block layer internals, so it must be called via a block
layer function i.e. the block layer must guarantee this will always work.

Also, scsi_unjam_host() does not look like it uses RCU when issuing
requests.

>      cancel_work_sync(&hba->eeh_work);
>  }
> 
> @@ -6212,11 +6150,8 @@ static void ufshcd_err_handler(struct work_struct *work)
>       */
>      if (needs_restore) {
>          spin_unlock_irqrestore(hba->host->host_lock, flags);
> -        /*
> -         * Hold the scaling lock just in case dev cmds
> -         * are sent via bsg and/or sysfs.
> -         */
> -        down_write(&hba->clk_scaling_lock);
> +        /* Block TMF submission, e.g. through BSG or sysfs. */

What about dev cmds?

Also, there is the outstanding question of synchronization for the call to
ufshcd_reset_and_restore() further down.

> +        blk_mq_freeze_queue(hba->tmf_queue);
>          hba->force_pmc = true;
>          pmc_err = ufshcd_config_pwr_mode(hba, &(hba->pwr_info));
>          if (pmc_err) {
> @@ -6226,7 +6161,7 @@ static void ufshcd_err_handler(struct work_struct *work)
>          }
>          hba->force_pmc = false;
>          ufshcd_print_pwr_info(hba);
> -        up_write(&hba->clk_scaling_lock);
> +        blk_mq_unfreeze_queue(hba->tmf_queue);
>          spin_lock_irqsave(hba->host->host_lock, flags);
>      }
> 
> @@ -6725,8 +6660,6 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
>      int tag;
>      u8 upiu_flags;
> 
> -    down_read(&hba->clk_scaling_lock);
> -
>      req = blk_get_request(q, REQ_OP_DRV_OUT, 0);
>      if (IS_ERR(req)) {
>          err = PTR_ERR(req);
> @@ -6804,7 +6737,6 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
> 
>      blk_put_request(req);
>  out_unlock:
> -    up_read(&hba->clk_scaling_lock);
>      return err;
>  }
> 
> @@ -7982,7 +7914,7 @@ static int ufshcd_add_lus(struct ufs_hba *hba)
>              &hba->pwr_info,
>              sizeof(struct ufs_pa_layer_attr));
>          hba->clk_scaling.saved_pwr_info.is_valid = true;
> -        hba->clk_scaling.is_allowed = true;
> +        WRITE_ONCE(hba->clk_scaling.is_allowed, true);
> 
>          ret = ufshcd_devfreq_init(hba);
>          if (ret)
> @@ -9558,8 +9490,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
>      /* Initialize mutex for exception event control */
>      mutex_init(&hba->ee_ctrl_mutex);
> 
> -    init_rwsem(&hba->clk_scaling_lock);
> -
>      ufshcd_init_clk_gating(hba);
> 
>      ufshcd_init_clk_scaling(hba);
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index 0820d409585a..5514528cca58 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -902,7 +902,6 @@ struct ufs_hba {
>      enum bkops_status urgent_bkops_lvl;
>      bool is_urgent_bkops_lvl_checked;
> 
> -    struct rw_semaphore clk_scaling_lock;
>      unsigned char desc_size[QUERY_DESC_IDN_MAX];
>      atomic_t scsi_block_reqs_cnt;
>
Bart Van Assche Nov. 1, 2021, 6:35 p.m. UTC | #3
On 11/1/21 2:16 AM, Adrian Hunter wrote:
> On 29/10/2021 19:31, Bart Van Assche wrote:
>> @@ -5985,9 +5920,12 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>>           ufshcd_clk_scaling_allow(hba, false);
>>       }
>>       ufshcd_scsi_block_requests(hba);
>> -    /* Drain ufshcd_queuecommand() */
>> -    down_write(&hba->clk_scaling_lock);
>> -    up_write(&hba->clk_scaling_lock);
>> +    /*
>> +     * Drain ufshcd_queuecommand(). Since ufshcd_queuecommand() does not
>> +     * sleep, calling synchronize_rcu() is sufficient to wait for ongoing
>> +     * ufshcd_queuecommand calls.
>> +     */
>> +    synchronize_rcu();
> 
> This depends upon block layer internals, so it must be called via a block
> layer function i.e. the block layer must guarantee this will always work.
> Also, scsi_unjam_host() does not look like it uses RCU when issuing
> requests.

I will add an rcu_read_lock() / rcu_read_unlock() pair in ufshcd_queuecommand().
I think that addresses both points mentioned above.
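
Roughly, the pairing being proposed looks like this (sketch only; the actual
change appears in the updated patch later in this thread):

	/* ufshcd_queuecommand(): the submission path becomes an RCU
	 * read-side critical section and must not sleep in between. */
	rcu_read_lock();
	/* ... check hba state, set up the LRB, ring the doorbell ... */
	rcu_read_unlock();

	/* ufshcd_err_handling_prepare(): */
	ufshcd_scsi_block_requests(hba);	/* stop new submissions */
	synchronize_rcu();			/* wait until every ufshcd_queuecommand()
						 * that already entered its read-side
						 * section has left it */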

>>       cancel_work_sync(&hba->eeh_work);
>>   }
>>
>> @@ -6212,11 +6150,8 @@ static void ufshcd_err_handler(struct work_struct *work)
>>        */
>>       if (needs_restore) {
>>           spin_unlock_irqrestore(hba->host->host_lock, flags);
>> -        /*
>> -         * Hold the scaling lock just in case dev cmds
>> -         * are sent via bsg and/or sysfs.
>> -         */
>> -        down_write(&hba->clk_scaling_lock);
>> +        /* Block TMF submission, e.g. through BSG or sysfs. */
> 
> What about dev cmds?

ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd() both allocate a
request from hba->cmd_queue. blk_get_request() indirectly increments
q->q_usage_counter and blk_mq_freeze_queue() waits until that counter drops
to zero. Hence, hba->cmd_queue freezing only finishes after all pending dev
cmds have finished. Additionally, if hba->cmd_queue is frozen,
blk_get_request() blocks until it is unfrozen. So dev cmds are covered.
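
Sketched out (illustrative only, using the same block layer calls as the
patch; the pairing below is what the paragraph above describes):

	/* Dev-cmd side, e.g. ufshcd_exec_dev_cmd(): */
	req = blk_get_request(hba->cmd_queue, REQ_OP_DRV_OUT, 0);
						/* takes a q_usage_counter reference;
						 * blocks while the queue is frozen */
	/* ... issue the device command and wait for completion ... */
	blk_put_request(req);			/* drops the reference */

	/* Clock-scaling side, ufshcd_clock_scaling_prepare(): */
	blk_freeze_queue_start(hba->cmd_queue);	/* new blk_get_request() calls now block */
	if (blk_mq_freeze_queue_wait_timeout(hba->cmd_queue, HZ) <= 0) {
		/* timed out: a dev cmd still holds a q_usage_counter reference */
		blk_mq_unfreeze_queue(hba->cmd_queue);
		return -EBUSY;
	}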

> Also, there is the outstanding question of synchronization for the call to
> ufshcd_reset_and_restore() further down.

Please clarify this comment further. I assume that you are referring to the
ufshcd_reset_and_restore() call guarded by if (needs_reset)? It is not clear
to me what the outstanding question is?

Thanks,

Bart.
Adrian Hunter Nov. 2, 2021, 6:11 a.m. UTC | #4
On 01/11/2021 20:35, Bart Van Assche wrote:
> On 11/1/21 2:16 AM, Adrian Hunter wrote:
>> On 29/10/2021 19:31, Bart Van Assche wrote:
>>> @@ -5985,9 +5920,12 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>>>           ufshcd_clk_scaling_allow(hba, false);
>>>       }
>>>       ufshcd_scsi_block_requests(hba);
>>> -    /* Drain ufshcd_queuecommand() */
>>> -    down_write(&hba->clk_scaling_lock);
>>> -    up_write(&hba->clk_scaling_lock);
>>> +    /*
>>> +     * Drain ufshcd_queuecommand(). Since ufshcd_queuecommand() does not
>>> +     * sleep, calling synchronize_rcu() is sufficient to wait for ongoing
>>> +     * ufshcd_queuecommand calls.
>>> +     */
>>> +    synchronize_rcu();
>>
>> This depends upon block layer internals, so it must be called via a block
>> layer function i.e. the block layer must guarantee this will always work.
>> Also, scsi_unjam_host() does not look like it uses RCU when issuing
>> requests.
> 
> I will add an rcu_read_lock() / rcu_read_unlock() pair in ufshcd_queuecommand().
> I think that addresses both points mentioned above.
> 
>>>       cancel_work_sync(&hba->eeh_work);
>>>   }
>>>
>>> @@ -6212,11 +6150,8 @@ static void ufshcd_err_handler(struct work_struct *work)
>>>        */
>>>       if (needs_restore) {
>>>           spin_unlock_irqrestore(hba->host->host_lock, flags);
>>> -        /*
>>> -         * Hold the scaling lock just in case dev cmds
>>> -         * are sent via bsg and/or sysfs.
>>> -         */
>>> -        down_write(&hba->clk_scaling_lock);
>>> +        /* Block TMF submission, e.g. through BSG or sysfs. */
>>
>> What about dev cmds?
> 
> ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd() both allocate a
> request from hba->cmd_queue. blk_get_request() indirectly increments
> q->q_usage_counter and blk_mq_freeze_queue() waits until that counter drops
> to zero. Hence, hba->cmd_queue freezing only finishes after all pending dev
> cmds have finished. Additionally, if hba->cmd_queue is frozen,
> blk_get_request() blocks until it is unfrozen. So dev cmds are covered.

The queue is not usually frozen when the error handler runs.

> 
>> Also, there is the outstanding question of synchronization for the call to
>> ufshcd_reset_and_restore() further down.
> 
> Please clarify this comment further. I assume that you are referring to the
> ufshcd_reset_and_restore() call guarded by if (needs_reset)? It is not clear
> to me what the outstanding question is?

What is to stop it racing with BSG, sysfs, debugfs, abort, reset etc?

> 
> Thanks,
> 
> Bart.
Bart Van Assche Nov. 2, 2021, 8:49 p.m. UTC | #5
On 11/1/21 11:11 PM, Adrian Hunter wrote:
> On 01/11/2021 20:35, Bart Van Assche wrote:
>> On 11/1/21 2:16 AM, Adrian Hunter wrote:
>>> Also, there is the outstanding question of synchronization for the call to
>>> ufshcd_reset_and_restore() further down.
>>
>> Please clarify this comment further. I assume that you are referring to the
>> ufshcd_reset_and_restore() call guarded by if (needs_reset)? It is not clear
>> to me what the outstanding question is?
> 
> What is to stop it racing with BSG, sysfs, debugfs, abort, reset etc?

The patch below should address all feedback that has been provided so far.
Instead of removing the clock scaling lock entirely, it is only removed from
ufshcd_queuecommand().


Subject: [PATCH] scsi: ufs: Remove the clock scaling lock from the command queueing code

Remove the clock scaling lock from ufshcd_queuecommand() since it is a
performance bottleneck. Freeze request queues from inside
ufshcd_clock_scaling_prepare() to wait for ongoing SCSI and dev cmds.
Insert a rcu_read_lock() / rcu_read_unlock() pair in ufshcd_queuecommand().
Use synchronize_rcu() to wait for ongoing ufshcd_queuecommand() calls.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
  drivers/scsi/ufs/ufshcd.c | 97 ++++++++++-----------------------------
  drivers/scsi/ufs/ufshcd.h |  1 +
  2 files changed, 26 insertions(+), 72 deletions(-)

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 35a1af70f705..9d964b979aa2 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -1069,65 +1069,6 @@ static bool ufshcd_is_devfreq_scaling_required(struct ufs_hba *hba,
  	return false;
  }

-static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba,
-					u64 wait_timeout_us)
-{
-	unsigned long flags;
-	int ret = 0;
-	u32 tm_doorbell;
-	u32 tr_doorbell;
-	bool timeout = false, do_last_check = false;
-	ktime_t start;
-
-	ufshcd_hold(hba, false);
-	spin_lock_irqsave(hba->host->host_lock, flags);
-	/*
-	 * Wait for all the outstanding tasks/transfer requests.
-	 * Verify by checking the doorbell registers are clear.
-	 */
-	start = ktime_get();
-	do {
-		if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) {
-			ret = -EBUSY;
-			goto out;
-		}
-
-		tm_doorbell = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL);
-		tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
-		if (!tm_doorbell && !tr_doorbell) {
-			timeout = false;
-			break;
-		} else if (do_last_check) {
-			break;
-		}
-
-		spin_unlock_irqrestore(hba->host->host_lock, flags);
-		schedule();
-		if (ktime_to_us(ktime_sub(ktime_get(), start)) >
-		    wait_timeout_us) {
-			timeout = true;
-			/*
-			 * We might have scheduled out for long time so make
-			 * sure to check if doorbells are cleared by this time
-			 * or not.
-			 */
-			do_last_check = true;
-		}
-		spin_lock_irqsave(hba->host->host_lock, flags);
-	} while (tm_doorbell || tr_doorbell);
-
-	if (timeout) {
-		dev_err(hba->dev,
-			"%s: timedout waiting for doorbell to clear (tm=0x%x, tr=0x%x)\n",
-			__func__, tm_doorbell, tr_doorbell);
-		ret = -EBUSY;
-	}
-out:
-	spin_unlock_irqrestore(hba->host->host_lock, flags);
-	ufshcd_release(hba);
-	return ret;
-}
-
  /**
   * ufshcd_scale_gear - scale up/down UFS gear
   * @hba: per adapter instance
@@ -1175,20 +1116,29 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)

  static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
  {
-	#define DOORBELL_CLR_TOUT_US		(1000 * 1000) /* 1 sec */
  	int ret = 0;
+
  	/*
-	 * make sure that there are no outstanding requests when
-	 * clock scaling is in progress
+	 * Make sure that there are no outstanding requests while clock scaling
+	 * is in progress. Since the error handler may submit TMFs, limit the
+	 * time during which to block hba->tmf_queue in order not to block the
+	 * UFS error handler.
+	 *
+	 * Since ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd() lock
+	 * the clk_scaling_lock before calling blk_get_request(), lock
+	 * clk_scaling_lock before freezing the request queues to prevent a
+	 * deadlock.
  	 */
-	ufshcd_scsi_block_requests(hba);
  	down_write(&hba->clk_scaling_lock);
-
+	blk_freeze_queue_start(hba->cmd_queue);
+	blk_freeze_queue_start(hba->tmf_queue);
  	if (!hba->clk_scaling.is_allowed ||
-	    ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) {
+	    blk_mq_freeze_queue_wait_timeout(hba->cmd_queue, HZ) <= 0 ||
+	    blk_mq_freeze_queue_wait_timeout(hba->tmf_queue, HZ / 10) <= 0) {
  		ret = -EBUSY;
+		blk_mq_unfreeze_queue(hba->tmf_queue);
+		blk_mq_unfreeze_queue(hba->cmd_queue);
  		up_write(&hba->clk_scaling_lock);
-		ufshcd_scsi_unblock_requests(hba);
  		goto out;
  	}

@@ -1201,11 +1151,12 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)

  static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
  {
+	blk_mq_unfreeze_queue(hba->tmf_queue);
+	blk_mq_unfreeze_queue(hba->cmd_queue);
  	if (writelock)
  		up_write(&hba->clk_scaling_lock);
  	else
  		up_read(&hba->clk_scaling_lock);
-	ufshcd_scsi_unblock_requests(hba);
  	ufshcd_release(hba);
  }

@@ -2698,8 +2649,11 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)

  	WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);

-	if (!down_read_trylock(&hba->clk_scaling_lock))
-		return SCSI_MLQUEUE_HOST_BUSY;
+	/*
+	 * Allows the UFS error handler to wait for prior ufshcd_queuecommand()
+	 * calls.
+	 */
+	rcu_read_lock();

  	switch (hba->ufshcd_state) {
  	case UFSHCD_STATE_OPERATIONAL:
@@ -2785,7 +2739,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)

  	ufshcd_send_command(hba, tag);
  out:
-	up_read(&hba->clk_scaling_lock);
+	rcu_read_unlock();

  	if (ufs_trigger_eh()) {
  		unsigned long flags;
@@ -5991,8 +5945,7 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
  	}
  	ufshcd_scsi_block_requests(hba);
  	/* Drain ufshcd_queuecommand() */
-	down_write(&hba->clk_scaling_lock);
-	up_write(&hba->clk_scaling_lock);
+	synchronize_rcu();
  	cancel_work_sync(&hba->eeh_work);
  }

diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index a911ad72de7a..c8c2bddb1d33 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -778,6 +778,7 @@ struct ufs_hba_monitor {
   * @clk_list_head: UFS host controller clocks list node head
   * @pwr_info: holds current power mode
   * @max_pwr_info: keeps the device max valid pwm
+ * @clk_scaling_lock: used to serialize device commands and clock scaling
   * @desc_size: descriptor sizes reported by device
   * @urgent_bkops_lvl: keeps track of urgent bkops level for device
   * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for
Adrian Hunter Nov. 3, 2021, 7:46 a.m. UTC | #6
On 02/11/2021 22:49, Bart Van Assche wrote:
> On 11/1/21 11:11 PM, Adrian Hunter wrote:
>> On 01/11/2021 20:35, Bart Van Assche wrote:
>>> On 11/1/21 2:16 AM, Adrian Hunter wrote:
>>>> Also, there is the outstanding question of synchronization for the call to
>>>> ufshcd_reset_and_restore() further down.
>>>
>>> Please clarify this comment further. I assume that you are referring to the
>>> ufshcd_reset_and_restore() call guarded by if (needs_reset)? It is not clear
>>> to me what the outstanding question is?
>>
>> What is to stop it racing with BSG, sysfs, debugfs, abort, reset etc?
> 
> The patch below should address all feedback that has been provided so far.
> Instead of removing the clock scaling lock entirely, it is only removed from
> ufshcd_queuecommand().
> 
> 
> Subject: [PATCH] scsi: ufs: Remove the clock scaling lock from the command queueing code
> 
> Remove the clock scaling lock from ufshcd_queuecommand() since it is a
> performance bottleneck. Freeze request queues from inside
> ufshcd_clock_scaling_prepare() to wait for ongoing SCSI and dev cmds.
> Insert a rcu_read_lock() / rcu_read_unlock() pair in ufshcd_queuecommand().
> Use synchronize_rcu() to wait for ongoing ufshcd_queuecommand() calls.
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  drivers/scsi/ufs/ufshcd.c | 97 ++++++++++-----------------------------
>  drivers/scsi/ufs/ufshcd.h |  1 +
>  2 files changed, 26 insertions(+), 72 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 35a1af70f705..9d964b979aa2 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -1069,65 +1069,6 @@ static bool ufshcd_is_devfreq_scaling_required(struct ufs_hba *hba,
>      return false;
>  }
> 
> -static int ufshcd_wait_for_doorbell_clr(struct ufs_hba *hba,
> -                    u64 wait_timeout_us)
> -{
> -    unsigned long flags;
> -    int ret = 0;
> -    u32 tm_doorbell;
> -    u32 tr_doorbell;
> -    bool timeout = false, do_last_check = false;
> -    ktime_t start;
> -
> -    ufshcd_hold(hba, false);
> -    spin_lock_irqsave(hba->host->host_lock, flags);
> -    /*
> -     * Wait for all the outstanding tasks/transfer requests.
> -     * Verify by checking the doorbell registers are clear.
> -     */
> -    start = ktime_get();
> -    do {
> -        if (hba->ufshcd_state != UFSHCD_STATE_OPERATIONAL) {
> -            ret = -EBUSY;
> -            goto out;
> -        }
> -
> -        tm_doorbell = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL);
> -        tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
> -        if (!tm_doorbell && !tr_doorbell) {
> -            timeout = false;
> -            break;
> -        } else if (do_last_check) {
> -            break;
> -        }
> -
> -        spin_unlock_irqrestore(hba->host->host_lock, flags);
> -        schedule();
> -        if (ktime_to_us(ktime_sub(ktime_get(), start)) >
> -            wait_timeout_us) {
> -            timeout = true;
> -            /*
> -             * We might have scheduled out for long time so make
> -             * sure to check if doorbells are cleared by this time
> -             * or not.
> -             */
> -            do_last_check = true;
> -        }
> -        spin_lock_irqsave(hba->host->host_lock, flags);
> -    } while (tm_doorbell || tr_doorbell);
> -
> -    if (timeout) {
> -        dev_err(hba->dev,
> -            "%s: timedout waiting for doorbell to clear (tm=0x%x, tr=0x%x)\n",
> -            __func__, tm_doorbell, tr_doorbell);
> -        ret = -EBUSY;
> -    }
> -out:
> -    spin_unlock_irqrestore(hba->host->host_lock, flags);
> -    ufshcd_release(hba);
> -    return ret;
> -}
> -
>  /**
>   * ufshcd_scale_gear - scale up/down UFS gear
>   * @hba: per adapter instance
> @@ -1175,20 +1116,29 @@ static int ufshcd_scale_gear(struct ufs_hba *hba, bool scale_up)
> 
>  static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
>  {
> -    #define DOORBELL_CLR_TOUT_US        (1000 * 1000) /* 1 sec */
>      int ret = 0;
> +
>      /*
> -     * make sure that there are no outstanding requests when
> -     * clock scaling is in progress
> +     * Make sure that there are no outstanding requests while clock scaling
> +     * is in progress. Since the error handler may submit TMFs, limit the
> +     * time during which to block hba->tmf_queue in order not to block the
> +     * UFS error handler.
> +     *
> +     * Since ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd() lock
> +     * the clk_scaling_lock before calling blk_get_request(), lock
> +     * clk_scaling_lock before freezing the request queues to prevent a
> +     * deadlock.
>       */
> -    ufshcd_scsi_block_requests(hba);

How are requests from LUN queues blocked?

>      down_write(&hba->clk_scaling_lock);
> -
> +    blk_freeze_queue_start(hba->cmd_queue);
> +    blk_freeze_queue_start(hba->tmf_queue);
>      if (!hba->clk_scaling.is_allowed ||
> -        ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) {

As above, couldn't there be requests from LUN queues?

> +        blk_mq_freeze_queue_wait_timeout(hba->cmd_queue, HZ) <= 0 ||
> +        blk_mq_freeze_queue_wait_timeout(hba->tmf_queue, HZ / 10) <= 0) {
>          ret = -EBUSY;
> +        blk_mq_unfreeze_queue(hba->tmf_queue);
> +        blk_mq_unfreeze_queue(hba->cmd_queue);
>          up_write(&hba->clk_scaling_lock);
> -        ufshcd_scsi_unblock_requests(hba);
>          goto out;
>      }
> 
> @@ -1201,11 +1151,12 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
> 
>  static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
>  {
> +    blk_mq_unfreeze_queue(hba->tmf_queue);
> +    blk_mq_unfreeze_queue(hba->cmd_queue);
>      if (writelock)
>          up_write(&hba->clk_scaling_lock);
>      else
>          up_read(&hba->clk_scaling_lock);
> -    ufshcd_scsi_unblock_requests(hba);
>      ufshcd_release(hba);
>  }
> 
> @@ -2698,8 +2649,11 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
> 
>      WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);
> 
> -    if (!down_read_trylock(&hba->clk_scaling_lock))
> -        return SCSI_MLQUEUE_HOST_BUSY;
> +    /*
> +     * Allows the UFS error handler to wait for prior ufshcd_queuecommand()
> +     * calls.
> +     */
> +    rcu_read_lock();
> 
>      switch (hba->ufshcd_state) {
>      case UFSHCD_STATE_OPERATIONAL:
> @@ -2785,7 +2739,7 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
> 
>      ufshcd_send_command(hba, tag);
>  out:
> -    up_read(&hba->clk_scaling_lock);
> +    rcu_read_unlock();
> 
>      if (ufs_trigger_eh()) {
>          unsigned long flags;
> @@ -5991,8 +5945,7 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>      }
>      ufshcd_scsi_block_requests(hba);
>      /* Drain ufshcd_queuecommand() */
> -    down_write(&hba->clk_scaling_lock);
> -    up_write(&hba->clk_scaling_lock);
> +    synchronize_rcu();
>      cancel_work_sync(&hba->eeh_work);
>  }
> 
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index a911ad72de7a..c8c2bddb1d33 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -778,6 +778,7 @@ struct ufs_hba_monitor {
>   * @clk_list_head: UFS host controller clocks list node head
>   * @pwr_info: holds current power mode
>   * @max_pwr_info: keeps the device max valid pwm
> + * @clk_scaling_lock: used to serialize device commands and clock scaling
>   * @desc_size: descriptor sizes reported by device
>   * @urgent_bkops_lvl: keeps track of urgent bkops level for device
>   * @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for
Asutosh Das (asd) Nov. 3, 2021, 4:29 p.m. UTC | #7
On 10/29/2021 6:37 AM, Adrian Hunter wrote:
> Use flags and memory barriers instead of the clk_scaling_lock in
> ufshcd_queuecommand().  This is done to prepare for potentially faster IOPS
> in the future.
> 
> On an x86 test system (Samsung Galaxy Book S), for random 4k reads this cut
> the time of ufshcd_queuecommand() from about 690 ns to 460 ns.  With larger
> I/O sizes, the increased duration of DMA mapping made the difference
> increasingly negligible. Random 4k read IOPS was not affected, remaining at
> about 97 kIOPS.
> 
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> ---

Hi Adrian,

Thanks for the change.
I've validated clock scaling with this on a Qualcomm SoC.

-asd

>   drivers/scsi/ufs/ufshcd.c | 110 +++++++++++++++++++++++++++++++++++---
>   drivers/scsi/ufs/ufshcd.h |   4 ++
>   2 files changed, 106 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 115ea16f5a22..36536e11db78 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -294,6 +294,82 @@ static void ufshcd_scsi_block_requests(struct ufs_hba *hba)
>   		scsi_block_requests(hba->host);
>   }
>   
> +static void ufshcd_check_issuing(struct ufs_hba *hba, unsigned long *issuing)
> +{
> +	int tag;
> +
> +	/*
> +	 * A memory barrier is needed to ensure changes to lrbp->issuing will be
> +	 * seen here.
> +	 */
> +	smp_rmb();
> +	for_each_set_bit(tag, issuing, hba->nutrs) {
> +		struct ufshcd_lrb *lrbp = &hba->lrb[tag];
> +
> +		if (!lrbp->issuing)
> +			__clear_bit(tag, issuing);
> +	}
> +}
> +
> +struct issuing {
> +	struct ufs_hba *hba;
> +	unsigned long issuing;
> +};
> +
> +static bool ufshcd_is_issuing(struct request *req, void *priv, bool reserved)
> +{
> +	int tag = req->tag;
> +	struct issuing *issuing = priv;
> +	struct ufshcd_lrb *lrbp = &issuing->hba->lrb[tag];
> +
> +	if (lrbp->issuing)
> +		__set_bit(tag, &issuing->issuing);
> +	return false;
> +}
> +
> +static void ufshcd_flush_queuecommand(struct ufs_hba *hba)
> +{
> +	struct request_queue *q = hba->cmd_queue;
> +	struct issuing issuing = { .hba = hba };
> +	int i;
> +
> +	/*
> +	 * Ensure any changes will be visible in ufshcd_queuecommand(), before
> +	 * finding requests that are currently in ufshcd_queuecommand(). Any
> +	 * requests not yet past the memory barrier in ufshcd_queuecommand()
> +	 * will therefore certainly see any changes that this task has made.
> +	 * Any requests that are past the memory barrier in
> +	 * ufshcd_queuecommand() will be found below, unless they are already
> +	 * exiting ufshcd_queuecommand().
> +	 */
> +	smp_mb();
> +
> +	/*
> +	 * Requests coming through ufshcd_queuecommand() will always have been
> +	 * "started", so blk_mq_tagset_busy_iter() will find them.
> +	 */
> +	blk_mq_tagset_busy_iter(q->tag_set, ufshcd_is_issuing, &issuing);
> +
> +	/*
> +	 * Spin briefly, to wait for tasks currently executing
> +	 * ufshcd_queuecommand(), assuming ufshcd_queuecommand() typically takes
> +	 * less than 10 microseconds.
> +	 */
> +	for (i = 0; i < 10 && issuing.issuing; i++) {
> +		udelay(1);
> +		ufshcd_check_issuing(hba, &issuing.issuing);
> +	}
> +
> +	/*
> +	 * Anything still in ufshcd_queuecommand() presumably got preempted, so
> +	 * sleep while polling.
> +	 */
> +	while (issuing.issuing) {
> +		usleep_range(100, 500);
> +		ufshcd_check_issuing(hba, &issuing.issuing);
> +	}
> +}
> +
>   static void ufshcd_add_cmd_upiu_trace(struct ufs_hba *hba, unsigned int tag,
>   				      enum ufs_trace_str_t str_t)
>   {
> @@ -1195,6 +1271,8 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
>   	/* let's not get into low power until clock scaling is completed */
>   	ufshcd_hold(hba, false);
>   
> +	hba->clk_scaling.in_progress = true;
> +	ufshcd_flush_queuecommand(hba);
>   out:
>   	return ret;
>   }
> @@ -1205,6 +1283,9 @@ static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
>   		up_write(&hba->clk_scaling_lock);
>   	else
>   		up_read(&hba->clk_scaling_lock);
> +	/* Ensure writes are done before setting in_progress to false */
> +	smp_wmb();
> +	hba->clk_scaling.in_progress = false;
>   	ufshcd_scsi_unblock_requests(hba);
>   	ufshcd_release(hba);
>   }
> @@ -2111,7 +2192,11 @@ void ufshcd_send_command(struct ufs_hba *hba, unsigned int task_tag)
>   	ufshcd_writel(hba, 1 << task_tag, REG_UTP_TRANSFER_REQ_DOOR_BELL);
>   	spin_unlock_irqrestore(&hba->outstanding_lock, flags);
>   
> -	/* Make sure that doorbell is committed immediately */
> +	/*
> +	 * Make sure that doorbell is committed immediately.
> +	 * Also provides synchronization for ufshcd_queuecommand() wrt
> +	 * ufshcd_flush_queuecommand().
> +	 */
>   	wmb();
>   }
>   
> @@ -2695,8 +2780,15 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>   
>   	WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);
>   
> -	if (!down_read_trylock(&hba->clk_scaling_lock))
> -		return SCSI_MLQUEUE_HOST_BUSY;
> +	lrbp = &hba->lrb[tag];
> +	lrbp->issuing = true;
> +	/* Synchronize with ufshcd_flush_queuecommand() */
> +	smp_mb();
> +
> +	if (unlikely(hba->clk_scaling.in_progress)) {
> +		err = SCSI_MLQUEUE_HOST_BUSY;
> +		goto out;
> +	}
>   
>   	switch (hba->ufshcd_state) {
>   	case UFSHCD_STATE_OPERATIONAL:
> @@ -2751,7 +2843,6 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>   	WARN_ON(ufshcd_is_clkgating_allowed(hba) &&
>   		(hba->clk_gating.state != CLKS_ON));
>   
> -	lrbp = &hba->lrb[tag];
>   	WARN_ON(lrbp->cmd);
>   	lrbp->cmd = cmd;
>   	lrbp->sense_bufflen = UFS_SENSE_SIZE;
> @@ -2782,7 +2873,12 @@ static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
>   
>   	ufshcd_send_command(hba, tag);
>   out:
> -	up_read(&hba->clk_scaling_lock);
> +	/*
> +	 * Leverages wmb() in ufshcd_send_command() for synchronization with
> +	 * ufshcd_flush_queuecommand(), otherwise if no command was sent, then
> +	 * there is no need to synchronize anyway.
> +	 */
> +	lrbp->issuing = false;
>   
>   	if (ufs_trigger_eh()) {
>   		unsigned long flags;
> @@ -5987,9 +6083,7 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
>   		ufshcd_clk_scaling_allow(hba, false);
>   	}
>   	ufshcd_scsi_block_requests(hba);
> -	/* Drain ufshcd_queuecommand() */
> -	down_write(&hba->clk_scaling_lock);
> -	up_write(&hba->clk_scaling_lock);
> +	ufshcd_flush_queuecommand(hba);
>   	cancel_work_sync(&hba->eeh_work);
>   }
>   
> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> index b9492f300bd1..46f385133aaa 100644
> --- a/drivers/scsi/ufs/ufshcd.h
> +++ b/drivers/scsi/ufs/ufshcd.h
> @@ -188,6 +188,7 @@ struct ufs_pm_lvl_states {
>    * @task_tag: Task tag of the command
>    * @lun: LUN of the command
>    * @intr_cmd: Interrupt command (doesn't participate in interrupt aggregation)
> + * @issuing: Is in ufshcd_queuecommand()
>    * @issue_time_stamp: time stamp for debug purposes
>    * @compl_time_stamp: time stamp for statistics
>    * @crypto_key_slot: the key slot to use for inline crypto (-1 if none)
> @@ -214,6 +215,7 @@ struct ufshcd_lrb {
>   	int task_tag;
>   	u8 lun; /* UPIU LUN id field is only 8-bit wide */
>   	bool intr_cmd;
> +	bool issuing;
>   	ktime_t issue_time_stamp;
>   	ktime_t compl_time_stamp;
>   #ifdef CONFIG_SCSI_UFS_CRYPTO
> @@ -425,6 +427,7 @@ struct ufs_saved_pwr_info {
>    * @is_initialized: Indicates whether clock scaling is initialized or not
>    * @is_busy_started: tracks if busy period has started or not
>    * @is_suspended: tracks if devfreq is suspended or not
> + * @in_progress: clock_scaling is in progress
>    */
>   struct ufs_clk_scaling {
>   	int active_reqs;
> @@ -442,6 +445,7 @@ struct ufs_clk_scaling {
>   	bool is_initialized;
>   	bool is_busy_started;
>   	bool is_suspended;
> +	bool in_progress;
>   };
>   
>   #define UFS_EVENT_HIST_LENGTH 8
>
Bart Van Assche Nov. 3, 2021, 5:06 p.m. UTC | #8
On 11/3/21 12:46 AM, Adrian Hunter wrote:
> On 02/11/2021 22:49, Bart Van Assche wrote:
>>   static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
>>   {
>> -    #define DOORBELL_CLR_TOUT_US        (1000 * 1000) /* 1 sec */
>>       int ret = 0;
>> +
>>       /*
>> -     * make sure that there are no outstanding requests when
>> -     * clock scaling is in progress
>> +     * Make sure that there are no outstanding requests while clock scaling
>> +     * is in progress. Since the error handler may submit TMFs, limit the
>> +     * time during which to block hba->tmf_queue in order not to block the
>> +     * UFS error handler.
>> +     *
>> +     * Since ufshcd_exec_dev_cmd() and ufshcd_issue_devman_upiu_cmd() lock
>> +     * the clk_scaling_lock before calling blk_get_request(), lock
>> +     * clk_scaling_lock before freezing the request queues to prevent a
>> +     * deadlock.
>>        */
>> -    ufshcd_scsi_block_requests(hba);
> 
> How are requests from LUN queues blocked?

I will add blk_freeze_queue() calls for the LUNs.

Thanks,

Bart.
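
For illustration only: the follow-up series is not part of this thread, but a
minimal sketch of per-LUN freezing could look like the snippet below. The
helper names (ufshcd_freeze_lun_queues()/ufshcd_unfreeze_lun_queues()) are
hypothetical; only the SCSI and block layer calls are existing APIs.

/*
 * Hypothetical sketch, not taken from the posted patches: freeze the
 * request queue of every LUN attached to the UFS host before clock
 * scaling and unfreeze the queues afterwards.
 */
static void ufshcd_freeze_lun_queues(struct ufs_hba *hba)
{
	struct scsi_device *sdev;

	shost_for_each_device(sdev, hba->host)
		blk_mq_freeze_queue(sdev->request_queue);
}

static void ufshcd_unfreeze_lun_queues(struct ufs_hba *hba)
{
	struct scsi_device *sdev;

	shost_for_each_device(sdev, hba->host)
		blk_mq_unfreeze_queue(sdev->request_queue);
}

As the rest of the thread shows, freezing interacts with requests that are
still pending in the queues (blk_freeze_queue_start() runs the hardware
queues), which is why the approach is later revised to quiescing.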
Asutosh Das (asd) Nov. 4, 2021, 2:23 p.m. UTC | #9
On 11/3/2021 10:06 AM, Bart Van Assche wrote:
> On 11/3/21 12:46 AM, Adrian Hunter wrote:
>> On 02/11/2021 22:49, Bart Van Assche wrote:
>>>   static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
>>>   {
>>> -    #define DOORBELL_CLR_TOUT_US        (1000 * 1000) /* 1 sec */
>>>       int ret = 0;
>>> +
>>>       /*
>>> -     * make sure that there are no outstanding requests when
>>> -     * clock scaling is in progress
>>> +     * Make sure that there are no outstanding requests while clock 
>>> scaling
>>> +     * is in progress. Since the error handler may submit TMFs, 
>>> limit the
>>> +     * time during which to block hba->tmf_queue in order not to 
>>> block the
>>> +     * UFS error handler.
>>> +     *
>>> +     * Since ufshcd_exec_dev_cmd() and 
>>> ufshcd_issue_devman_upiu_cmd() lock
>>> +     * the clk_scaling_lock before calling blk_get_request(), lock
>>> +     * clk_scaling_lock before freezing the request queues to prevent a
>>> +     * deadlock.
>>>        */
>>> -    ufshcd_scsi_block_requests(hba);
>>
>> How are requests from LUN queues blocked?
> 
> I will add blk_freeze_queue() calls for the LUNs.
> 
> Thanks,
> 
> Bart.
Hi Bart,

In the current clock scaling code, the expectation is to scale up as 
soon as possible.

For example, say the current gear is G1, there are pending requests in
the queue, the doorbell register (DBR) is empty, and there is a decision
to scale up. During scale-up, if the queues are frozen, wouldn't those
requests be issued to the driver and executed in G1 instead of G4?
I think this would lead to higher run-to-run variance in performance.

What do you think?

Thanks,
-asd
Bart Van Assche Nov. 4, 2021, 5:10 p.m. UTC | #10
On 11/4/21 7:23 AM, Asutosh Das (asd) wrote:
> In the current clock scaling code, the expectation is to scale up as 
> soon as possible.
> 
> For example, say the current gear is G1, there are pending requests in
> the queue, the doorbell register (DBR) is empty, and there is a decision
> to scale up. During scale-up, if the queues are frozen, wouldn't those
> requests be issued to the driver and executed in G1 instead of G4?
> I think this would lead to higher run-to-run variance in performance.

Hi Asutosh,

My understanding of the current clock scaling implementation is as 
follows (please correct me if I got anything wrong):
* ufshcd_clock_scaling_prepare() is called before clock scaling happens
   and ufshcd_clock_scaling_unprepare() is called after clock scaling has
   finished.
* ufshcd_clock_scaling_prepare() calls ufshcd_scsi_block_requests() and
   ufshcd_wait_for_doorbell_clr().
* ufshcd_wait_for_doorbell_clr() waits until both the UTP Transfer
   Request List Doorbell Register and UTP Task Management Request List
   DoorBell Register are zero. Hence, it waits until all pending SCSI
   commands, task management commands and device commands have finished.

As far as I can see, from a conceptual viewpoint there is no difference
between calling ufshcd_wait_for_doorbell_clr() and freezing the request
queues. There is an implementation difference, however:
blk_mq_freeze_queue() waits for an RCU grace period. This can introduce
an additional delay of several milliseconds compared to
ufshcd_wait_for_doorbell_clr(). If this is a concern, I can look into
expediting the RCU grace period during clock scaling.

Thanks,

Bart.
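
For reference, the flow summarized in the list above corresponds roughly to
the sketch below. This is a simplified paraphrase of the existing
ufshcd_clock_scaling_prepare(), not a verbatim copy; error handling details
are omitted.

static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
{
#define DOORBELL_CLR_TOUT_US	(1000 * 1000)	/* 1 sec */

	/* Stop new ufshcd_queuecommand() calls from entering the driver. */
	ufshcd_scsi_block_requests(hba);
	/* Serialize against device commands and the issue path. */
	down_write(&hba->clk_scaling_lock);

	/* Wait until all outstanding transfer and TM requests complete. */
	if (ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) {
		up_write(&hba->clk_scaling_lock);
		ufshcd_scsi_unblock_requests(hba);
		return -EBUSY;
	}

	/* Keep the clocks on until clock scaling has completed. */
	ufshcd_hold(hba, false);
	return 0;
}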
Asutosh Das (asd) Nov. 8, 2021, 6:06 p.m. UTC | #11
On 11/4/2021 10:10 AM, Bart Van Assche wrote:
> On 11/4/21 7:23 AM, Asutosh Das (asd) wrote:
>> In the current clock scaling code, the expectation is to scale up as 
>> soon as possible.
>>
>> For example, say the current gear is G1, there are pending requests in
>> the queue, the doorbell register (DBR) is empty, and there is a decision
>> to scale up. During scale-up, if the queues are frozen, wouldn't those
>> requests be issued to the driver and executed in G1 instead of G4?
>> I think this would lead to higher run-to-run variance in performance.
> 
> Hi Asutosh,
> 
> My understanding of the current clock scaling implementation is as 
> follows (please correct me if I got anything wrong):
> * ufshcd_clock_scaling_prepare() is called before clock scaling happens
>    and ufshcd_clock_scaling_unprepare() is called after clock scaling has
>    finished.
> * ufshcd_clock_scaling_prepare() calls ufshcd_scsi_block_requests() and
>    ufshcd_wait_for_doorbell_clr().
> * ufshcd_wait_for_doorbell_clr() waits until both the UTP Transfer
>    Request List Doorbell Register and UTP Task Management Request List
>    DoorBell Register are zero. Hence, it waits until all pending SCSI
>    commands, task management commands and device commands have finished.
> 
> As far as I can see, from a conceptual viewpoint there is no difference
> between calling ufshcd_wait_for_doorbell_clr() and freezing the request
> queues. There is an implementation difference, however:
> blk_mq_freeze_queue() waits for an RCU grace period. This can introduce
> an additional delay of several milliseconds compared to
> ufshcd_wait_for_doorbell_clr(). If this is a concern, I can look into
> expediting the RCU grace period during clock scaling.
> 
> Thanks,
> 
> Bart.
Hi Bart,
The current scaling code invokes scsi_block_requests(), which blocks
incoming requests right away in the prepare step. So any un-issued
requests wouldn't be issued.

I'm not sure if this exact behavior would manifest if
blk_freeze_queue_start() is used and scsi_block_requests() is removed.
I see that it invokes blk_mq_run_hw_queues():
void blk_freeze_queue_start(struct request_queue *q)
{
         mutex_lock(&q->mq_freeze_lock);
         if (++q->mq_freeze_depth == 1) {
                 percpu_ref_kill(&q->q_usage_counter);
                 mutex_unlock(&q->mq_freeze_lock);
                 if (queue_is_mq(q))
                         blk_mq_run_hw_queues(q, false);
         } else {
                 mutex_unlock(&q->mq_freeze_lock);
         }
}

Would blk_mq_run_hw_queues() issue any pending requests in the queue? If 
so, then these requests would be completed in the unchanged power-mode 
which is not what we want.

-asd
Bart Van Assche Nov. 8, 2021, 6:24 p.m. UTC | #12
On 11/8/21 10:06 AM, Asutosh Das (asd) wrote:
> The current scaling code invokes scsi_block_requests(), which blocks
> incoming requests right away in the prepare step. So any un-issued
> requests wouldn't be issued.
>
> I'm not sure if this exact behavior would manifest if
> blk_freeze_queue_start() is used and scsi_block_requests() is removed.

That behavior would be preserved. The percpu_ref_tryget_live() call in 
blk_queue_enter() fails after blk_freeze_queue_start() has called 
percpu_ref_kill().

> Would blk_mq_run_hw_queues() issue any pending requests in the queue?

Yes.

> If so, then these requests would be completed in the unchanged power-mode 
> which is not what we want.

I will change the blk_freeze_queue_start() calls into 
blk_mq_quiesce_queue_nowait() calls.

Thanks,

Bart.
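
A sketch of that revised approach might look as follows. The helper names
are hypothetical, and the sketch assumes the LUN queues are not
BLK_MQ_F_BLOCKING, so a single RCU grace period covers all of them.

/*
 * Hypothetical sketch: stop dispatching to every LUN queue without
 * waiting per queue, then wait once so that in-flight ->queuecommand()
 * calls have returned. Requests already issued to the device are still
 * covered by the doorbell-clear wait.
 */
static void ufshcd_quiesce_lun_queues(struct ufs_hba *hba)
{
	struct scsi_device *sdev;

	shost_for_each_device(sdev, hba->host)
		blk_mq_quiesce_queue_nowait(sdev->request_queue);
	synchronize_rcu();
}

static void ufshcd_unquiesce_lun_queues(struct ufs_hba *hba)
{
	struct scsi_device *sdev;

	shost_for_each_device(sdev, hba->host)
		blk_mq_unquiesce_queue(sdev->request_queue);
}

Unlike freezing, quiescing does not run the hardware queues, so requests
that are already queued stay queued and are only dispatched after the
queues are unquiesced at the new gear.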
diff mbox series

Patch

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 115ea16f5a22..36536e11db78 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -294,6 +294,82 @@  static void ufshcd_scsi_block_requests(struct ufs_hba *hba)
 		scsi_block_requests(hba->host);
 }
 
+static void ufshcd_check_issuing(struct ufs_hba *hba, unsigned long *issuing)
+{
+	int tag;
+
+	/*
+	 * A memory barrier is needed to ensure changes to lrbp->issuing will be
+	 * seen here.
+	 */
+	smp_rmb();
+	for_each_set_bit(tag, issuing, hba->nutrs) {
+		struct ufshcd_lrb *lrbp = &hba->lrb[tag];
+
+		if (!lrbp->issuing)
+			__clear_bit(tag, issuing);
+	}
+}
+
+struct issuing {
+	struct ufs_hba *hba;
+	unsigned long issuing;
+};
+
+static bool ufshcd_is_issuing(struct request *req, void *priv, bool reserved)
+{
+	int tag = req->tag;
+	struct issuing *issuing = priv;
+	struct ufshcd_lrb *lrbp = &issuing->hba->lrb[tag];
+
+	if (lrbp->issuing)
+		__set_bit(tag, &issuing->issuing);
+	return false;
+}
+
+static void ufshcd_flush_queuecommand(struct ufs_hba *hba)
+{
+	struct request_queue *q = hba->cmd_queue;
+	struct issuing issuing = { .hba = hba };
+	int i;
+
+	/*
+	 * Ensure any changes will be visible in ufshcd_queuecommand(), before
+	 * finding requests that are currently in ufshcd_queuecommand(). Any
+	 * requests not yet past the memory barrier in ufshcd_queuecommand()
+	 * will therefore certainly see any changes that this task has made.
+	 * Any requests that are past the memory barrier in
+	 * ufshcd_queuecommand() will be found below, unless they are already
+	 * exiting ufshcd_queuecommand().
+	 */
+	smp_mb();
+
+	/*
+	 * Requests coming through ufshcd_queuecommand() will always have been
+	 * "started", so blk_mq_tagset_busy_iter() will find them.
+	 */
+	blk_mq_tagset_busy_iter(q->tag_set, ufshcd_is_issuing, &issuing);
+
+	/*
+	 * Spin briefly, to wait for tasks currently executing
+	 * ufshcd_queuecommand(), assuming ufshcd_queuecommand() typically takes
+	 * less than 10 microseconds.
+	 */
+	for (i = 0; i < 10 && issuing.issuing; i++) {
+		udelay(1);
+		ufshcd_check_issuing(hba, &issuing.issuing);
+	}
+
+	/*
+	 * Anything still in ufshcd_queuecommand() presumably got preempted, so
+	 * sleep while polling.
+	 */
+	while (issuing.issuing) {
+		usleep_range(100, 500);
+		ufshcd_check_issuing(hba, &issuing.issuing);
+	}
+}
+
 static void ufshcd_add_cmd_upiu_trace(struct ufs_hba *hba, unsigned int tag,
 				      enum ufs_trace_str_t str_t)
 {
@@ -1195,6 +1271,8 @@  static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
 	/* let's not get into low power until clock scaling is completed */
 	ufshcd_hold(hba, false);
 
+	hba->clk_scaling.in_progress = true;
+	ufshcd_flush_queuecommand(hba);
 out:
 	return ret;
 }
@@ -1205,6 +1283,9 @@  static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
 		up_write(&hba->clk_scaling_lock);
 	else
 		up_read(&hba->clk_scaling_lock);
+	/* Ensure writes are done before setting in_progress to false */
+	smp_wmb();
+	hba->clk_scaling.in_progress = false;
 	ufshcd_scsi_unblock_requests(hba);
 	ufshcd_release(hba);
 }
@@ -2111,7 +2192,11 @@  void ufshcd_send_command(struct ufs_hba *hba, unsigned int task_tag)
 	ufshcd_writel(hba, 1 << task_tag, REG_UTP_TRANSFER_REQ_DOOR_BELL);
 	spin_unlock_irqrestore(&hba->outstanding_lock, flags);
 
-	/* Make sure that doorbell is committed immediately */
+	/*
+	 * Make sure that doorbell is committed immediately.
+	 * Also provides synchronization for ufshcd_queuecommand() wrt
+	 * ufshcd_flush_queuecommand().
+	 */
 	wmb();
 }
 
@@ -2695,8 +2780,15 @@  static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 
 	WARN_ONCE(tag < 0, "Invalid tag %d\n", tag);
 
-	if (!down_read_trylock(&hba->clk_scaling_lock))
-		return SCSI_MLQUEUE_HOST_BUSY;
+	lrbp = &hba->lrb[tag];
+	lrbp->issuing = true;
+	/* Synchronize with ufshcd_flush_queuecommand() */
+	smp_mb();
+
+	if (unlikely(hba->clk_scaling.in_progress)) {
+		err = SCSI_MLQUEUE_HOST_BUSY;
+		goto out;
+	}
 
 	switch (hba->ufshcd_state) {
 	case UFSHCD_STATE_OPERATIONAL:
@@ -2751,7 +2843,6 @@  static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 	WARN_ON(ufshcd_is_clkgating_allowed(hba) &&
 		(hba->clk_gating.state != CLKS_ON));
 
-	lrbp = &hba->lrb[tag];
 	WARN_ON(lrbp->cmd);
 	lrbp->cmd = cmd;
 	lrbp->sense_bufflen = UFS_SENSE_SIZE;
@@ -2782,7 +2873,12 @@  static int ufshcd_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 
 	ufshcd_send_command(hba, tag);
 out:
-	up_read(&hba->clk_scaling_lock);
+	/*
+	 * Leverages wmb() in ufshcd_send_command() for synchronization with
+	 * ufshcd_flush_queuecommand(), otherwise if no command was sent, then
+	 * there is no need to synchronize anyway.
+	 */
+	lrbp->issuing = false;
 
 	if (ufs_trigger_eh()) {
 		unsigned long flags;
@@ -5987,9 +6083,7 @@  static void ufshcd_err_handling_prepare(struct ufs_hba *hba)
 		ufshcd_clk_scaling_allow(hba, false);
 	}
 	ufshcd_scsi_block_requests(hba);
-	/* Drain ufshcd_queuecommand() */
-	down_write(&hba->clk_scaling_lock);
-	up_write(&hba->clk_scaling_lock);
+	ufshcd_flush_queuecommand(hba);
 	cancel_work_sync(&hba->eeh_work);
 }
 
diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index b9492f300bd1..46f385133aaa 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h
@@ -188,6 +188,7 @@  struct ufs_pm_lvl_states {
  * @task_tag: Task tag of the command
  * @lun: LUN of the command
  * @intr_cmd: Interrupt command (doesn't participate in interrupt aggregation)
+ * @issuing: Is in ufshcd_queuecommand()
  * @issue_time_stamp: time stamp for debug purposes
  * @compl_time_stamp: time stamp for statistics
  * @crypto_key_slot: the key slot to use for inline crypto (-1 if none)
@@ -214,6 +215,7 @@  struct ufshcd_lrb {
 	int task_tag;
 	u8 lun; /* UPIU LUN id field is only 8-bit wide */
 	bool intr_cmd;
+	bool issuing;
 	ktime_t issue_time_stamp;
 	ktime_t compl_time_stamp;
 #ifdef CONFIG_SCSI_UFS_CRYPTO
@@ -425,6 +427,7 @@  struct ufs_saved_pwr_info {
  * @is_initialized: Indicates whether clock scaling is initialized or not
  * @is_busy_started: tracks if busy period has started or not
  * @is_suspended: tracks if devfreq is suspended or not
+ * @in_progress: clock_scaling is in progress
  */
 struct ufs_clk_scaling {
 	int active_reqs;
@@ -442,6 +445,7 @@  struct ufs_clk_scaling {
 	bool is_initialized;
 	bool is_busy_started;
 	bool is_suspended;
+	bool in_progress;
 };
 
 #define UFS_EVENT_HIST_LENGTH 8