@@ -245,6 +245,8 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
if (sdev->request_queue)
blk_set_queue_depth(sdev->request_queue, depth);
+ sbitmap_resize(&sdev->budget_map, sdev->queue_depth);
+
return sdev->queue_depth;
}
EXPORT_SYMBOL(scsi_change_queue_depth);
@@ -354,7 +354,7 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
if (starget->can_queue > 0)
atomic_dec(&starget->target_busy);
- atomic_dec(&sdev->device_busy);
+ sbitmap_put(&sdev->budget_map, cmd->budget_token);
}
static void scsi_kick_queue(struct request_queue *q)
@@ -1274,19 +1274,17 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
}
/*
- * scsi_dev_queue_ready: if we can send requests to sdev, return 1 else
- * return 0.
- *
- * Called with the queue_lock held.
+ * scsi_dev_queue_ready: if we can send requests to sdev, assign one token
+ * and return the token else return -1.
*/
static inline int scsi_dev_queue_ready(struct request_queue *q,
struct scsi_device *sdev)
{
- unsigned int busy;
+ int token;
- busy = atomic_inc_return(&sdev->device_busy) - 1;
+ token = sbitmap_get(&sdev->budget_map);
if (atomic_read(&sdev->device_blocked)) {
- if (busy)
+ if (token >= 0)
goto out_dec;
/*
@@ -1298,13 +1296,11 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
"unblocking device at zero depth\n"));
}
- if (busy >= sdev->queue_depth)
- goto out_dec;
-
- return 1;
+ return token;
out_dec:
- atomic_dec(&sdev->device_busy);
- return 0;
+ if (token >= 0)
+ sbitmap_put(&sdev->budget_map, token);
+ return -1;
}
/*
@@ -1624,16 +1620,17 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx, int budget_token)
struct request_queue *q = hctx->queue;
struct scsi_device *sdev = q->queuedata;
- atomic_dec(&sdev->device_busy);
+ sbitmap_put(&sdev->budget_map, budget_token);
}
static int scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct scsi_device *sdev = q->queuedata;
+ int token = scsi_dev_queue_ready(q, sdev);
- if (scsi_dev_queue_ready(q, sdev))
- return 0;
+ if (token >= 0)
+ return token;
if (scsi_device_busy(sdev) == 0 && !scsi_device_blocked(sdev))
blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
@@ -1677,6 +1674,8 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(req);
}
+ cmd->budget_token = bd->budget_token;
+
cmd->flags &= SCMD_PRESERVED_FLAGS;
if (sdev->simple_tags)
cmd->flags |= SCMD_TAGGED;
@@ -1701,12 +1700,12 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
if (scsi_target(sdev)->can_queue > 0)
atomic_dec(&scsi_target(sdev)->target_busy);
out_put_budget:
- scsi_mq_put_budget(hctx, 0);
+ scsi_mq_put_budget(hctx, bd->budget_token);
switch (ret) {
case BLK_STS_OK:
break;
case BLK_STS_RESOURCE:
- if (atomic_read(&sdev->device_busy) ||
+ if (sbitmap_any_bit_set(&sdev->budget_map) ||
scsi_device_blocked(sdev))
ret = BLK_STS_DEV_RESOURCE;
break;
@@ -5,6 +5,7 @@
#include <linux/device.h>
#include <linux/async.h>
#include <scsi/scsi_device.h>
+#include <linux/sbitmap.h>
struct request_queue;
struct request;
@@ -215,6 +215,7 @@ static void scsi_unlock_floptical(struct scsi_device *sdev,
static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
u64 lun, void *hostdata)
{
+ unsigned int depth;
struct scsi_device *sdev;
int display_failure_msg = 1, ret;
struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
@@ -277,8 +278,23 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
WARN_ON_ONCE(!blk_get_queue(sdev->request_queue));
sdev->request_queue->queuedata = sdev;
- scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun ?
- sdev->host->cmd_per_lun : 1);
+ depth = sdev->host->cmd_per_lun ?: 1;
+
+ /*
+ * Use .can_queue as budget map's depth because we have to
+ * support adjusting queue depth from sysfs. Meantime use
+ * default device queue depth to figure out sbitmap shift
+ * since we use this queue depth most of times.
+ */
+ if (sbitmap_init_node(&sdev->budget_map, sdev->host->can_queue,
+ sbitmap_calculate_shift(depth),
+ GFP_KERNEL, sdev->request_queue->node,
+ false, true)) {
+ put_device(&starget->dev);
+ kfree(sdev);
+ }
+
+ scsi_change_queue_depth(sdev, depth);
scsi_sysfs_device_initialize(sdev);
@@ -980,6 +996,7 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
scsi_attach_vpd(sdev);
sdev->max_queue_depth = sdev->queue_depth;
+ WARN_ON_ONCE(sdev->max_queue_depth > sdev->budget_map.depth);
sdev->sdev_bflags = *bflags;
/*
@@ -466,6 +466,8 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work)
/* NULL queue means the device can't be used */
sdev->request_queue = NULL;
+ sbitmap_free(&sdev->budget_map);
+
mutex_lock(&sdev->inquiry_mutex);
vpd_pg0 = rcu_replace_pointer(sdev->vpd_pg0, vpd_pg0,
lockdep_is_held(&sdev->inquiry_mutex));
@@ -76,6 +76,8 @@ struct scsi_cmnd {
int eh_eflags; /* Used by error handlr */
+ int budget_token;
+
/*
* This is set to jiffies as it was when the command was first
* allocated. It is used to time how long the command has
@@ -8,6 +8,7 @@
#include <linux/blkdev.h>
#include <scsi/scsi.h>
#include <linux/atomic.h>
+#include <linux/sbitmap.h>
struct device;
struct request_queue;
@@ -106,7 +107,7 @@ struct scsi_device {
struct list_head siblings; /* list of all devices on this host */
struct list_head same_target_siblings; /* just the devices sharing same target id */
- atomic_t device_busy; /* commands actually active on LLDD */
+ struct sbitmap budget_map;
atomic_t device_blocked; /* Device returned QUEUE_FULL. */
spinlock_t list_lock;
@@ -586,7 +587,7 @@ static inline int scsi_device_supports_vpd(struct scsi_device *sdev)
static inline int scsi_device_busy(struct scsi_device *sdev)
{
- return atomic_read(&sdev->device_busy);
+ return sbitmap_weight(&sdev->budget_map);
}
#define MODULE_ALIAS_SCSI_DEVICE(type) \
scsi uses one global atomic variable to track queue depth for each LUN/request queue. This way doesn't scale well when there is lots of CPU cores and the disk is very fast. It has been observed that IOPS is affected a lot by tracking queue depth via sdev->device_busy in IO path. So replace the atomic variable sdev->device_busy with sbitmap for tracking scsi device queue depth. Test shows that IOPS difference is just ~1% compared with bypassing device queue depth on scsi_debug by applying patches [1]. Meantime IOPS is improved > 20% compared with linus tree. Follows test steps: 1) test machine(32 logical CPU cores) Thread(s) per core: 2 Core(s) per socket: 8 Socket(s): 2 NUMA node(s): 2 Model name: Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz 2) setup scsi_debug: modprobe scsi_debug virtual_gb=128 max_luns=1 submit_queues=32 delay=0 max_queue=256 3) fio script: fio --rw=randread --size=128G --direct=1 --ioengine=libaio --iodepth=2048 \ --numjobs=32 --bs=4k --group_reporting=1 --group_reporting=1 --runtime=60 \ --loops=10000 --name=job1 --filename=/dev/sdN [1] https://lore.kernel.org/linux-block/20200119071432.18558-6-ming.lei@redhat.com/ Cc: Omar Sandoval <osandov@fb.com> Cc: Sathya Prakash <sathya.prakash@broadcom.com> Cc: Chaitra P B <chaitra.basappa@broadcom.com> Cc: Suganath Prabu Subramani <suganath-prabu.subramani@broadcom.com> Cc: Kashyap Desai <kashyap.desai@broadcom.com> Cc: Sumit Saxena <sumit.saxena@broadcom.com> Cc: Shivasharan S <shivasharan.srikanteshwara@broadcom.com> Cc: Ewan D. Milne <emilne@redhat.com> Cc: Hannes Reinecke <hare@suse.de> Cc: Bart Van Assche <bart.vanassche@wdc.com> Signed-off-by: Ming Lei <ming.lei@redhat.com> --- drivers/scsi/scsi.c | 2 ++ drivers/scsi/scsi_lib.c | 37 ++++++++++++++++++------------------- drivers/scsi/scsi_priv.h | 1 + drivers/scsi/scsi_scan.c | 21 +++++++++++++++++++-- drivers/scsi/scsi_sysfs.c | 2 ++ include/scsi/scsi_cmnd.h | 2 ++ include/scsi/scsi_device.h | 5 +++-- 7 files changed, 47 insertions(+), 23 deletions(-)