@@ -305,7 +305,12 @@ static void target_queue_cmd_work(struct se_sess_cmd_queue *q,
static void target_queue_cmd_compl(struct se_cmd *se_cmd)
{
struct se_session *se_sess = se_cmd->se_sess;
- int cpu = se_cmd->cpuid;
+ int cpu;
+
+ if (se_cmd->se_cmd_flags & SCF_IGNORE_CPUID_COMPL)
+ cpu = smp_processor_id();
+ else
+ cpu = se_cmd->cpuid;
target_queue_cmd_work(&se_sess->cq[cpu], se_cmd, cpu,
target_completion_wq);
@@ -1758,6 +1763,9 @@ int target_submit_cmd_map_sgls(struct se_cmd *se_cmd, struct se_session *se_sess
BUG_ON(!se_tpg);
BUG_ON(se_cmd->se_tfo || se_cmd->se_sess);
+ if (flags & TARGET_SCF_IGNORE_CPUID_COMPL)
+ se_cmd->se_cmd_flags |= SCF_IGNORE_CPUID_COMPL;
+
if (flags & TARGET_SCF_USE_CPUID)
se_cmd->se_cmd_flags |= SCF_USE_CPUID;
/*
@@ -804,7 +804,8 @@ static void vhost_scsi_submit_queued_cmd(struct se_cmd *se_cmd)
cmd->tvc_cdb, &cmd->tvc_sense_buf[0],
cmd->tvc_lun, cmd->tvc_exp_data_len,
vhost_scsi_to_tcm_attr(cmd->tvc_task_attr),
- cmd->tvc_data_direction, TARGET_SCF_ACK_KREF,
+ cmd->tvc_data_direction,
+ TARGET_SCF_ACK_KREF | TARGET_SCF_IGNORE_CPUID_COMPL,
sg_ptr, cmd->tvc_sgl_count, NULL, 0, sg_prot_ptr,
cmd->tvc_prot_sgl_count);
if (rc < 0) {
@@ -147,6 +147,7 @@ enum se_cmd_flags_table {
SCF_TASK_ATTR_SET = (1 << 17),
SCF_TREAT_READ_AS_NORMAL = (1 << 18),
SCF_BATCHED = (1 << 19),
+ SCF_IGNORE_CPUID_COMPL = (1 << 20),
};
/*
@@ -197,6 +198,7 @@ enum target_sc_flags_table {
TARGET_SCF_ACK_KREF = 0x02,
TARGET_SCF_UNKNOWN_SIZE = 0x04,
TARGET_SCF_USE_CPUID = 0x08,
+ TARGET_SCF_IGNORE_CPUID_COMPL = 0x10,
};
/* fabric independent task management function values */
LIO wants to complete a cmd on the CPU it was submitted on, because most drivers have per-CPU or hw queue handlers. But for vhost-scsi, which has a single thread for submissions and completions, this is not always the best thing to do, since the thread could be running on a different CPU now, and it conflicts with what the user has set up in the lower levels with settings like the block layer rq_affinity or, for network block devices, what the user has set up on their NIC. This patch has vhost-scsi tell LIO to complete the cmd on the CPU the layer below LIO has completed the cmd on. We then stop fighting the block, net and whatever layer/setting is below us. With this patch and the previous ones I see an increase in IOPS of about 50% (234K -> 350K) for random 4K workloads like: fio --filename=/dev/sda --direct=1 --rw=randrw --bs=4k --ioengine=libaio --iodepth=128 --numjobs=8 --time_based --group_reporting --runtime=60 Signed-off-by: Mike Christie <michael.christie@oracle.com> --- drivers/target/target_core_transport.c | 10 +++++++++- drivers/vhost/scsi.c | 3 ++- include/target/target_core_base.h | 2 ++ 3 files changed, 13 insertions(+), 2 deletions(-)