| Message ID | 20191202153914.84722-10-hare@suse.de (mailing list archive) |
|---|---|
| State | New, archived |
| Series | blk-mq/scsi: Provide hostwide shared tags for SCSI HBAs |
On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: > > Fusion adapters can steer completions to individual queues, and > we now have support for shared host-wide tags. > So we can enable multiqueue support for fusion adapters and > drop the hand-crafted interrupt affinity settings. Hi Hannes, Ming Lei also proposed similar changes in megaraid_sas driver some time back and it had resulted in performance drop- https://patchwork.kernel.org/patch/10969511/ So, we will do some performance tests with this patch and update you. Thanks, Sumit > > Signed-off-by: Hannes Reinecke <hare@suse.com> > --- > drivers/scsi/megaraid/megaraid_sas.h | 1 - > drivers/scsi/megaraid/megaraid_sas_base.c | 65 +++++++++-------------------- > drivers/scsi/megaraid/megaraid_sas_fusion.c | 14 ++++--- > 3 files changed, 28 insertions(+), 52 deletions(-) > > diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h > index bd8184072bed..844ea2d6dbb8 100644 > --- a/drivers/scsi/megaraid/megaraid_sas.h > +++ b/drivers/scsi/megaraid/megaraid_sas.h > @@ -2261,7 +2261,6 @@ enum MR_PERF_MODE { > > struct megasas_instance { > > - unsigned int *reply_map; > __le32 *producer; > dma_addr_t producer_h; > __le32 *consumer; > diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c > index a4bc81479284..9d0d74e3d491 100644 > --- a/drivers/scsi/megaraid/megaraid_sas_base.c > +++ b/drivers/scsi/megaraid/megaraid_sas_base.c > @@ -37,6 +37,7 @@ > #include <linux/poll.h> > #include <linux/vmalloc.h> > #include <linux/irq_poll.h> > +#include <linux/blk-mq-pci.h> > > #include <scsi/scsi.h> > #include <scsi/scsi_cmnd.h> > @@ -3106,6 +3107,19 @@ megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, > return 0; > } > > +static int megasas_map_queues(struct Scsi_Host *shost) > +{ > + struct megasas_instance *instance; > + > + instance = (struct megasas_instance *)shost->hostdata; > + > + if (!instance->smp_affinity_enable) > + return 0; > + > + return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], > + instance->pdev, instance->low_latency_index_start); > +} > + > static void megasas_aen_polling(struct work_struct *work); > > /** > @@ -3414,9 +3428,11 @@ static struct scsi_host_template megasas_template = { > .eh_timed_out = megasas_reset_timer, > .shost_attrs = megaraid_host_attrs, > .bios_param = megasas_bios_param, > + .map_queues = megasas_map_queues, > .change_queue_depth = scsi_change_queue_depth, > .max_segment_size = 0xffffffff, > .no_write_same = 1, > + .host_tagset = 1, > }; > > /** > @@ -5695,34 +5711,6 @@ megasas_setup_jbod_map(struct megasas_instance *instance) > instance->use_seqnum_jbod_fp = false; > } > > -static void megasas_setup_reply_map(struct megasas_instance *instance) > -{ > - const struct cpumask *mask; > - unsigned int queue, cpu, low_latency_index_start; > - > - low_latency_index_start = instance->low_latency_index_start; > - > - for (queue = low_latency_index_start; queue < instance->msix_vectors; queue++) { > - mask = pci_irq_get_affinity(instance->pdev, queue); > - if (!mask) > - goto fallback; > - > - for_each_cpu(cpu, mask) > - instance->reply_map[cpu] = queue; > - } > - return; > - > -fallback: > - queue = low_latency_index_start; > - for_each_possible_cpu(cpu) { > - instance->reply_map[cpu] = queue; > - if (queue == (instance->msix_vectors - 1)) > - queue = low_latency_index_start; > - else > - queue++; > - } > -} > - > /** > * megasas_get_device_list - Get the PD and LD device list from FW. 
> * @instance: Adapter soft state > @@ -6021,12 +6009,6 @@ static int megasas_init_fw(struct megasas_instance *instance) > instance->is_rdpq = (scratch_pad_1 & MR_RDPQ_MODE_OFFSET) ? > 1 : 0; > > - if (instance->adapter_type >= INVADER_SERIES && > - !instance->msix_combined) { > - instance->msix_load_balance = true; > - instance->smp_affinity_enable = false; > - } > - > /* Save 1-15 reply post index address to local memory > * Index 0 is already saved from reg offset > * MPI2_REPLY_POST_HOST_INDEX_OFFSET > @@ -6145,8 +6127,6 @@ static int megasas_init_fw(struct megasas_instance *instance) > goto fail_init_adapter; > } > > - megasas_setup_reply_map(instance); > - > dev_info(&instance->pdev->dev, > "current msix/online cpus\t: (%d/%d)\n", > instance->msix_vectors, (unsigned int)num_online_cpus()); > @@ -6780,6 +6760,9 @@ static int megasas_io_attach(struct megasas_instance *instance) > host->max_id = MEGASAS_MAX_DEV_PER_CHANNEL; > host->max_lun = MEGASAS_MAX_LUN; > host->max_cmd_len = 16; > + if (instance->adapter_type != MFI_SERIES && instance->msix_vectors > 0) > + host->nr_hw_queues = instance->msix_vectors - > + instance->low_latency_index_start; > > /* > * Notify the mid-layer about the new controller > @@ -6947,11 +6930,6 @@ static inline int megasas_alloc_mfi_ctrl_mem(struct megasas_instance *instance) > */ > static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) > { > - instance->reply_map = kcalloc(nr_cpu_ids, sizeof(unsigned int), > - GFP_KERNEL); > - if (!instance->reply_map) > - return -ENOMEM; > - > switch (instance->adapter_type) { > case MFI_SERIES: > if (megasas_alloc_mfi_ctrl_mem(instance)) > @@ -6968,8 +6946,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) > > return 0; > fail: > - kfree(instance->reply_map); > - instance->reply_map = NULL; > return -ENOMEM; > } > > @@ -6982,7 +6958,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) > */ > static inline void megasas_free_ctrl_mem(struct megasas_instance *instance) > { > - kfree(instance->reply_map); > if (instance->adapter_type == MFI_SERIES) { > if (instance->producer) > dma_free_coherent(&instance->pdev->dev, sizeof(u32), > @@ -7645,8 +7620,6 @@ megasas_resume(struct pci_dev *pdev) > if (rval < 0) > goto fail_reenable_msix; > > - megasas_setup_reply_map(instance); > - > if (instance->adapter_type != MFI_SERIES) { > megasas_reset_reply_desc(instance); > if (megasas_ioc_init_fusion(instance)) { > diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c > index e301458bcbae..bae96b82bb10 100644 > --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c > +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c > @@ -2731,6 +2731,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance, > struct MR_PRIV_DEVICE *mrdev_priv; > struct RAID_CONTEXT *rctx; > struct RAID_CONTEXT_G35 *rctx_g35; > + u32 tag = blk_mq_unique_tag(scp->request); > > device_id = MEGASAS_DEV_INDEX(scp); > > @@ -2837,7 +2838,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance, > instance->msix_vectors)); > else > cmd->request_desc->SCSIIO.MSIxIndex = > - instance->reply_map[raw_smp_processor_id()]; > + blk_mq_unique_tag_to_hwq(tag); > > if (instance->adapter_type >= VENTURA_SERIES) { > /* FP for Optimal raid level 1. 
> @@ -3080,6 +3081,7 @@ megasas_build_syspd_fusion(struct megasas_instance *instance, > u16 pd_index = 0; > u16 os_timeout_value; > u16 timeout_limit; > + u32 tag = blk_mq_unique_tag(scmd->request); > struct MR_DRV_RAID_MAP_ALL *local_map_ptr; > struct RAID_CONTEXT *pRAID_Context; > struct MR_PD_CFG_SEQ_NUM_SYNC *pd_sync; > @@ -3169,7 +3171,7 @@ megasas_build_syspd_fusion(struct megasas_instance *instance, > instance->msix_vectors)); > else > cmd->request_desc->SCSIIO.MSIxIndex = > - instance->reply_map[raw_smp_processor_id()]; > + blk_mq_unique_tag_to_hwq(tag); > > if (!fp_possible) { > /* system pd firmware path */ > @@ -3373,7 +3375,7 @@ megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance, > { > struct megasas_cmd_fusion *cmd, *r1_cmd = NULL; > union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc; > - u32 index; > + u32 index, blk_tag, unique_tag; > > if ((megasas_cmd_type(scmd) == READ_WRITE_LDIO) && > instance->ldio_threshold && > @@ -3389,7 +3391,9 @@ megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance, > return SCSI_MLQUEUE_HOST_BUSY; > } > > - cmd = megasas_get_cmd_fusion(instance, scmd->request->tag); > + unique_tag = blk_mq_unique_tag(scmd->request); > + blk_tag = blk_mq_unique_tag_to_tag(unique_tag); > + cmd = megasas_get_cmd_fusion(instance, blk_tag); > > if (!cmd) { > atomic_dec(&instance->fw_outstanding); > @@ -3430,7 +3434,7 @@ megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance, > */ > if (cmd->r1_alt_dev_handle != MR_DEVHANDLE_INVALID) { > r1_cmd = megasas_get_cmd_fusion(instance, > - (scmd->request->tag + instance->max_fw_cmds)); > + (blk_tag + instance->max_fw_cmds)); > megasas_prepare_secondRaid1_IO(instance, cmd, r1_cmd); > } > > -- > 2.16.4 >
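For reference, the blk-mq helpers this patch switches to encode two values in a single word: blk_mq_unique_tag() packs the hardware-queue index into the upper 16 bits and the per-queue tag into the lower 16 bits. A sketch of the decoding, paraphrased from include/linux/blk-mq.h of that era (kernel version assumed, not quoted from the thread):

#include <linux/types.h>

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1U << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

/* upper 16 bits: index of the blk-mq hw queue the request was queued on */
static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

/* lower 16 bits: the tag within the (host-wide) tag space */
static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

This is why megasas_build_and_issue_cmd_fusion() above uses blk_mq_unique_tag_to_tag() to index the command array, while the build routines use blk_mq_unique_tag_to_hwq() to steer the completion.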
On 12/9/19 11:10 AM, Sumit Saxena wrote: > On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: >> >> Fusion adapters can steer completions to individual queues, and >> we now have support for shared host-wide tags. >> So we can enable multiqueue support for fusion adapters and >> drop the hand-crafted interrupt affinity settings. > > Hi Hannes, > > Ming Lei also proposed similar changes in megaraid_sas driver some > time back and it had resulted in performance drop- > https://patchwork.kernel.org/patch/10969511/ > > So, we will do some performance tests with this patch and update you. > Thank you. I'm aware of the results of Ming Lei's work, but I do hope this patchset performs better. And when you do performance measurements, can you please run with both the 'none' and 'mq-deadline' I/O schedulers? I've measured quite a performance improvement when using mq-deadline, up to the point where I've gotten on-par performance with the original, non-mq implementation. (As a data point, on my setup I've measured about 270k IOPS and 1092 MB/s throughput, running on just 2 SSDs.) But thanks for doing a performance test here. Cheers, Hannes
On 09/12/2019 10:10, Sumit Saxena wrote: > On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: >> >> Fusion adapters can steer completions to individual queues, and >> we now have support for shared host-wide tags. >> So we can enable multiqueue support for fusion adapters and >> drop the hand-crafted interrupt affinity settings. > > Hi Hannes, > > Ming Lei also proposed similar changes in megaraid_sas driver some > time back and it had resulted in performance drop- > https://patchwork.kernel.org/patch/10969511/ > > So, we will do some performance tests with this patch and update you. > Hi Sumit, I was wondering if you had a chance to do this test yet? It would be good to know, so we can try to progress this work. @Hannes, This shared sbitmap work now seems to conflict with Jens work on tag caching https://lore.kernel.org/linux-block/20200107163037.31745-1-axboe@kernel.dk/T/#t, but should be resolvable AFAICS (v1, anyway, which I checked). Anway, we seem to have stalled, which I feared... Thanks, John > Thanks, > Sumit >> >> Signed-off-by: Hannes Reinecke <hare@suse.com> >> --- >> drivers/scsi/megaraid/megaraid_sas.h | 1 - >> drivers/scsi/megaraid/megaraid_sas_base.c | 65 +++++++++-------------------- >> drivers/scsi/megaraid/megaraid_sas_fusion.c | 14 ++++--- >> 3 files changed, 28 insertions(+), 52 deletions(-) >> >> diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h >> index bd8184072bed..844ea2d6dbb8 100644 >> --- a/drivers/scsi/megaraid/megaraid_sas.h >> +++ b/drivers/scsi/megaraid/megaraid_sas.h >> @@ -2261,7 +2261,6 @@ enum MR_PERF_MODE { >> >> struct megasas_instance { >> >> - unsigned int *reply_map; >> __le32 *producer; >> dma_addr_t producer_h; >> __le32 *consumer; >> diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c >> index a4bc81479284..9d0d74e3d491 100644 >> --- a/drivers/scsi/megaraid/megaraid_sas_base.c >> +++ b/drivers/scsi/megaraid/megaraid_sas_base.c >> @@ -37,6 +37,7 @@ >> #include <linux/poll.h> >> #include <linux/vmalloc.h> >> #include <linux/irq_poll.h> >> +#include <linux/blk-mq-pci.h> >> >> #include <scsi/scsi.h> >> #include <scsi/scsi_cmnd.h> >> @@ -3106,6 +3107,19 @@ megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, >> return 0; >> } >> >> +static int megasas_map_queues(struct Scsi_Host *shost) >> +{ >> + struct megasas_instance *instance; >> + >> + instance = (struct megasas_instance *)shost->hostdata; >> + >> + if (!instance->smp_affinity_enable) >> + return 0; >> + >> + return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], >> + instance->pdev, instance->low_latency_index_start); >> +} >> + >> static void megasas_aen_polling(struct work_struct *work); >> >> /** >> @@ -3414,9 +3428,11 @@ static struct scsi_host_template megasas_template = { >> .eh_timed_out = megasas_reset_timer, >> .shost_attrs = megaraid_host_attrs, >> .bios_param = megasas_bios_param, >> + .map_queues = megasas_map_queues, >> .change_queue_depth = scsi_change_queue_depth, >> .max_segment_size = 0xffffffff, >> .no_write_same = 1, >> + .host_tagset = 1, >> }; >> >> /** >> @@ -5695,34 +5711,6 @@ megasas_setup_jbod_map(struct megasas_instance *instance) >> instance->use_seqnum_jbod_fp = false; >> } >> >> -static void megasas_setup_reply_map(struct megasas_instance *instance) >> -{ >> - const struct cpumask *mask; >> - unsigned int queue, cpu, low_latency_index_start; >> - >> - low_latency_index_start = instance->low_latency_index_start; >> - >> - for (queue 
= low_latency_index_start; queue < instance->msix_vectors; queue++) { >> - mask = pci_irq_get_affinity(instance->pdev, queue); >> - if (!mask) >> - goto fallback; >> - >> - for_each_cpu(cpu, mask) >> - instance->reply_map[cpu] = queue; >> - } >> - return; >> - >> -fallback: >> - queue = low_latency_index_start; >> - for_each_possible_cpu(cpu) { >> - instance->reply_map[cpu] = queue; >> - if (queue == (instance->msix_vectors - 1)) >> - queue = low_latency_index_start; >> - else >> - queue++; >> - } >> -} >> - >> /** >> * megasas_get_device_list - Get the PD and LD device list from FW. >> * @instance: Adapter soft state >> @@ -6021,12 +6009,6 @@ static int megasas_init_fw(struct megasas_instance *instance) >> instance->is_rdpq = (scratch_pad_1 & MR_RDPQ_MODE_OFFSET) ? >> 1 : 0; >> >> - if (instance->adapter_type >= INVADER_SERIES && >> - !instance->msix_combined) { >> - instance->msix_load_balance = true; >> - instance->smp_affinity_enable = false; >> - } >> - >> /* Save 1-15 reply post index address to local memory >> * Index 0 is already saved from reg offset >> * MPI2_REPLY_POST_HOST_INDEX_OFFSET >> @@ -6145,8 +6127,6 @@ static int megasas_init_fw(struct megasas_instance *instance) >> goto fail_init_adapter; >> } >> >> - megasas_setup_reply_map(instance); >> - >> dev_info(&instance->pdev->dev, >> "current msix/online cpus\t: (%d/%d)\n", >> instance->msix_vectors, (unsigned int)num_online_cpus()); >> @@ -6780,6 +6760,9 @@ static int megasas_io_attach(struct megasas_instance *instance) >> host->max_id = MEGASAS_MAX_DEV_PER_CHANNEL; >> host->max_lun = MEGASAS_MAX_LUN; >> host->max_cmd_len = 16; >> + if (instance->adapter_type != MFI_SERIES && instance->msix_vectors > 0) >> + host->nr_hw_queues = instance->msix_vectors - >> + instance->low_latency_index_start; >> >> /* >> * Notify the mid-layer about the new controller >> @@ -6947,11 +6930,6 @@ static inline int megasas_alloc_mfi_ctrl_mem(struct megasas_instance *instance) >> */ >> static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) >> { >> - instance->reply_map = kcalloc(nr_cpu_ids, sizeof(unsigned int), >> - GFP_KERNEL); >> - if (!instance->reply_map) >> - return -ENOMEM; >> - >> switch (instance->adapter_type) { >> case MFI_SERIES: >> if (megasas_alloc_mfi_ctrl_mem(instance)) >> @@ -6968,8 +6946,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) >> >> return 0; >> fail: >> - kfree(instance->reply_map); >> - instance->reply_map = NULL; >> return -ENOMEM; >> } >> >> @@ -6982,7 +6958,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) >> */ >> static inline void megasas_free_ctrl_mem(struct megasas_instance *instance) >> { >> - kfree(instance->reply_map); >> if (instance->adapter_type == MFI_SERIES) { >> if (instance->producer) >> dma_free_coherent(&instance->pdev->dev, sizeof(u32), >> @@ -7645,8 +7620,6 @@ megasas_resume(struct pci_dev *pdev) >> if (rval < 0) >> goto fail_reenable_msix; >> >> - megasas_setup_reply_map(instance); >> - >> if (instance->adapter_type != MFI_SERIES) { >> megasas_reset_reply_desc(instance); >> if (megasas_ioc_init_fusion(instance)) { >> diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c >> index e301458bcbae..bae96b82bb10 100644 >> --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c >> +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c >> @@ -2731,6 +2731,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance, >> struct MR_PRIV_DEVICE *mrdev_priv; >> struct RAID_CONTEXT *rctx; >> struct 
RAID_CONTEXT_G35 *rctx_g35; >> + u32 tag = blk_mq_unique_tag(scp->request); >> >> device_id = MEGASAS_DEV_INDEX(scp); >> >> @@ -2837,7 +2838,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance, >> instance->msix_vectors)); >> else >> cmd->request_desc->SCSIIO.MSIxIndex = >> - instance->reply_map[raw_smp_processor_id()]; >> + blk_mq_unique_tag_to_hwq(tag); >> >> if (instance->adapter_type >= VENTURA_SERIES) { >> /* FP for Optimal raid level 1. >> @@ -3080,6 +3081,7 @@ megasas_build_syspd_fusion(struct megasas_instance *instance, >> u16 pd_index = 0; >> u16 os_timeout_value; >> u16 timeout_limit; >> + u32 tag = blk_mq_unique_tag(scmd->request); >> struct MR_DRV_RAID_MAP_ALL *local_map_ptr; >> struct RAID_CONTEXT *pRAID_Context; >> struct MR_PD_CFG_SEQ_NUM_SYNC *pd_sync; >> @@ -3169,7 +3171,7 @@ megasas_build_syspd_fusion(struct megasas_instance *instance, >> instance->msix_vectors)); >> else >> cmd->request_desc->SCSIIO.MSIxIndex = >> - instance->reply_map[raw_smp_processor_id()]; >> + blk_mq_unique_tag_to_hwq(tag); >> >> if (!fp_possible) { >> /* system pd firmware path */ >> @@ -3373,7 +3375,7 @@ megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance, >> { >> struct megasas_cmd_fusion *cmd, *r1_cmd = NULL; >> union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc; >> - u32 index; >> + u32 index, blk_tag, unique_tag; >> >> if ((megasas_cmd_type(scmd) == READ_WRITE_LDIO) && >> instance->ldio_threshold && >> @@ -3389,7 +3391,9 @@ megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance, >> return SCSI_MLQUEUE_HOST_BUSY; >> } >> >> - cmd = megasas_get_cmd_fusion(instance, scmd->request->tag); >> + unique_tag = blk_mq_unique_tag(scmd->request); >> + blk_tag = blk_mq_unique_tag_to_tag(unique_tag); >> + cmd = megasas_get_cmd_fusion(instance, blk_tag); >> >> if (!cmd) { >> atomic_dec(&instance->fw_outstanding); >> @@ -3430,7 +3434,7 @@ megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance, >> */ >> if (cmd->r1_alt_dev_handle != MR_DEVHANDLE_INVALID) { >> r1_cmd = megasas_get_cmd_fusion(instance, >> - (scmd->request->tag + instance->max_fw_cmds)); >> + (blk_tag + instance->max_fw_cmds)); >> megasas_prepare_secondRaid1_IO(instance, cmd, r1_cmd); >> } >> >> -- >> 2.16.4 >> > . >
On 1/9/20 12:55 PM, John Garry wrote: > On 09/12/2019 10:10, Sumit Saxena wrote: >> On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: >>> >>> Fusion adapters can steer completions to individual queues, and >>> we now have support for shared host-wide tags. >>> So we can enable multiqueue support for fusion adapters and >>> drop the hand-crafted interrupt affinity settings. >> >> Hi Hannes, >> >> Ming Lei also proposed similar changes in megaraid_sas driver some >> time back and it had resulted in performance drop- >> https://patchwork.kernel.org/patch/10969511/ >> >> So, we will do some performance tests with this patch and update you. >> > > Hi Sumit, > > I was wondering if you had a chance to do this test yet? > > It would be good to know, so we can try to progress this work. > > @Hannes, This shared sbitmap work now seems to conflict with Jens work > on tag caching > https://lore.kernel.org/linux-block/20200107163037.31745-1-axboe@kernel.dk/T/#t, > but should be resolvable AFAICS (v1, anyway, which I checked). Anway, we > seem to have stalled, which I feared... > Thanks for the reminder. That was a topic I was wanting to discuss at LSF; will be sending a topic then. Cheers, Hannes
On 09/01/2020 15:19, Hannes Reinecke wrote: > On 1/9/20 12:55 PM, John Garry wrote: >> On 09/12/2019 10:10, Sumit Saxena wrote: >>> On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: >>>> >>>> Fusion adapters can steer completions to individual queues, and >>>> we now have support for shared host-wide tags. >>>> So we can enable multiqueue support for fusion adapters and >>>> drop the hand-crafted interrupt affinity settings. >>> >>> Hi Hannes, >>> >>> Ming Lei also proposed similar changes in megaraid_sas driver some >>> time back and it had resulted in performance drop- >>> https://patchwork.kernel.org/patch/10969511/ >>> >>> So, we will do some performance tests with this patch and update you. >>> >> >> Hi Sumit, >> >> I was wondering if you had a chance to do this test yet? >> >> It would be good to know, so we can try to progress this work. >> >> @Hannes, This shared sbitmap work now seems to conflict with Jens work >> on tag caching >> https://lore.kernel.org/linux-block/20200107163037.31745-1-axboe@kernel.dk/T/#t, >> but should be resolvable AFAICS (v1, anyway, which I checked). Anway, we >> seem to have stalled, which I feared... >> > Thanks for the reminder. > That was a topic I was wanting to discuss at LSF; will be sending a > topic then. Alright, but I am not really sure what else we need to wait months to discuss, unless this shared sbitmap approach is rejected and/or testing on other HBAs shows unsatisfactory performance. To summarize, as I see it, we have 3 topics to tackle:
- shared tags
- block hotplug improvement - Ming Lei had said that he will post another version of https://lore.kernel.org/linux-block/20191128020205.GB3277@ming.t460p/
- https://patchwork.kernel.org/cover/10967071/ - I'm not sure what's happening on that, but thought that it was somewhat straightforward.
If there's something else I've missed, then let me know. Cheers, John
On Thu, Jan 09, 2020 at 11:55:12AM +0000, John Garry wrote: > On 09/12/2019 10:10, Sumit Saxena wrote: > > On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: > > > > > > Fusion adapters can steer completions to individual queues, and > > > we now have support for shared host-wide tags. > > > So we can enable multiqueue support for fusion adapters and > > > drop the hand-crafted interrupt affinity settings. > > > > Hi Hannes, > > > > Ming Lei also proposed similar changes in megaraid_sas driver some > > time back and it had resulted in performance drop- > > https://patchwork.kernel.org/patch/10969511/ > > > > So, we will do some performance tests with this patch and update you. > > > > Hi Sumit, > > I was wondering if you had a chance to do this test yet? > > It would be good to know, so we can try to progress this work. Looks most of the comment in the following link isn't addressed: https://lore.kernel.org/linux-block/20191129002540.GA1829@ming.t460p/ > Firstly too much((nr_hw_queues - 1) times) memory is wasted. Secondly IO > latency could be increased by too deep scheduler queue depth. Finally CPU > could be wasted in the retrying of running busy hw queue. > > Wrt. driver tags, this patch may be worse, given the average limit for > each LUN is reduced by (nr_hw_queues) times, see hctx_may_queue(). > > Another change is bt_wait_ptr(). Before your patches, there is single > .wait_index, now the number of .wait_index is changed to nr_hw_queues. > > Also the run queue number is increased a lot in SCSI's IO completion, see > scsi_end_request(). I guess memory waste won't be a blocker. But it may not be one accepted behavior to reduce average active queue depth for each LUN by nr_hw_queues times, meantime scheduler queue depth is increased by nr_hw_queues times, compared with single queue. thanks, Ming
On Mon, Dec 9, 2019 at 4:32 PM Hannes Reinecke <hare@suse.de> wrote: > > On 12/9/19 11:10 AM, Sumit Saxena wrote: > > On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: > >> > >> Fusion adapters can steer completions to individual queues, and > >> we now have support for shared host-wide tags. > >> So we can enable multiqueue support for fusion adapters and > >> drop the hand-crafted interrupt affinity settings. > > > > Hi Hannes, > > > > Ming Lei also proposed similar changes in megaraid_sas driver some > > time back and it had resulted in performance drop- > > https://patchwork.kernel.org/patch/10969511/ > > > > So, we will do some performance tests with this patch and update you. > > Thank you. > > I'm aware of the results of Ming Leis work, but I do hope this patchset > performs better. > > And when you do performance measurements, can you please run with both, > 'none' I/O scheduler and 'mq-deadline' I/O scheduler? > I've measured quite a performance improvements when using mq-deadline, > up to the point where I've gotten on-par performance with the original, > non-mq, implementation. > (As a data point, on my setup I've measured about 270k IOPS and 1092 > MB/s througput, running on just 2 SSDs). > > But thanks for doing a performance test here. Hi Hannes, Sorry for the delay in replying, I observed a few issues with this patchset: 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to which IO submitter CPU is affined with. Due to this IO submission and completion CPUs are different which causes performance drop for low latency workloads. lspcu: # lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 72 On-line CPU(s) list: 0-71 Thread(s) per core: 2 Core(s) per socket: 18 Socket(s): 2 NUMA node(s): 2 Vendor ID: GenuineIntel CPU family: 6 Model: 85 Model name: Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz Stepping: 4 CPU MHz: 3204.246 CPU max MHz: 3700.0000 CPU min MHz: 1200.0000 BogoMIPS: 5400.00 Virtualization: VT-x L1d cache: 32K L1i cache: 32K L2 cache: 1024K L3 cache: 25344K NUMA node0 CPU(s): 0-17,36-53 NUMA node1 CPU(s): 18-35,54-71 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe s yscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single intel_ppin mba tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_lo fio test: #numactl -N 0 fio jbod.fio # cat jbod.fio [global] allow_mounted_write=0 ioengine=libaio buffered=0 direct=1 group_reporting iodepth=1 bs=4096 readwrite=randread .. In this test, IOs are submitted on Node 0 but observed IO completions on Node 1. 
IRQs / 1 second(s) IRQ# TOTAL NODE0 NODE1 NAME 176 48387 48387 0 IR-PCI-MSI 34603073-edge megasas14-msix65 178 47966 47966 0 IR-PCI-MSI 34603075-edge megasas14-msix67 170 47706 47706 0 IR-PCI-MSI 34603067-edge megasas14-msix59 180 47291 47291 0 IR-PCI-MSI 34603077-edge megasas14-msix69 181 47155 47155 0 IR-PCI-MSI 34603078-edge megasas14-msix70 173 46806 46806 0 IR-PCI-MSI 34603070-edge megasas14-msix62 179 46773 46773 0 IR-PCI-MSI 34603076-edge megasas14-msix68 169 46600 46600 0 IR-PCI-MSI 34603066-edge megasas14-msix58 175 46447 46447 0 IR-PCI-MSI 34603072-edge megasas14-msix64 172 46184 46184 0 IR-PCI-MSI 34603069-edge megasas14-msix61 182 46117 46117 0 IR-PCI-MSI 34603079-edge megasas14-msix71 165 46070 46070 0 IR-PCI-MSI 34603062-edge megasas14-msix54 164 45892 45892 0 IR-PCI-MSI 34603061-edge megasas14-msix53 174 45864 45864 0 IR-PCI-MSI 34603071-edge megasas14-msix63 156 45348 45348 0 IR-PCI-MSI 34603053-edge megasas14-msix45 147 45302 0 45302 IR-PCI-MSI 34603044-edge megasas14-msix36 151 44922 0 44922 IR-PCI-MSI 34603048-edge megasas14-msix40 171 44876 44876 0 IR-PCI-MSI 34603068-edge megasas14-msix60 159 44755 44755 0 IR-PCI-MSI 34603056-edge megasas14-msix48 148 44695 0 44695 IR-PCI-MSI 34603045-edge megasas14-msix37 157 44304 44304 0 IR-PCI-MSI 34603054-edge megasas14-msix46 167 42552 42552 0 IR-PCI-MSI 34603064-edge megasas14-msix56 154 35937 0 35937 IR-PCI-MSI 34603051-edge megasas14-msix43 166 16004 16004 0 IR-PCI-MSI 34603063-edge megasas14-msix55 IRQ-CPU affinity: Ran below script to get IRQ-CPU affinity: -- #!/bin/bash PCID=`lspci | grep "SAS39xx" | cut -c1-7` PCIP=`find /sys/devices -name *$PCID | grep pci` IRQS=`ls $PCIP/msi_irqs` echo "kernel version: " uname -a for IRQ in $IRQS; do CPUS=`cat /proc/irq/$IRQ/smp_affinity_list` echo "irq $IRQ, cpu list $CPUS" done -- irq 103, cpu list 0-17,36-53 irq 112, cpu list 0-17,36-53 irq 113, cpu list 0-17,36-53 irq 114, cpu list 0-17,36-53 irq 115, cpu list 0-17,36-53 irq 116, cpu list 0-17,36-53 irq 117, cpu list 0-17,36-53 irq 118, cpu list 0-17,36-53 irq 119, cpu list 18 irq 120, cpu list 19 irq 121, cpu list 20 irq 122, cpu list 21 irq 123, cpu list 22 irq 124, cpu list 23 irq 125, cpu list 24 irq 126, cpu list 25 irq 127, cpu list 26 irq 128, cpu list 27 irq 129, cpu list 28 irq 130, cpu list 29 irq 131, cpu list 30 irq 132, cpu list 31 irq 133, cpu list 32 irq 134, cpu list 33 irq 135, cpu list 34 irq 136, cpu list 35 irq 137, cpu list 54 irq 138, cpu list 55 irq 139, cpu list 56 irq 140, cpu list 57 irq 141, cpu list 58 irq 142, cpu list 59 irq 143, cpu list 60 irq 144, cpu list 61 irq 145, cpu list 62 irq 146, cpu list 63 irq 147, cpu list 64 irq 148, cpu list 65 irq 149, cpu list 66 irq 150, cpu list 67 irq 151, cpu list 68 irq 152, cpu list 69 irq 153, cpu list 70 irq 154, cpu list 71 irq 155, cpu list 0 irq 156, cpu list 1 irq 157, cpu list 2 irq 158, cpu list 3 irq 159, cpu list 4 irq 160, cpu list 5 irq 161, cpu list 6 irq 162, cpu list 7 irq 163, cpu list 8 irq 164, cpu list 9 irq 165, cpu list 10 irq 166, cpu list 11 irq 167, cpu list 12 irq 168, cpu list 13 irq 169, cpu list 14 irq 170, cpu list 15 irq 171, cpu list 16 irq 172, cpu list 17 irq 173, cpu list 36 irq 174, cpu list 37 irq 175, cpu list 38 irq 176, cpu list 39 irq 177, cpu list 40 irq 178, cpu list 41 irq 179, cpu list 42 irq 180, cpu list 43 irq 181, cpu list 44 irq 182, cpu list 45 irq 183, cpu list 46 irq 184, cpu list 47 irq 185, cpu list 48 irq 186, cpu list 49 irq 187, cpu list 50 irq 188, cpu list 51 irq 189, cpu list 52 irq 190, cpu list 53 I 
added prints in megaraid_sas driver's IO path to catch when MSI-x affined to IO submitter CPU does not match with what is returned by API "blk_mq_unique_tag_to_hwq(tag)". I have copied below few prints from dmesg logs- in these prints CPU is submitter CPU, calculated MSX-x is what is returned by "blk_mq_unique_tag_to_hwq(tag)" and affined MSI-x is actual MSI-x to which submitting CPU is affined to: [2536843.629877] BRCM DBG: CPU:6 calculated MSI-x: 153 affined MSIx: 161 [2536843.641674] BRCM DBG: CPU:39 calculated MSI-x: 168 affined MSIx: 176 [2536843.641674] BRCM DBG: CPU:13 calculated MSI-x: 160 affined MSIx: 168 2. Seeing below stack traces/messages in dmesg during driver unload – [2565601.054366] Call Trace: [2565601.054368] blk_mq_free_map_and_requests+0x28/0x50 [2565601.054369] blk_mq_free_tag_set+0x1d/0x90 [2565601.054370] scsi_host_dev_release+0x8a/0xf0 [2565601.054370] device_release+0x27/0x80 [2565601.054371] kobject_cleanup+0x61/0x190 [2565601.054373] megasas_detach_one+0x4c1/0x650 [megaraid_sas] [2565601.054374] pci_device_remove+0x3b/0xc0 [2565601.054375] device_release_driver_internal+0xec/0x1b0 [2565601.054376] driver_detach+0x46/0x90 [2565601.054377] bus_remove_driver+0x58/0xd0 [2565601.054378] pci_unregister_driver+0x26/0xa0 [2565601.054379] megasas_exit+0x91/0x882 [megaraid_sas] [2565601.054381] __x64_sys_delete_module+0x16c/0x250 [2565601.054382] do_syscall_64+0x5b/0x1b0 [2565601.054383] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [2565601.054383] RIP: 0033:0x7f7212a82837 [2565601.054384] RSP: 002b:00007ffdfa2dcea8 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0 [2565601.054385] RAX: ffffffffffffffda RBX: 0000000000b6e2e0 RCX: 00007f7212a82837 [2565601.054385] RDX: 00007f7212af3ac0 RSI: 0000000000000800 RDI: 0000000000b6e348 [2565601.054386] RBP: 0000000000000000 R08: 00007f7212d47060 R09: 00007f7212af3ac0 [2565601.054386] R10: 00007ffdfa2dcbc0 R11: 0000000000000202 R12: 00007ffdfa2dd71c [2565601.054387] R13: 0000000000000000 R14: 0000000000b6e2e0 R15: 0000000000b6e010 [2565601.054387] ---[ end trace 38899303bd85e838 ]--- [2565601.054390] ------------[ cut here ]------------ [2565601.054391] WARNING: CPU: 31 PID: 50996 at block/blk-mq.c:2056 blk_mq_free_rqs+0x10b/0x120 [2565601.054391] Modules linked in: megaraid_sas(-) ses enclosure scsi_transport_sas xt_CHECKSUM xt_MASQUERADE tun bridge stp llc ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute ip6table_nat ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter intel_rapl_msr intel_rapl_common skx_edac nfit libnvdimm ftdi_sio x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel snd_hda_codec_realtek kvm snd_hda_codec_generic ledtrig_audio snd_hda_intel snd_intel_nhlt snd_hda_codec snd_hda_core irqbypass snd_hwdep crct10dif_pclmul snd_seq crc32_pclmul ghash_clmulni_intel snd_seq_device snd_pcm iTCO_wdt mei_me iTCO_vendor_support cryptd snd_timer joydev snd mei soundcore ioatdma sg pcspkr ipmi_si ipmi_devintf ipmi_msghandler dca i2c_i801 lpc_ich acpi_power_meter wmi [2565601.054400] acpi_pad nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod ast i2c_algo_bit drm_vram_helper ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm e1000e ahci libahci crc32c_intel nvme libata nvme_core i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: 
megaraid_sas] [2565601.054404] CPU: 31 PID: 50996 Comm: rmmod Kdump: loaded Tainted: G W OE 5.4.0-rc1+ #2 [2565601.054405] Hardware name: Supermicro Super Server/X11DPG-QT, BIOS 1.0 06/22/2017 [2565601.054406] RIP: 0010:blk_mq_free_rqs+0x10b/0x120 [2565601.054407] Code: 89 10 48 8b 73 20 48 89 1b 4c 89 e7 48 89 5b 08 e8 2a 54 e7 ff 48 8b 85 b0 00 00 00 49 39 c5 75 bc 5b 5d 41 5c 41 5d 41 5e c3 <0f> 0b 48 c7 02 00 00 00 00 e9 2b ff ff ff 0f 1f 80 00 00 00 00 0f [2565601.054407] RSP: 0018:ffffb37446a6bd58 EFLAGS: 00010286 [2565601.054408] RAX: 0000000000000747 RBX: ffff92219cb280a8 RCX: 0000000000000747 [2565601.054408] RDX: ffff92219b153a38 RSI: ffff9221692bb5c0 RDI: ffff92219cb280a8 [2565601.054408] RBP: ffff9221692bb5c0 R08: 0000000000000001 R09: 0000000000000000 [2565601.054409] R10: ffff9221692bb680 R11: ffff9221bffd5f60 R12: ffff9221970dd678 [2565601.054409] R13: 0000000000000030 R14: ffff92219cb280a8 R15: ffff922199ada010 [2565601.054410] FS: 00007f72135a1740(0000) GS:ffff92299f540000(0000) knlGS:0000000000000000 [2565601.054410] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [2565601.054410] CR2: 0000000000b77c78 CR3: 0000000f27db0006 CR4: 00000000007606e0 [2565601.054412] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [2565601.054412] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [2565601.054413] PKRU: 55555554 3. For High IOs(outstanding IOs per physical disk > 8) oriented workloads, performance numbers are good(no performance drop) as in that case driver uses non-managed affinity high IOPs reply queues and this patchset does not touch driver's high IOPs IO path. 4. This patch removes below code from driver so what this piece of code does is broken- - if (instance->adapter_type >= INVADER_SERIES && - !instance->msix_combined) { - instance->msix_load_balance = true; - instance->smp_affinity_enable = false; - } Thanks, Sumit > > Cheers, > > Hannes > -- > Dr. Hannes Reinecke Teamlead Storage & Networking > hare@suse.de +49 911 74053 688 > SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg > HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
On 10/01/2020 02:00, Ming Lei wrote: > On Thu, Jan 09, 2020 at 11:55:12AM +0000, John Garry wrote: >> On 09/12/2019 10:10, Sumit Saxena wrote: >>> On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: >>>> >>>> Fusion adapters can steer completions to individual queues, and >>>> we now have support for shared host-wide tags. >>>> So we can enable multiqueue support for fusion adapters and >>>> drop the hand-crafted interrupt affinity settings. >>> >>> Hi Hannes, >>> >>> Ming Lei also proposed similar changes in megaraid_sas driver some >>> time back and it had resulted in performance drop- >>> https://patchwork.kernel.org/patch/10969511/ >>> >>> So, we will do some performance tests with this patch and update you. >>> >> >> Hi Sumit, >> >> I was wondering if you had a chance to do this test yet? >> >> It would be good to know, so we can try to progress this work. > > Looks most of the comment in the following link isn't addressed: > > https://lore.kernel.org/linux-block/20191129002540.GA1829@ming.t460p/ OK, but I was waiting for results first, which I hoped would not take too long. They weren't forgotten, for sure. Let me check them now. > >> Firstly too much((nr_hw_queues - 1) times) memory is wasted. Secondly IO >> latency could be increased by too deep scheduler queue depth. Finally CPU >> could be wasted in the retrying of running busy hw queue. >> >> Wrt. driver tags, this patch may be worse, given the average limit for >> each LUN is reduced by (nr_hw_queues) times, see hctx_may_queue(). >> >> Another change is bt_wait_ptr(). Before your patches, there is single >> .wait_index, now the number of .wait_index is changed to nr_hw_queues. >> >> Also the run queue number is increased a lot in SCSI's IO completion, see >> scsi_end_request(). > > I guess memory waste won't be a blocker. Yeah, that's a trade-off. And, as I remember, memory waste does not seem extreme. > > But it may not be one accepted behavior to reduce average active queue > depth for each LUN by nr_hw_queues times, meantime scheduler queue depth > is increased by nr_hw_queues times, compared with single queue. > Thanks, John
On 10/01/2020 04:00, Sumit Saxena wrote: > On Mon, Dec 9, 2019 at 4:32 PM Hannes Reinecke <hare@suse.de> wrote: >> >> On 12/9/19 11:10 AM, Sumit Saxena wrote: >>> On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: >>>> >>>> Fusion adapters can steer completions to individual queues, and >>>> we now have support for shared host-wide tags. >>>> So we can enable multiqueue support for fusion adapters and >>>> drop the hand-crafted interrupt affinity settings. >>> >>> Hi Hannes, >>> >>> Ming Lei also proposed similar changes in megaraid_sas driver some >>> time back and it had resulted in performance drop- >>> https://patchwork.kernel.org/patch/10969511/ >>> >>> So, we will do some performance tests with this patch and update you. >>> Thank you. >> >> I'm aware of the results of Ming Leis work, but I do hope this patchset >> performs better. >> >> And when you do performance measurements, can you please run with both, >> 'none' I/O scheduler and 'mq-deadline' I/O scheduler? >> I've measured quite a performance improvements when using mq-deadline, >> up to the point where I've gotten on-par performance with the original, >> non-mq, implementation. >> (As a data point, on my setup I've measured about 270k IOPS and 1092 >> MB/s througput, running on just 2 SSDs). >> >> But thanks for doing a performance test here. > Hi Sumit, Thanks for this. We'll have a look ... Cheers, John EOM > Hi Hannes, > > Sorry for the delay in replying, I observed a few issues with this patchset: > > 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to > which IO submitter CPU is affined with. Due to this IO submission and > completion CPUs are different which causes performance drop for low > latency workloads. > > lspcu: > > # lscpu > Architecture: x86_64 > CPU op-mode(s): 32-bit, 64-bit > Byte Order: Little Endian > CPU(s): 72 > On-line CPU(s) list: 0-71 > Thread(s) per core: 2 > Core(s) per socket: 18 > Socket(s): 2 > NUMA node(s): 2 > Vendor ID: GenuineIntel > CPU family: 6 > Model: 85 > Model name: Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz > Stepping: 4 > CPU MHz: 3204.246 > CPU max MHz: 3700.0000 > CPU min MHz: 1200.0000 > BogoMIPS: 5400.00 > Virtualization: VT-x > L1d cache: 32K > L1i cache: 32K > L2 cache: 1024K > L3 cache: 25344K > NUMA node0 CPU(s): 0-17,36-53 > NUMA node1 CPU(s): 18-35,54-71 > Flags: fpu vme de pse tsc msr pae mce cx8 apic sep > mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht > tm pbe s > yscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts > rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq > dtes64 monitor > ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 > sse4_2 x2apic movbe popcnt tsc_deadline_timer xsave avx f16c rdrand > lahf_lm abm > 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single intel_ppin > mba tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust > bmi1 hle > avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed > adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt > xsavec > xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_lo > > > fio test: > > #numactl -N 0 fio jbod.fio > > # cat jbod.fio > [global] > allow_mounted_write=0 > ioengine=libaio > buffered=0 > direct=1 > group_reporting > iodepth=1 > bs=4096 > readwrite=randread > .. > > In this test, IOs are submitted on Node 0 but observed IO completions on Node 1. 
> > IRQs / 1 second(s) > IRQ# TOTAL NODE0 NODE1 NAME > 176 48387 48387 0 IR-PCI-MSI 34603073-edge megasas14-msix65 > 178 47966 47966 0 IR-PCI-MSI 34603075-edge megasas14-msix67 > 170 47706 47706 0 IR-PCI-MSI 34603067-edge megasas14-msix59 > 180 47291 47291 0 IR-PCI-MSI 34603077-edge megasas14-msix69 > 181 47155 47155 0 IR-PCI-MSI 34603078-edge megasas14-msix70 > 173 46806 46806 0 IR-PCI-MSI 34603070-edge megasas14-msix62 > 179 46773 46773 0 IR-PCI-MSI 34603076-edge megasas14-msix68 > 169 46600 46600 0 IR-PCI-MSI 34603066-edge megasas14-msix58 > 175 46447 46447 0 IR-PCI-MSI 34603072-edge megasas14-msix64 > 172 46184 46184 0 IR-PCI-MSI 34603069-edge megasas14-msix61 > 182 46117 46117 0 IR-PCI-MSI 34603079-edge megasas14-msix71 > 165 46070 46070 0 IR-PCI-MSI 34603062-edge megasas14-msix54 > 164 45892 45892 0 IR-PCI-MSI 34603061-edge megasas14-msix53 > 174 45864 45864 0 IR-PCI-MSI 34603071-edge megasas14-msix63 > 156 45348 45348 0 IR-PCI-MSI 34603053-edge megasas14-msix45 > 147 45302 0 45302 IR-PCI-MSI 34603044-edge megasas14-msix36 > 151 44922 0 44922 IR-PCI-MSI 34603048-edge megasas14-msix40 > 171 44876 44876 0 IR-PCI-MSI 34603068-edge megasas14-msix60 > 159 44755 44755 0 IR-PCI-MSI 34603056-edge megasas14-msix48 > 148 44695 0 44695 IR-PCI-MSI 34603045-edge megasas14-msix37 > 157 44304 44304 0 IR-PCI-MSI 34603054-edge megasas14-msix46 > 167 42552 42552 0 IR-PCI-MSI 34603064-edge megasas14-msix56 > 154 35937 0 35937 IR-PCI-MSI 34603051-edge megasas14-msix43 > 166 16004 16004 0 IR-PCI-MSI 34603063-edge megasas14-msix55 > > > IRQ-CPU affinity: > > Ran below script to get IRQ-CPU affinity: > -- > #!/bin/bash > PCID=`lspci | grep "SAS39xx" | cut -c1-7` > PCIP=`find /sys/devices -name *$PCID | grep pci` > IRQS=`ls $PCIP/msi_irqs` > > echo "kernel version: " > uname -a > > for IRQ in $IRQS; do > CPUS=`cat /proc/irq/$IRQ/smp_affinity_list` > echo "irq $IRQ, cpu list $CPUS" > done > > -- > > irq 103, cpu list 0-17,36-53 > irq 112, cpu list 0-17,36-53 > irq 113, cpu list 0-17,36-53 > irq 114, cpu list 0-17,36-53 > irq 115, cpu list 0-17,36-53 > irq 116, cpu list 0-17,36-53 > irq 117, cpu list 0-17,36-53 > irq 118, cpu list 0-17,36-53 > irq 119, cpu list 18 > irq 120, cpu list 19 > irq 121, cpu list 20 > irq 122, cpu list 21 > irq 123, cpu list 22 > irq 124, cpu list 23 > irq 125, cpu list 24 > irq 126, cpu list 25 > irq 127, cpu list 26 > irq 128, cpu list 27 > irq 129, cpu list 28 > irq 130, cpu list 29 > irq 131, cpu list 30 > irq 132, cpu list 31 > irq 133, cpu list 32 > irq 134, cpu list 33 > irq 135, cpu list 34 > irq 136, cpu list 35 > irq 137, cpu list 54 > irq 138, cpu list 55 > irq 139, cpu list 56 > irq 140, cpu list 57 > irq 141, cpu list 58 > irq 142, cpu list 59 > irq 143, cpu list 60 > irq 144, cpu list 61 > irq 145, cpu list 62 > irq 146, cpu list 63 > irq 147, cpu list 64 > irq 148, cpu list 65 > irq 149, cpu list 66 > irq 150, cpu list 67 > irq 151, cpu list 68 > irq 152, cpu list 69 > irq 153, cpu list 70 > irq 154, cpu list 71 > irq 155, cpu list 0 > irq 156, cpu list 1 > irq 157, cpu list 2 > irq 158, cpu list 3 > irq 159, cpu list 4 > irq 160, cpu list 5 > irq 161, cpu list 6 > irq 162, cpu list 7 > irq 163, cpu list 8 > irq 164, cpu list 9 > irq 165, cpu list 10 > irq 166, cpu list 11 > irq 167, cpu list 12 > irq 168, cpu list 13 > irq 169, cpu list 14 > irq 170, cpu list 15 > irq 171, cpu list 16 > irq 172, cpu list 17 > irq 173, cpu list 36 > irq 174, cpu list 37 > irq 175, cpu list 38 > irq 176, cpu list 39 > irq 177, cpu list 40 > irq 178, cpu list 41 > irq 179, cpu list 42 > 
irq 180, cpu list 43 > irq 181, cpu list 44 > irq 182, cpu list 45 > irq 183, cpu list 46 > irq 184, cpu list 47 > irq 185, cpu list 48 > irq 186, cpu list 49 > irq 187, cpu list 50 > irq 188, cpu list 51 > irq 189, cpu list 52 > irq 190, cpu list 53 > > > I added prints in megaraid_sas driver's IO path to catch when MSI-x > affined to IO submitter CPU does not match with what is returned by > API "blk_mq_unique_tag_to_hwq(tag)". > I have copied below few prints from dmesg logs- in these prints CPU is > submitter CPU, calculated MSX-x is what is returned by > "blk_mq_unique_tag_to_hwq(tag)" and affined MSI-x is actual MSI-x to > which submitting > CPU is affined to: > > [2536843.629877] BRCM DBG: CPU:6 calculated MSI-x: 153 affined MSIx: 161 > [2536843.641674] BRCM DBG: CPU:39 calculated MSI-x: 168 affined MSIx: 176 > [2536843.641674] BRCM DBG: CPU:13 calculated MSI-x: 160 affined MSIx: 168 > > > 2. Seeing below stack traces/messages in dmesg during driver unload – > > [2565601.054366] Call Trace: > [2565601.054368] blk_mq_free_map_and_requests+0x28/0x50 > [2565601.054369] blk_mq_free_tag_set+0x1d/0x90 > [2565601.054370] scsi_host_dev_release+0x8a/0xf0 > [2565601.054370] device_release+0x27/0x80 > [2565601.054371] kobject_cleanup+0x61/0x190 > [2565601.054373] megasas_detach_one+0x4c1/0x650 [megaraid_sas] > [2565601.054374] pci_device_remove+0x3b/0xc0 > [2565601.054375] device_release_driver_internal+0xec/0x1b0 > [2565601.054376] driver_detach+0x46/0x90 > [2565601.054377] bus_remove_driver+0x58/0xd0 > [2565601.054378] pci_unregister_driver+0x26/0xa0 > [2565601.054379] megasas_exit+0x91/0x882 [megaraid_sas] > [2565601.054381] __x64_sys_delete_module+0x16c/0x250 > [2565601.054382] do_syscall_64+0x5b/0x1b0 > [2565601.054383] entry_SYSCALL_64_after_hwframe+0x44/0xa9 > [2565601.054383] RIP: 0033:0x7f7212a82837 > [2565601.054384] RSP: 002b:00007ffdfa2dcea8 EFLAGS: 00000202 ORIG_RAX: > 00000000000000b0 > [2565601.054385] RAX: ffffffffffffffda RBX: 0000000000b6e2e0 RCX: > 00007f7212a82837 > [2565601.054385] RDX: 00007f7212af3ac0 RSI: 0000000000000800 RDI: > 0000000000b6e348 > [2565601.054386] RBP: 0000000000000000 R08: 00007f7212d47060 R09: > 00007f7212af3ac0 > [2565601.054386] R10: 00007ffdfa2dcbc0 R11: 0000000000000202 R12: > 00007ffdfa2dd71c > [2565601.054387] R13: 0000000000000000 R14: 0000000000b6e2e0 R15: > 0000000000b6e010 > [2565601.054387] ---[ end trace 38899303bd85e838 ]--- > [2565601.054390] ------------[ cut here ]------------ > [2565601.054391] WARNING: CPU: 31 PID: 50996 at block/blk-mq.c:2056 > blk_mq_free_rqs+0x10b/0x120 > [2565601.054391] Modules linked in: megaraid_sas(-) ses enclosure > scsi_transport_sas xt_CHECKSUM xt_MASQUERADE tun bridge stp llc > ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 > xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute ip6table_nat > ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_nat > nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle > iptable_security iptable_raw ebtable_filter ebtables ip6table_filter > ip6_tables iptable_filter intel_rapl_msr intel_rapl_common skx_edac > nfit libnvdimm ftdi_sio x86_pkg_temp_thermal intel_powerclamp coretemp > kvm_intel snd_hda_codec_realtek kvm snd_hda_codec_generic > ledtrig_audio snd_hda_intel snd_intel_nhlt snd_hda_codec snd_hda_core > irqbypass snd_hwdep crct10dif_pclmul snd_seq crc32_pclmul > ghash_clmulni_intel snd_seq_device snd_pcm iTCO_wdt mei_me > iTCO_vendor_support cryptd snd_timer joydev snd mei soundcore ioatdma > sg pcspkr ipmi_si ipmi_devintf 
ipmi_msghandler dca i2c_i801 lpc_ich > acpi_power_meter wmi > [2565601.054400] acpi_pad nfsd auth_rpcgss nfs_acl lockd grace sunrpc > ip_tables xfs libcrc32c sd_mod ast i2c_algo_bit drm_vram_helper ttm > drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm > e1000e ahci libahci crc32c_intel nvme libata nvme_core i2c_core > dm_mirror dm_region_hash dm_log dm_mod [last unloaded: megaraid_sas] > [2565601.054404] CPU: 31 PID: 50996 Comm: rmmod Kdump: loaded Tainted: > G W OE 5.4.0-rc1+ #2 > [2565601.054405] Hardware name: Supermicro Super Server/X11DPG-QT, > BIOS 1.0 06/22/2017 > [2565601.054406] RIP: 0010:blk_mq_free_rqs+0x10b/0x120 > [2565601.054407] Code: 89 10 48 8b 73 20 48 89 1b 4c 89 e7 48 89 5b 08 > e8 2a 54 e7 ff 48 8b 85 b0 00 00 00 49 39 c5 75 bc 5b 5d 41 5c 41 5d > 41 5e c3 <0f> 0b 48 c7 02 00 00 00 00 e9 2b ff ff ff 0f 1f 80 00 00 00 > 00 0f > [2565601.054407] RSP: 0018:ffffb37446a6bd58 EFLAGS: 00010286 > [2565601.054408] RAX: 0000000000000747 RBX: ffff92219cb280a8 RCX: > 0000000000000747 > [2565601.054408] RDX: ffff92219b153a38 RSI: ffff9221692bb5c0 RDI: > ffff92219cb280a8 > [2565601.054408] RBP: ffff9221692bb5c0 R08: 0000000000000001 R09: > 0000000000000000 > [2565601.054409] R10: ffff9221692bb680 R11: ffff9221bffd5f60 R12: > ffff9221970dd678 > [2565601.054409] R13: 0000000000000030 R14: ffff92219cb280a8 R15: > ffff922199ada010 > [2565601.054410] FS: 00007f72135a1740(0000) GS:ffff92299f540000(0000) > knlGS:0000000000000000 > [2565601.054410] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [2565601.054410] CR2: 0000000000b77c78 CR3: 0000000f27db0006 CR4: > 00000000007606e0 > [2565601.054412] DR0: 0000000000000000 DR1: 0000000000000000 DR2: > 0000000000000000 > [2565601.054412] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: > 0000000000000400 > [2565601.054413] PKRU: 55555554 > > > 3. For High IOs(outstanding IOs per physical disk > 8) oriented > workloads, performance numbers are good(no performance drop) as in > that case driver uses non-managed affinity high IOPs reply queues and > this patchset does not touch driver's high IOPs IO path. > > 4. This patch removes below code from driver so what this piece of > code does is broken- > > > - if (instance->adapter_type >= INVADER_SERIES && > - !instance->msix_combined) { > - instance->msix_load_balance = true; > - instance->smp_affinity_enable = false; > - } > > Thanks, > Sumit >> >> Cheers, >> >> Hannes >> -- >> Dr. Hannes Reinecke Teamlead Storage & Networking >> hare@suse.de +49 911 74053 688 >> SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg >> HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer > . >
On 10/01/2020 04:00, Sumit Saxena wrote: > On Mon, Dec 9, 2019 at 4:32 PM Hannes Reinecke <hare@suse.de> wrote: >> >> On 12/9/19 11:10 AM, Sumit Saxena wrote: >>> On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: >>>> >>>> Fusion adapters can steer completions to individual queues, and >>>> we now have support for shared host-wide tags. >>>> So we can enable multiqueue support for fusion adapters and >>>> drop the hand-crafted interrupt affinity settings. >>> >>> Hi Hannes, >>> >>> Ming Lei also proposed similar changes in megaraid_sas driver some >>> time back and it had resulted in performance drop- >>> https://patchwork.kernel.org/patch/10969511/ >>> >>> So, we will do some performance tests with this patch and update you. >>> Thank you. >> >> I'm aware of the results of Ming Leis work, but I do hope this patchset >> performs better. >> >> And when you do performance measurements, can you please run with both, >> 'none' I/O scheduler and 'mq-deadline' I/O scheduler? >> I've measured quite a performance improvements when using mq-deadline, >> up to the point where I've gotten on-par performance with the original, >> non-mq, implementation. >> (As a data point, on my setup I've measured about 270k IOPS and 1092 >> MB/s througput, running on just 2 SSDs). >>asas_build_ldio_fusion >> But thanks for doing a performance test here. > > Hi Hannes, > > Sorry for the delay in replying, I observed a few issues with this patchset: > > 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to > which IO submitter CPU is affined with. Due to this IO submission and > completion CPUs are different which causes performance drop for low > latency workloads. Hi Sumit, So the new code has: megasas_build_ldio_fusion() { cmd->request_desc->SCSIIO.MSIxIndex = blk_mq_unique_tag_to_hwq(tag); } So the value here is hw queue index from blk-mq point of view, and not megaraid_sas msix index, as you alluded to. So we get 80 msix, 8 are reserved for low_latency_index_start (that's how it seems to me), and we report other 72 as #hw queues = 72 to SCSI midlayer. 
So I think that this should be: cmd->request_desc->SCSIIO.MSIxIndex = blk_mq_unique_tag_to_hwq(tag) + low_latency_index_start; > > lspcu: > > # lscpu > Architecture: x86_64 > CPU op-mode(s): 32-bit, 64-bit > Byte Order: Little Endian > CPU(s): 72 > On-line CPU(s) list: 0-71 > Thread(s) per core: 2 > Core(s) per socket: 18 > Socket(s): 2 > NUMA node(s): 2 > Vendor ID: GenuineIntel > CPU family: 6 > Model: 85 > Model name: Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz > Stepping: 4 > CPU MHz: 3204.246 > CPU max MHz: 3700.0000 > CPU min MHz: 1200.0000 > BogoMIPS: 5400.00 > Virtualization: VT-x > L1d cache: 32K > L1i cache: 32K > L2 cache: 1024K > L3 cache: 25344K > NUMA node0 CPU(s): 0-17,36-53 > NUMA node1 CPU(s): 18-35,54-71 > Flags: fpu vme de pse tsc msr pae mce cx8 apic sep > mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht > tm pbe s > yscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts > rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq > dtes64 monitor > ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 > sse4_2 x2apic movbe popcnt tsc_deadline_timer xsave avx f16c rdrand > lahf_lm abm > 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single intel_ppin > mba tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust > bmi1 hle > avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed > adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt > xsavec > xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_lo > > [snip] > 4. This patch removes below code from driver so what this piece of > code does is broken- > > > - if (instance->adapter_type >= INVADER_SERIES && > - !instance->msix_combined) { > - instance->msix_load_balance = true; > - instance->smp_affinity_enable = false; > - } Does this code need to be re-added? Would this have affected your test? Thanks, John
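To make that offset concrete: blk_mq_pci_map_queues() builds blk-mq hardware queue i from the affinity mask of PCI MSI-X vector (i + offset), so with the offset of instance->low_latency_index_start passed in megasas_map_queues(), hw queue i corresponds to MSI-X vector (i + low_latency_index_start). A hedged sketch of the conversion being suggested; megasas_hwq_to_msix() is a hypothetical helper for illustration, not part of the posted patch:

/*
 * Hypothetical helper (illustration only): convert the blk-mq unique
 * tag back to a firmware MSI-x index by adding back the same offset
 * that megasas_map_queues() passed to blk_mq_pci_map_queues().
 */
static u32 megasas_hwq_to_msix(struct megasas_instance *instance, u32 unique_tag)
{
	return blk_mq_unique_tag_to_hwq(unique_tag) +
	       instance->low_latency_index_start;
}

If that is right, setting cmd->request_desc->SCSIIO.MSIxIndex from this value in the build routines would keep the completion on a vector whose affinity mask contains the submitting CPU, which appears to be what Sumit's measurements show is currently missed.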
On 1/13/20 6:42 PM, John Garry wrote: > On 10/01/2020 04:00, Sumit Saxena wrote: >> On Mon, Dec 9, 2019 at 4:32 PM Hannes Reinecke <hare@suse.de> wrote: >>> >>> On 12/9/19 11:10 AM, Sumit Saxena wrote: >>>> On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: >>>>> >>>>> Fusion adapters can steer completions to individual queues, and >>>>> we now have support for shared host-wide tags. >>>>> So we can enable multiqueue support for fusion adapters and >>>>> drop the hand-crafted interrupt affinity settings. >>>> >>>> Hi Hannes, >>>> >>>> Ming Lei also proposed similar changes in megaraid_sas driver some >>>> time back and it had resulted in performance drop- >>>> https://patchwork.kernel.org/patch/10969511/ >>>> >>>> So, we will do some performance tests with this patch and update you. >>>> Thank you. >>> >>> I'm aware of the results of Ming Leis work, but I do hope this patchset >>> performs better. >>> >>> And when you do performance measurements, can you please run with both, >>> 'none' I/O scheduler and 'mq-deadline' I/O scheduler? >>> I've measured quite a performance improvements when using mq-deadline, >>> up to the point where I've gotten on-par performance with the original, >>> non-mq, implementation. >>> (As a data point, on my setup I've measured about 270k IOPS and 1092 >>> MB/s througput, running on just 2 SSDs). >>> asas_build_ldio_fusion >>> But thanks for doing a performance test here. >> >> Hi Hannes, >> >> Sorry for the delay in replying, I observed a few issues with this >> patchset: >> >> 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to >> which IO submitter CPU is affined with. Due to this IO submission and >> completion CPUs are different which causes performance drop for low >> latency workloads. > > Hi Sumit, > > So the new code has: > > megasas_build_ldio_fusion() > { > > cmd->request_desc->SCSIIO.MSIxIndex = > blk_mq_unique_tag_to_hwq(tag); > > } > > So the value here is hw queue index from blk-mq point of view, and not > megaraid_sas msix index, as you alluded to. > > So we get 80 msix, 8 are reserved for low_latency_index_start (that's > how it seems to me), and we report other 72 as #hw queues = 72 to SCSI > midlayer. > > So I think that this should be: > > cmd->request_desc->SCSIIO.MSIxIndex = > blk_mq_unique_tag_to_hwq(tag) + low_latency_index_start; > > Indeed, that sounds reasonable. (The whole queue mapping stuff isn't exactly well documented :-( ) I'll be updating the patch. 
>> >> lspcu: >> >> # lscpu >> Architecture: x86_64 >> CPU op-mode(s): 32-bit, 64-bit >> Byte Order: Little Endian >> CPU(s): 72 >> On-line CPU(s) list: 0-71 >> Thread(s) per core: 2 >> Core(s) per socket: 18 >> Socket(s): 2 >> NUMA node(s): 2 >> Vendor ID: GenuineIntel >> CPU family: 6 >> Model: 85 >> Model name: Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz >> Stepping: 4 >> CPU MHz: 3204.246 >> CPU max MHz: 3700.0000 >> CPU min MHz: 1200.0000 >> BogoMIPS: 5400.00 >> Virtualization: VT-x >> L1d cache: 32K >> L1i cache: 32K >> L2 cache: 1024K >> L3 cache: 25344K >> NUMA node0 CPU(s): 0-17,36-53 >> NUMA node1 CPU(s): 18-35,54-71 >> Flags: fpu vme de pse tsc msr pae mce cx8 apic sep >> mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht >> tm pbe s >> yscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts >> rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq >> dtes64 monitor >> ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 >> sse4_2 x2apic movbe popcnt tsc_deadline_timer xsave avx f16c rdrand >> lahf_lm abm >> 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single intel_ppin >> mba tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust >> bmi1 hle >> avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed >> adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt >> xsavec >> xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_lo >> >> > > [snip] > >> 4. This patch removes below code from driver so what this piece of >> code does is broken- >> >> >> - if (instance->adapter_type >= >> INVADER_SERIES && >> - !instance->msix_combined) { >> - instance->msix_load_balance = >> true; >> - instance->smp_affinity_enable >> = false; >> - } > > Does this code need to be re-added? Would this have affected your test? > Primarily this patch was required to enable interrupt affinity on my machine (Lenovo RAID 930-8i). Can you give me some information why the code is present in the first place? Some hardware limitation, maybe? Cheers, Hannes
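For reference, this is the shape of the MSIxIndex adjustment discussed above, i.e. what the updated patch would presumably carry in megasas_build_ldio_fusion() (and the syspd equivalent). A sketch only, pending Hannes' revision:

	u32 tag = blk_mq_unique_tag(scp->request);

	/* hw queue index from blk-mq, shifted past the non-managed
	 * low-latency vectors, so the completion lands on the reply queue
	 * whose IRQ affinity matches the submitting CPU.
	 */
	cmd->request_desc->SCSIIO.MSIxIndex =
		blk_mq_unique_tag_to_hwq(tag) +
		instance->low_latency_index_start;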
On 10/01/2020 12:09, John Garry wrote: >>>>> >>>>> Fusion adapters can steer completions to individual queues, and >>>>> we now have support for shared host-wide tags. >>>>> So we can enable multiqueue support for fusion adapters and >>>>> drop the hand-crafted interrupt affinity settings. >>>> >>>> Hi Hannes, >>>> >>>> Ming Lei also proposed similar changes in megaraid_sas driver some >>>> time back and it had resulted in performance drop- >>>> https://patchwork.kernel.org/patch/10969511/ >>>> >>>> So, we will do some performance tests with this patch and update you. >>>> >>> >>> Hi Sumit, >>> >>> I was wondering if you had a chance to do this test yet? >>> >>> It would be good to know, so we can try to progress this work. >> >> Looks most of the comment in the following link isn't addressed: >> >> https://lore.kernel.org/linux-block/20191129002540.GA1829@ming.t460p/ > > OK, but I was waiting for results first, which I hoped would not take > too long. They weren't forgotten, for sure. Let me check them now. Hi Ming, I think that your questions here were related to the shared scheduler tags, which was Hannes' proposal (I initially had it in v2 series, but dropped it for v3). I was just content to maintain the concept of shared driver tags. Thanks, John > >> >>> Firstly too much((nr_hw_queues - 1) times) memory is wasted. Secondly IO >>> latency could be increased by too deep scheduler queue depth. Finally >>> CPU >>> could be wasted in the retrying of running busy hw queue. >>> >>> Wrt. driver tags, this patch may be worse, given the average limit for >>> each LUN is reduced by (nr_hw_queues) times, see hctx_may_queue(). >>> >>> Another change is bt_wait_ptr(). Before your patches, there is single >>> .wait_index, now the number of .wait_index is changed to nr_hw_queues. >>> >>> Also the run queue number is increased a lot in SCSI's IO completion, >>> see >>> scsi_end_request(). >> >> I guess memory waste won't be a blocker. > > Yeah, that's a trade-off. And, as I remember, memory waste does not seem > extreme. > >> >> But it may not be one accepted behavior to reduce average active queue >> depth for each LUN by nr_hw_queues times, meantime scheduler queue depth >> is increased by nr_hw_queues times, compared with single queue. >> > > Thanks, > John
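To make the depth argument above concrete, the fair-share limit Ming points at (hctx_may_queue()) behaves roughly like this. This is a simplified illustration, not the verbatim kernel code:

/* Simplified sketch: with a shared tag set, each active queue only gets
 * about tag_depth / active_queues tags (with a small floor), so the
 * effective per-LUN depth shrinks as more queues become active.
 */
static bool may_queue_sketch(unsigned int tag_depth,
			     unsigned int active_queues,
			     unsigned int my_outstanding)
{
	unsigned int fair_share;

	if (!active_queues)
		return true;

	fair_share = (tag_depth + active_queues - 1) / active_queues;
	if (fair_share < 4)
		fair_share = 4;		/* always allow a few tags */

	return my_outstanding < fair_share;
}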
>>> >>> Hi Hannes, >>> >>> Sorry for the delay in replying, I observed a few issues with this >>> patchset: >>> >>> 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to >>> which IO submitter CPU is affined with. Due to this IO submission and >>> completion CPUs are different which causes performance drop for low >>> latency workloads. >> >> Hi Sumit, >> >> So the new code has: >> >> megasas_build_ldio_fusion() >> { >> >> cmd->request_desc->SCSIIO.MSIxIndex = >> blk_mq_unique_tag_to_hwq(tag); >> >> } >> >> So the value here is hw queue index from blk-mq point of view, and not >> megaraid_sas msix index, as you alluded to. >> >> So we get 80 msix, 8 are reserved for low_latency_index_start (that's >> how it seems to me), and we report other 72 as #hw queues = 72 to SCSI >> midlayer. >> >> So I think that this should be: >> >> cmd->request_desc->SCSIIO.MSIxIndex = >> blk_mq_unique_tag_to_hwq(tag) + low_latency_index_start; >> >> > Indeed, that sounds reasonable. > (The whole queue mapping stuff isn't exactly well documented :-( ) > Yeah, there's certainly lots of knobs and levers in this driver. > I'll be updating the patch. About this one: > 2. Seeing below stack traces/messages in dmesg during driver unload – > > [2565601.054366] Call Trace: > [2565601.054368] blk_mq_free_map_and_requests+0x28/0x50 > [2565601.054369] blk_mq_free_tag_set+0x1d/0x90 > [2565601.054370] scsi_host_dev_release+0x8a/0xf0 > [2565601.054370] device_release+0x27/0x80 > [2565601.054371] kobject_cleanup+0x61/0x190 > [2565601.054373] megasas_detach_one+0x4c1/0x650 [megaraid_sas] > [2565601.054374] pci_device_remove+0x3b/0xc0 > [2565601.054375] device_release_driver_internal+0xec/0x1b0 > [2565601.054376] driver_detach+0x46/0x90 > [2565601.054377] bus_remove_driver+0x58/0xd0 > [2565601.054378] pci_unregister_driver+0x26/0xa0 > [2565601.054379] megasas_exit+0x91/0x882 [megaraid_sas] > [2565601.054381] __x64_sys_delete_module+0x16c/0x250 > [2565601.054382] do_syscall_64+0x5b/0x1b0 > [2565601.054383] entry_SYSCALL_64_after_hwframe+0x44/0xa9 > [2565601.054383] RIP: 0033:0x7f7212a82837 > [2565601.054384] RSP: 002b:00007ffdfa2dcea8 EFLAGS: 00000202 ORIG_RAX: > 00000000000000b0 > [2565601.054385] RAX: ffffffffffffffda RBX: 0000000000b6e2e0 RCX: > 00007f7212a82837 > [2565601.054385] RDX: 00007f7212af3ac0 RSI: 0000000000000800 RDI: > 0000000000b6e348 > [2565601.054386] RBP: 0000000000000000 R08: 00007f7212d47060 R09: > 00007f7212af3ac0 > [2565601.054386] R10: 00007ffdfa2dcbc0 R11: 0000000000000202 R12: > 00007ffdfa2dd71c > [2565601.054387] R13: 0000000000000000 R14: 0000000000b6e2e0 R15: > 0000000000b6e010 > [2565601.054387] ---[ end trace 38899303bd85e838 ]--- I see it also for hisi_sas_v3_hw. And so I don't understand the code change here, specifically where the WARN is generated: void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct page *page; int i; if (tags->rqs) { for (i = 0; i < tags->nr_tags; i++) if (WARN_ON(tags->rqs[i])) tags->rqs[i] = NULL; <--- here } I thought that tags->rqs[i] was just a holder for a pointer to a static tag, like assigned here: static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, unsigned int tag, unsigned int op, u64 alloc_time_ns) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; ... rq->tag = tag; rq->internal_tag = -1; data->hctx->tags->rqs[rq->tag] = rq; ... } So I don't know why we need to WARN if unset, and then also clear it. 
The memory is freed pretty soon after this anyway. Thanks, John > >>> >>> lspcu: >>> >>> # lscpu >>> Architecture: x86_64 >>> CPU op-mode(s): 32-bit, 64-bit >>> Byte Order: Little Endian >>> CPU(s): 72 >>> On-line CPU(s) list: 0-71 >>> Thread(s) per core: 2 >>> Core(s) per socket: 18 >>> Socket(s): 2 >>> NUMA node(s): 2 >>> Vendor ID: GenuineIntel >>> CPU family: 6 >>> Model: 85 >>> Model name: Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz >>> Stepping: 4 >>> CPU MHz: 3204.246 >>> CPU max MHz: 3700.0000 >>> CPU min MHz: 1200.0000 >>> BogoMIPS: 5400.00 >>> Virtualization: VT-x >>> L1d cache: 32K >>> L1i cache: 32K >>> L2 cache: 1024K >>> L3 cache: 25344K >>> NUMA node0 CPU(s): 0-17,36-53 >>> NUMA node1 CPU(s): 18-35,54-71 >>> Flags: fpu vme de pse tsc msr pae mce cx8 apic sep >>> mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht >>> tm pbe s >>> yscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts >>> rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq >>> dtes64 monitor >>> ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 >>> sse4_2 x2apic movbe popcnt tsc_deadline_timer xsave avx f16c rdrand >>> lahf_lm abm >>> 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single intel_ppin >>> mba tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust >>> bmi1 hle >>> avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed >>> adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt >>> xsavec >>> xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_lo >>> >>> >> >> [snip] >> >>> 4. This patch removes below code from driver so what this piece of >>> code does is broken- >>> >>> >>> - if (instance->adapter_type >= >>> INVADER_SERIES && >>> - !instance->msix_combined) { >>> - instance->msix_load_balance = >>> true; >>> - instance->smp_affinity_enable >>> = false; >>> - } >> >> Does this code need to be re-added? Would this have affected your test? >> Primarily this patch was required to enable interrupt affinity on my > machine (Lenovo RAID 930-8i). > Can you give me some information why the code is present in the first > place? Some hardware limitation, maybe? > > Cheers, > > Hannes >
On 1/16/20 4:47 PM, John Garry wrote: > >>>> >>>> Hi Hannes, >>>> >>>> Sorry for the delay in replying, I observed a few issues with this >>>> patchset: >>>> >>>> 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to >>>> which IO submitter CPU is affined with. Due to this IO submission and >>>> completion CPUs are different which causes performance drop for low >>>> latency workloads. >>> >>> Hi Sumit, >>> >>> So the new code has: >>> >>> megasas_build_ldio_fusion() >>> { >>> >>> cmd->request_desc->SCSIIO.MSIxIndex = >>> blk_mq_unique_tag_to_hwq(tag); >>> >>> } >>> >>> So the value here is hw queue index from blk-mq point of view, and not >>> megaraid_sas msix index, as you alluded to. >>> >>> So we get 80 msix, 8 are reserved for low_latency_index_start (that's >>> how it seems to me), and we report other 72 as #hw queues = 72 to SCSI >>> midlayer. >>> >>> So I think that this should be: >>> >>> cmd->request_desc->SCSIIO.MSIxIndex = >>> blk_mq_unique_tag_to_hwq(tag) + low_latency_index_start; >>> >>> >> Indeed, that sounds reasonable. >> (The whole queue mapping stuff isn't exactly well documented :-( ) >> > > Yeah, there's certainly lots of knobs and levers in this driver. > >> I'll be updating the patch. > > About this one: > > > 2. Seeing below stack traces/messages in dmesg during driver unload – > > > > [2565601.054366] Call Trace: > > [2565601.054368] blk_mq_free_map_and_requests+0x28/0x50 > > [2565601.054369] blk_mq_free_tag_set+0x1d/0x90 > > [2565601.054370] scsi_host_dev_release+0x8a/0xf0 > > [2565601.054370] device_release+0x27/0x80 > > [2565601.054371] kobject_cleanup+0x61/0x190 > > [2565601.054373] megasas_detach_one+0x4c1/0x650 [megaraid_sas] > > [2565601.054374] pci_device_remove+0x3b/0xc0 > > [2565601.054375] device_release_driver_internal+0xec/0x1b0 > > [2565601.054376] driver_detach+0x46/0x90 > > [2565601.054377] bus_remove_driver+0x58/0xd0 > > [2565601.054378] pci_unregister_driver+0x26/0xa0 > > [2565601.054379] megasas_exit+0x91/0x882 [megaraid_sas] > > [2565601.054381] __x64_sys_delete_module+0x16c/0x250 > > [2565601.054382] do_syscall_64+0x5b/0x1b0 > > [2565601.054383] entry_SYSCALL_64_after_hwframe+0x44/0xa9 > > [2565601.054383] RIP: 0033:0x7f7212a82837 > > [2565601.054384] RSP: 002b:00007ffdfa2dcea8 EFLAGS: 00000202 ORIG_RAX: > > 00000000000000b0 > > [2565601.054385] RAX: ffffffffffffffda RBX: 0000000000b6e2e0 RCX: > > 00007f7212a82837 > > [2565601.054385] RDX: 00007f7212af3ac0 RSI: 0000000000000800 RDI: > > 0000000000b6e348 > > [2565601.054386] RBP: 0000000000000000 R08: 00007f7212d47060 R09: > > 00007f7212af3ac0 > > [2565601.054386] R10: 00007ffdfa2dcbc0 R11: 0000000000000202 R12: > > 00007ffdfa2dd71c > > [2565601.054387] R13: 0000000000000000 R14: 0000000000b6e2e0 R15: > > 0000000000b6e010 > > [2565601.054387] ---[ end trace 38899303bd85e838 ]--- > > > I see it also for hisi_sas_v3_hw. 
> > And so I don't understand the code change here, specifically where the > WARN is generated: > > void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, > unsigned int hctx_idx) > { > struct page *page; > int i; > > if (tags->rqs) { > for (i = 0; i < tags->nr_tags; i++) > if (WARN_ON(tags->rqs[i])) > tags->rqs[i] = NULL; <--- here > } > > > I thought that tags->rqs[i] was just a holder for a pointer to a static > tag, like assigned here: > > static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, > unsigned int tag, unsigned int op, u64 alloc_time_ns) > { > struct blk_mq_tags *tags = blk_mq_tags_from_data(data); > struct request *rq = tags->static_rqs[tag]; > > ... > > rq->tag = tag; > rq->internal_tag = -1; > data->hctx->tags->rqs[rq->tag] = rq; > > ... > } > > So I don't know why we need to WARN if unset, and then also clear it. > The memory is freed pretty soon after this anyway. > Indeed, ->rqs is a holder, referencing an entry in ->static_rqs. Point here is that ->rqs is set when allocating a request, and should be zeroed when freeing the request. And then this above patch would warn us if there's an imbalance, ie an allocated request didn't get freed. But apparently the latter part didn't happen, leaving us with stale entries in ->rqs. Either we fix that, or we drop the WARN_ON. Personally I like clearing of the ->rqs pointer (as then it's easier to track use-after-free issues), but then this might have performance implications, and Jens might have some views about it. So I'm fine with dropping it. Cheers, Hannes
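To put the two options above in concrete terms, the "clearing" variant Hannes prefers would amount to something like the following when a driver tag is released. The function name and placement are illustrative only, not an actual upstream change:

/* Illustrative sketch: drop the tags->rqs[] lookup entry when the driver
 * tag is released, so blk_mq_free_rqs() sees no stale pointers (and its
 * WARN_ON() stays silent) at tag-set teardown.
 */
static inline void example_release_driver_tag(struct blk_mq_tags *tags,
					      struct request *rq)
{
	if (rq->tag < 0)
		return;

	tags->rqs[rq->tag] = NULL;	/* the clearing discussed above */
	rq->tag = -1;
}

The other option, simply dropping the WARN_ON(), needs no sketch.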
On Mon, Jan 13, 2020 at 11:12 PM John Garry <john.garry@huawei.com> wrote: > > On 10/01/2020 04:00, Sumit Saxena wrote: > > On Mon, Dec 9, 2019 at 4:32 PM Hannes Reinecke <hare@suse.de> wrote: > >> > >> On 12/9/19 11:10 AM, Sumit Saxena wrote: > >>> On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: > >>>> > >>>> Fusion adapters can steer completions to individual queues, and > >>>> we now have support for shared host-wide tags. > >>>> So we can enable multiqueue support for fusion adapters and > >>>> drop the hand-crafted interrupt affinity settings. > >>> > >>> Hi Hannes, > >>> > >>> Ming Lei also proposed similar changes in megaraid_sas driver some > >>> time back and it had resulted in performance drop- > >>> https://patchwork.kernel.org/patch/10969511/ > >>> > >>> So, we will do some performance tests with this patch and update you. > >>> Thank you. > >> > >> I'm aware of the results of Ming Leis work, but I do hope this patchset > >> performs better. > >> > >> And when you do performance measurements, can you please run with both, > >> 'none' I/O scheduler and 'mq-deadline' I/O scheduler? > >> I've measured quite a performance improvements when using mq-deadline, > >> up to the point where I've gotten on-par performance with the original, > >> non-mq, implementation. > >> (As a data point, on my setup I've measured about 270k IOPS and 1092 > >> MB/s througput, running on just 2 SSDs). > >>asas_build_ldio_fusion > >> But thanks for doing a performance test here. > > > > Hi Hannes, > > > > Sorry for the delay in replying, I observed a few issues with this patchset: > > > > 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to > > which IO submitter CPU is affined with. Due to this IO submission and > > completion CPUs are different which causes performance drop for low > > latency workloads. > > Hi Sumit, > > So the new code has: > > megasas_build_ldio_fusion() > { > > cmd->request_desc->SCSIIO.MSIxIndex = > blk_mq_unique_tag_to_hwq(tag); > > } > > So the value here is hw queue index from blk-mq point of view, and not > megaraid_sas msix index, as you alluded to. Yes John, value filled in "cmd->request_desc->SCSIIO.MSIxIndex" is HW queue index. > > So we get 80 msix, 8 are reserved for low_latency_index_start (that's > how it seems to me), and we report other 72 as #hw queues = 72 to SCSI > midlayer. > > So I think that this should be: > > cmd->request_desc->SCSIIO.MSIxIndex = > blk_mq_unique_tag_to_hwq(tag) + low_latency_index_start; Agreed, this should return correct HW queue index. 
> > > > > > lspcu: > > > > # lscpu > > Architecture: x86_64 > > CPU op-mode(s): 32-bit, 64-bit > > Byte Order: Little Endian > > CPU(s): 72 > > On-line CPU(s) list: 0-71 > > Thread(s) per core: 2 > > Core(s) per socket: 18 > > Socket(s): 2 > > NUMA node(s): 2 > > Vendor ID: GenuineIntel > > CPU family: 6 > > Model: 85 > > Model name: Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz > > Stepping: 4 > > CPU MHz: 3204.246 > > CPU max MHz: 3700.0000 > > CPU min MHz: 1200.0000 > > BogoMIPS: 5400.00 > > Virtualization: VT-x > > L1d cache: 32K > > L1i cache: 32K > > L2 cache: 1024K > > L3 cache: 25344K > > NUMA node0 CPU(s): 0-17,36-53 > > NUMA node1 CPU(s): 18-35,54-71 > > Flags: fpu vme de pse tsc msr pae mce cx8 apic sep > > mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht > > tm pbe s > > yscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts > > rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq > > dtes64 monitor > > ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 > > sse4_2 x2apic movbe popcnt tsc_deadline_timer xsave avx f16c rdrand > > lahf_lm abm > > 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single intel_ppin > > mba tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust > > bmi1 hle > > avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed > > adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt > > xsavec > > xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_lo > > > > > > [snip] > > > 4. This patch removes below code from driver so what this piece of > > code does is broken- > > > > > > - if (instance->adapter_type >= INVADER_SERIES && > > - !instance->msix_combined) { > > - instance->msix_load_balance = true; > > - instance->smp_affinity_enable = false; > > - } > > Does this code need to be re-added? Would this have affected your test? This code did not affect my test but has to be re-added for affected hardware. There are few megaraid_sas adapters for which "instance->msix_combined" would be "0" and we need this code for those adapters. Thanks, Sumit > > Thanks, > John
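As background to the hw queue index point confirmed above: the block layer's "unique tag" packs the hw queue index into the upper 16 bits and the per-queue tag into the lower 16 bits, so both pieces can be recovered in the build path (helpers paraphrased from include/linux/blk-mq.h as of this series' base):

	u32 unique_tag = blk_mq_unique_tag(scmd->request);
	u16 hwq_index  = blk_mq_unique_tag_to_hwq(unique_tag); /* unique_tag >> 16 */
	u16 queue_tag  = blk_mq_unique_tag_to_tag(unique_tag); /* unique_tag & 0xffff */

This is also why megasas_build_and_issue_cmd_fusion() in the patch switches from scmd->request->tag to blk_mq_unique_tag_to_tag() when indexing its command pool.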
On Tue, Jan 14, 2020 at 12:35 PM Hannes Reinecke <hare@suse.de> wrote: > > On 1/13/20 6:42 PM, John Garry wrote: > > On 10/01/2020 04:00, Sumit Saxena wrote: > >> On Mon, Dec 9, 2019 at 4:32 PM Hannes Reinecke <hare@suse.de> wrote: > >>> > >>> On 12/9/19 11:10 AM, Sumit Saxena wrote: > >>>> On Mon, Dec 2, 2019 at 9:09 PM Hannes Reinecke <hare@suse.de> wrote: > >>>>> > >>>>> Fusion adapters can steer completions to individual queues, and > >>>>> we now have support for shared host-wide tags. > >>>>> So we can enable multiqueue support for fusion adapters and > >>>>> drop the hand-crafted interrupt affinity settings. > >>>> > >>>> Hi Hannes, > >>>> > >>>> Ming Lei also proposed similar changes in megaraid_sas driver some > >>>> time back and it had resulted in performance drop- > >>>> https://patchwork.kernel.org/patch/10969511/ > >>>> > >>>> So, we will do some performance tests with this patch and update you. > >>>> Thank you. > >>> > >>> I'm aware of the results of Ming Leis work, but I do hope this patchset > >>> performs better. > >>> > >>> And when you do performance measurements, can you please run with both, > >>> 'none' I/O scheduler and 'mq-deadline' I/O scheduler? > >>> I've measured quite a performance improvements when using mq-deadline, > >>> up to the point where I've gotten on-par performance with the original, > >>> non-mq, implementation. > >>> (As a data point, on my setup I've measured about 270k IOPS and 1092 > >>> MB/s througput, running on just 2 SSDs). > >>> asas_build_ldio_fusion > >>> But thanks for doing a performance test here. > >> > >> Hi Hannes, > >> > >> Sorry for the delay in replying, I observed a few issues with this > >> patchset: > >> > >> 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to > >> which IO submitter CPU is affined with. Due to this IO submission and > >> completion CPUs are different which causes performance drop for low > >> latency workloads. > > > > Hi Sumit, > > > > So the new code has: > > > > megasas_build_ldio_fusion() > > { > > > > cmd->request_desc->SCSIIO.MSIxIndex = > > blk_mq_unique_tag_to_hwq(tag); > > > > } > > > > So the value here is hw queue index from blk-mq point of view, and not > > megaraid_sas msix index, as you alluded to. > > > > So we get 80 msix, 8 are reserved for low_latency_index_start (that's > > how it seems to me), and we report other 72 as #hw queues = 72 to SCSI > > midlayer. > > > > So I think that this should be: > > > > cmd->request_desc->SCSIIO.MSIxIndex = > > blk_mq_unique_tag_to_hwq(tag) + low_latency_index_start; > > > > > Indeed, that sounds reasonable. > (The whole queue mapping stuff isn't exactly well documented :-( ) > > I'll be updating the patch. 
> > >> > >> lspcu: > >> > >> # lscpu > >> Architecture: x86_64 > >> CPU op-mode(s): 32-bit, 64-bit > >> Byte Order: Little Endian > >> CPU(s): 72 > >> On-line CPU(s) list: 0-71 > >> Thread(s) per core: 2 > >> Core(s) per socket: 18 > >> Socket(s): 2 > >> NUMA node(s): 2 > >> Vendor ID: GenuineIntel > >> CPU family: 6 > >> Model: 85 > >> Model name: Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz > >> Stepping: 4 > >> CPU MHz: 3204.246 > >> CPU max MHz: 3700.0000 > >> CPU min MHz: 1200.0000 > >> BogoMIPS: 5400.00 > >> Virtualization: VT-x > >> L1d cache: 32K > >> L1i cache: 32K > >> L2 cache: 1024K > >> L3 cache: 25344K > >> NUMA node0 CPU(s): 0-17,36-53 > >> NUMA node1 CPU(s): 18-35,54-71 > >> Flags: fpu vme de pse tsc msr pae mce cx8 apic sep > >> mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht > >> tm pbe s > >> yscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts > >> rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq > >> dtes64 monitor > >> ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 > >> sse4_2 x2apic movbe popcnt tsc_deadline_timer xsave avx f16c rdrand > >> lahf_lm abm > >> 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single intel_ppin > >> mba tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust > >> bmi1 hle > >> avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed > >> adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt > >> xsavec > >> xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_lo > >> > >> > > > > [snip] > > > >> 4. This patch removes below code from driver so what this piece of > >> code does is broken- > >> > >> > >> - if (instance->adapter_type >= > >> INVADER_SERIES && > >> - !instance->msix_combined) { > >> - instance->msix_load_balance = > >> true; > >> - instance->smp_affinity_enable > >> = false; > >> - } > > > > Does this code need to be re-added? Would this have affected your test? > > Primarily this patch was required to enable interrupt affinity on my > machine (Lenovo RAID 930-8i). > Can you give me some information why the code is present in the first > place? Some hardware limitation, maybe? Hi Hannes, This code is for specific family of megaraid_sas adapters and Lenovo RAID 930-8i belongs to them. For those adapters, we want to use available HW queues in round robin fashion instead of using interrupt affinity. It helps to improve performance and fixes soft lockups. For interrupt affinity test purpose on Lenovo RAID 930-8i, you can disable this code to use affinity. But in the final patch, this code has to be present. This code was added through below commit: commit 1d15d9098ad12b0021ac5a6b851f26d1ab021e5a Author: Shivasharan S <shivasharan.srikanteshwara@broadcom.com> Date: Tue May 7 10:05:36 2019 -0700 scsi: megaraid_sas: Load balance completions across all MSI-X Driver will use "reply descriptor post queues" in round robin fashion when the combined MSI-X mode is not enabled. With this IO completions are distributed and load balanced across all the available reply descriptor post queues equally. This is enabled only if combined MSI-X mode is not enabled in firmware. This improves performance and also fixes soft lockups. When load balancing is enabled, IRQ affinity from driver needs to be disabled. Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com> Signed-off-by: Shivasharan S <shivasharan.srikanteshwara@broadcom.com> Signed-off-by: Martin K. 
Petersen <martin.petersen@oracle.com> Thanks, Sumit > > Cheers, > > Hannes > -- > Dr. Hannes Reinecke Teamlead Storage & Networking > hare@suse.de +49 911 74053 688 > SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg > HRB 36809 (AG Nürnberg), GF: Felix Imendörffer
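So, per the explanation above, the next revision would presumably need to keep the block the current patch deletes. Shown here again for clarity, as a sketch of what must survive, with exact placement left to the final patch:

	/* Adapters without combined MSI-X rely on round-robin reply-queue
	 * load balancing (msix_load_balance) rather than IRQ affinity,
	 * per commit 1d15d9098ad1 ("scsi: megaraid_sas: Load balance
	 * completions across all MSI-X").
	 */
	if (instance->adapter_type >= INVADER_SERIES &&
	    !instance->msix_combined) {
		instance->msix_load_balance = true;
		instance->smp_affinity_enable = false;
	}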
On 17/01/2020 11:18, Sumit Saxena wrote: >>>> or doing a performance test here. >>> Hi Hannes, >>> Hi Sumit, >>> Sorry for the delay in replying, I observed a few issues with this patchset: >>> >>> 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to >>> which IO submitter CPU is affined with. Due to this IO submission and >>> completion CPUs are different which causes performance drop for low >>> latency workloads. >> Hi Sumit, >> >> So the new code has: >> >> megasas_build_ldio_fusion() >> { >> >> cmd->request_desc->SCSIIO.MSIxIndex = >> blk_mq_unique_tag_to_hwq(tag); >> >> } >> >> So the value here is hw queue index from blk-mq point of view, and not >> megaraid_sas msix index, as you alluded to. > Yes John, value filled in "cmd->request_desc->SCSIIO.MSIxIndex" is HW > queue index. > >> So we get 80 msix, 8 are reserved for low_latency_index_start (that's >> how it seems to me), and we report other 72 as #hw queues = 72 to SCSI >> midlayer. >> >> So I think that this should be: >> >> cmd->request_desc->SCSIIO.MSIxIndex = >> blk_mq_unique_tag_to_hwq(tag) + low_latency_index_start; Can you possibly test performance again with this local change, or would you rather an updated patchset be sent? > Agreed, this should return correct HW queue index. >> Thanks, John
On Thu, Feb 13, 2020 at 3:37 PM John Garry <john.garry@huawei.com> wrote: > > On 17/01/2020 11:18, Sumit Saxena wrote: > >>>> or doing a performance test here. > >>> Hi Hannes, > >>> > > Hi Sumit, > > >>> Sorry for the delay in replying, I observed a few issues with this patchset: > >>> > >>> 1. "blk_mq_unique_tag_to_hwq(tag)" does not return MSI-x vector to > >>> which IO submitter CPU is affined with. Due to this IO submission and > >>> completion CPUs are different which causes performance drop for low > >>> latency workloads. > >> Hi Sumit, > >> > >> So the new code has: > >> > >> megasas_build_ldio_fusion() > >> { > >> > >> cmd->request_desc->SCSIIO.MSIxIndex = > >> blk_mq_unique_tag_to_hwq(tag); > >> > >> } > >> > >> So the value here is hw queue index from blk-mq point of view, and not > >> megaraid_sas msix index, as you alluded to. > > Yes John, value filled in "cmd->request_desc->SCSIIO.MSIxIndex" is HW > > queue index. > > > >> So we get 80 msix, 8 are reserved for low_latency_index_start (that's > >> how it seems to me), and we report other 72 as #hw queues = 72 to SCSI > >> midlayer. > >> > >> So I think that this should be: > >> > >> cmd->request_desc->SCSIIO.MSIxIndex = > >> blk_mq_unique_tag_to_hwq(tag) + low_latency_index_start; > > Can you possibly test performance again with this local change, or would > you rather an updated patchset be sent? Updated patchset is not required. I will do performance run with this change and update. Thanks, Sumit > > > Agreed, this should return correct HW queue index. > >> > > > Thanks, > John
diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h index bd8184072bed..844ea2d6dbb8 100644 --- a/drivers/scsi/megaraid/megaraid_sas.h +++ b/drivers/scsi/megaraid/megaraid_sas.h @@ -2261,7 +2261,6 @@ enum MR_PERF_MODE { struct megasas_instance { - unsigned int *reply_map; __le32 *producer; dma_addr_t producer_h; __le32 *consumer; diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index a4bc81479284..9d0d74e3d491 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -37,6 +37,7 @@ #include <linux/poll.h> #include <linux/vmalloc.h> #include <linux/irq_poll.h> +#include <linux/blk-mq-pci.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> @@ -3106,6 +3107,19 @@ megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, return 0; } +static int megasas_map_queues(struct Scsi_Host *shost) +{ + struct megasas_instance *instance; + + instance = (struct megasas_instance *)shost->hostdata; + + if (!instance->smp_affinity_enable) + return 0; + + return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + instance->pdev, instance->low_latency_index_start); +} + static void megasas_aen_polling(struct work_struct *work); /** @@ -3414,9 +3428,11 @@ static struct scsi_host_template megasas_template = { .eh_timed_out = megasas_reset_timer, .shost_attrs = megaraid_host_attrs, .bios_param = megasas_bios_param, + .map_queues = megasas_map_queues, .change_queue_depth = scsi_change_queue_depth, .max_segment_size = 0xffffffff, .no_write_same = 1, + .host_tagset = 1, }; /** @@ -5695,34 +5711,6 @@ megasas_setup_jbod_map(struct megasas_instance *instance) instance->use_seqnum_jbod_fp = false; } -static void megasas_setup_reply_map(struct megasas_instance *instance) -{ - const struct cpumask *mask; - unsigned int queue, cpu, low_latency_index_start; - - low_latency_index_start = instance->low_latency_index_start; - - for (queue = low_latency_index_start; queue < instance->msix_vectors; queue++) { - mask = pci_irq_get_affinity(instance->pdev, queue); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - instance->reply_map[cpu] = queue; - } - return; - -fallback: - queue = low_latency_index_start; - for_each_possible_cpu(cpu) { - instance->reply_map[cpu] = queue; - if (queue == (instance->msix_vectors - 1)) - queue = low_latency_index_start; - else - queue++; - } -} - /** * megasas_get_device_list - Get the PD and LD device list from FW. * @instance: Adapter soft state @@ -6021,12 +6009,6 @@ static int megasas_init_fw(struct megasas_instance *instance) instance->is_rdpq = (scratch_pad_1 & MR_RDPQ_MODE_OFFSET) ? 
1 : 0; - if (instance->adapter_type >= INVADER_SERIES && - !instance->msix_combined) { - instance->msix_load_balance = true; - instance->smp_affinity_enable = false; - } - /* Save 1-15 reply post index address to local memory * Index 0 is already saved from reg offset * MPI2_REPLY_POST_HOST_INDEX_OFFSET @@ -6145,8 +6127,6 @@ static int megasas_init_fw(struct megasas_instance *instance) goto fail_init_adapter; } - megasas_setup_reply_map(instance); - dev_info(&instance->pdev->dev, "current msix/online cpus\t: (%d/%d)\n", instance->msix_vectors, (unsigned int)num_online_cpus()); @@ -6780,6 +6760,9 @@ static int megasas_io_attach(struct megasas_instance *instance) host->max_id = MEGASAS_MAX_DEV_PER_CHANNEL; host->max_lun = MEGASAS_MAX_LUN; host->max_cmd_len = 16; + if (instance->adapter_type != MFI_SERIES && instance->msix_vectors > 0) + host->nr_hw_queues = instance->msix_vectors - + instance->low_latency_index_start; /* * Notify the mid-layer about the new controller @@ -6947,11 +6930,6 @@ static inline int megasas_alloc_mfi_ctrl_mem(struct megasas_instance *instance) */ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) { - instance->reply_map = kcalloc(nr_cpu_ids, sizeof(unsigned int), - GFP_KERNEL); - if (!instance->reply_map) - return -ENOMEM; - switch (instance->adapter_type) { case MFI_SERIES: if (megasas_alloc_mfi_ctrl_mem(instance)) @@ -6968,8 +6946,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) return 0; fail: - kfree(instance->reply_map); - instance->reply_map = NULL; return -ENOMEM; } @@ -6982,7 +6958,6 @@ static int megasas_alloc_ctrl_mem(struct megasas_instance *instance) */ static inline void megasas_free_ctrl_mem(struct megasas_instance *instance) { - kfree(instance->reply_map); if (instance->adapter_type == MFI_SERIES) { if (instance->producer) dma_free_coherent(&instance->pdev->dev, sizeof(u32), @@ -7645,8 +7620,6 @@ megasas_resume(struct pci_dev *pdev) if (rval < 0) goto fail_reenable_msix; - megasas_setup_reply_map(instance); - if (instance->adapter_type != MFI_SERIES) { megasas_reset_reply_desc(instance); if (megasas_ioc_init_fusion(instance)) { diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index e301458bcbae..bae96b82bb10 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -2731,6 +2731,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance, struct MR_PRIV_DEVICE *mrdev_priv; struct RAID_CONTEXT *rctx; struct RAID_CONTEXT_G35 *rctx_g35; + u32 tag = blk_mq_unique_tag(scp->request); device_id = MEGASAS_DEV_INDEX(scp); @@ -2837,7 +2838,7 @@ megasas_build_ldio_fusion(struct megasas_instance *instance, instance->msix_vectors)); else cmd->request_desc->SCSIIO.MSIxIndex = - instance->reply_map[raw_smp_processor_id()]; + blk_mq_unique_tag_to_hwq(tag); if (instance->adapter_type >= VENTURA_SERIES) { /* FP for Optimal raid level 1. 
@@ -3080,6 +3081,7 @@ megasas_build_syspd_fusion(struct megasas_instance *instance, u16 pd_index = 0; u16 os_timeout_value; u16 timeout_limit; + u32 tag = blk_mq_unique_tag(scmd->request); struct MR_DRV_RAID_MAP_ALL *local_map_ptr; struct RAID_CONTEXT *pRAID_Context; struct MR_PD_CFG_SEQ_NUM_SYNC *pd_sync; @@ -3169,7 +3171,7 @@ megasas_build_syspd_fusion(struct megasas_instance *instance, instance->msix_vectors)); else cmd->request_desc->SCSIIO.MSIxIndex = - instance->reply_map[raw_smp_processor_id()]; + blk_mq_unique_tag_to_hwq(tag); if (!fp_possible) { /* system pd firmware path */ @@ -3373,7 +3375,7 @@ megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance, { struct megasas_cmd_fusion *cmd, *r1_cmd = NULL; union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc; - u32 index; + u32 index, blk_tag, unique_tag; if ((megasas_cmd_type(scmd) == READ_WRITE_LDIO) && instance->ldio_threshold && @@ -3389,7 +3391,9 @@ megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance, return SCSI_MLQUEUE_HOST_BUSY; } - cmd = megasas_get_cmd_fusion(instance, scmd->request->tag); + unique_tag = blk_mq_unique_tag(scmd->request); + blk_tag = blk_mq_unique_tag_to_tag(unique_tag); + cmd = megasas_get_cmd_fusion(instance, blk_tag); if (!cmd) { atomic_dec(&instance->fw_outstanding); @@ -3430,7 +3434,7 @@ megasas_build_and_issue_cmd_fusion(struct megasas_instance *instance, */ if (cmd->r1_alt_dev_handle != MR_DEVHANDLE_INVALID) { r1_cmd = megasas_get_cmd_fusion(instance, - (scmd->request->tag + instance->max_fw_cmds)); + (blk_tag + instance->max_fw_cmds)); megasas_prepare_secondRaid1_IO(instance, cmd, r1_cmd); }
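For readers unfamiliar with the helper used by the new megasas_map_queues() above: its effect is roughly that hw queue i inherits the affinity mask of MSI-X vector (i + offset). A simplified sketch, not the kernel's exact implementation:

	/* Simplified sketch of blk_mq_pci_map_queues(qmap, pdev, offset):
	 * each CPU is mapped to the hw queue whose (offset-shifted) MSI-X
	 * vector affinity mask covers it.
	 */
	for (queue = 0; queue < qmap->nr_queues; queue++) {
		const struct cpumask *mask =
			pci_irq_get_affinity(pdev, queue + offset);
		unsigned int cpu;

		if (!mask)	/* no affinity info: fall back to a flat map */
			break;

		for_each_cpu(cpu, mask)
			qmap->mq_map[cpu] = qmap->queue_offset + queue;
	}

In other words, the per-CPU reply_map[] table the patch deletes is now derived by blk-mq itself from the same pci_irq_get_affinity() information.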