Message ID | 20190625205701.17849-11-saeedm@mellanox.com (mailing list archive) |
---|---
State | Superseded |
Series | [for-next,V2,01/10] linux/dim: Move logic to dim.h |
> +static int ib_poll_dim_handler(struct irq_poll *iop, int budget)
> +{
> +	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
> +	struct dim *dim = cq->dim;
> +	int completed;
> +
> +	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
> +	if (completed < budget) {
> +		irq_poll_complete(&cq->iop);
> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
> +			irq_poll_sched(&cq->iop);
> +	}
> +
> +	rdma_dim(dim, completed);

Why duplicate the entire thing for a one-liner?

> +
> +	return completed;
> +}
> +
>  static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
>  {
>  	irq_poll_sched(&cq->iop);
> @@ -105,14 +157,18 @@ static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
>
>  static void ib_cq_poll_work(struct work_struct *work)
>  {
> -	struct ib_cq *cq = container_of(work, struct ib_cq, work);
> +	struct ib_cq *cq = container_of(work, struct ib_cq,
> +					work);

Why was that changed?

>  	int completed;
>
>  	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
>  				    IB_POLL_BATCH);
> +

newline?

>  	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
>  	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>  		queue_work(cq->comp_wq, &cq->work);
> +	else if (cq->dim)
> +		rdma_dim(cq->dim, completed);
>  }
>
>  static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
> @@ -166,6 +222,8 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
>  	rdma_restrack_set_task(&cq->res, caller);
>  	rdma_restrack_kadd(&cq->res);
>
> +	rdma_dim_init(cq);
> +
>  	switch (cq->poll_ctx) {
>  	case IB_POLL_DIRECT:
>  		cq->comp_handler = ib_cq_completion_direct;
> @@ -173,7 +231,13 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
>  	case IB_POLL_SOFTIRQ:
>  		cq->comp_handler = ib_cq_completion_softirq;
>
> -		irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
> +		if (cq->dim) {
> +			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
> +				      ib_poll_dim_handler);
> +		} else
> +			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
> +				      ib_poll_handler);
> +
> 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
>  		break;
>  	case IB_POLL_WORKQUEUE:
> @@ -226,6 +290,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
>  		WARN_ON_ONCE(1);
>  	}
>
> +	if (cq->dim)
> +		cancel_work_sync(&cq->dim->work);
> +	kfree(cq->dim);
>  	kfree(cq->wc);
>  	rdma_restrack_del(&cq->res);
>  	ret = cq->device->ops.destroy_cq(cq, udata);
> diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
> index abac70ad5c7c..b1b45dbe24a5 100644
> --- a/drivers/infiniband/hw/mlx5/main.c
> +++ b/drivers/infiniband/hw/mlx5/main.c
> @@ -6305,6 +6305,8 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
>  	    MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
>  		mutex_init(&dev->lb.mutex);
>
> +	dev->ib_dev.use_cq_dim = true;
> +

Please don't. This is a bad choice to opt it in by default.
" Please don't. This is a bad choice to opt it in by default." I disagree here. I'd prefer Linux to have good out of the box experience (e.g. reach 100G in 4K NVMeOF on Intel servers) with the default parameters. Especially since Yamin have shown it is beneficial / not hurting in terms of performance for variety of use cases. The whole concept of DIM is that it adapts to the workload requirements in terms of bandwidth and latency. Moreover, net-dim is enabled by default, I don't see why RDMA is different. -----Original Message----- From: linux-rdma-owner@vger.kernel.org <linux-rdma-owner@vger.kernel.org> On Behalf Of Sagi Grimberg Sent: Wednesday, June 26, 2019 12:14 AM To: Saeed Mahameed <saeedm@mellanox.com>; David S. Miller <davem@davemloft.net>; Doug Ledford <dledford@redhat.com>; Jason Gunthorpe <jgg@mellanox.com> Cc: Leon Romanovsky <leonro@mellanox.com>; Or Gerlitz <ogerlitz@mellanox.com>; Tal Gilboa <talgi@mellanox.com>; netdev@vger.kernel.org; linux-rdma@vger.kernel.org; Yamin Friedman <yaminf@mellanox.com>; Max Gurtovoy <maxg@mellanox.com> Subject: Re: [for-next V2 10/10] RDMA/core: Provide RDMA DIM support for ULPs > +static int ib_poll_dim_handler(struct irq_poll *iop, int budget) { > + struct ib_cq *cq = container_of(iop, struct ib_cq, iop); > + struct dim *dim = cq->dim; > + int completed; > + > + completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); > + if (completed < budget) { > + irq_poll_complete(&cq->iop); > + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) > + irq_poll_sched(&cq->iop); > + } > + > + rdma_dim(dim, completed); Why duplicate the entire thing for a one-liner? > + > + return completed; > +} > + > static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) > { > irq_poll_sched(&cq->iop); > @@ -105,14 +157,18 @@ static void ib_cq_completion_softirq(struct > ib_cq *cq, void *private) > > static void ib_cq_poll_work(struct work_struct *work) > { > - struct ib_cq *cq = container_of(work, struct ib_cq, work); > + struct ib_cq *cq = container_of(work, struct ib_cq, > + work); Why was that changed? > int completed; > > completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc, > IB_POLL_BATCH); > + newline? 
On 6/26/2019 12:14 AM, Sagi Grimberg wrote:
>> +static int ib_poll_dim_handler(struct irq_poll *iop, int budget)
>> +{
>> +	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
>> +	struct dim *dim = cq->dim;
>> +	int completed;
>> +
>> +	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
>> +	if (completed < budget) {
>> +		irq_poll_complete(&cq->iop);
>> +		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
>> +			irq_poll_sched(&cq->iop);
>> +	}
>> +
>> +	rdma_dim(dim, completed);
>
> Why duplicate the entire thing for a one-liner?

You are right, this was leftover from a previous version where there
were more significant changes. I will remove the extra function.

>>  static void ib_cq_poll_work(struct work_struct *work)
>>  {
>> -	struct ib_cq *cq = container_of(work, struct ib_cq, work);
>> +	struct ib_cq *cq = container_of(work, struct ib_cq,
>> +					work);
>
> Why was that changed?

I will fix this.

>>  	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
>>  				    IB_POLL_BATCH);
>> +
>
> newline?

Same as above.
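A minimal sketch of where this lands once the duplicate handler is
dropped, assuming cq->dim is simply left NULL whenever DIM is disabled
so that ib_poll_handler only pays for one extra branch. This sketches
the direction agreed above, not the final patch:

static int ib_poll_handler(struct irq_poll *iop, int budget)
{
	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
	struct dim *dim = cq->dim;
	int completed;

	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
	if (completed < budget) {
		irq_poll_complete(&cq->iop);
		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
			irq_poll_sched(&cq->iop);
	}

	/* rdma_dim_init() only allocates cq->dim when the device
	 * opted in, so the common non-DIM path is unchanged. */
	if (dim)
		rdma_dim(dim, completed);

	return completed;
}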
Hey Idan,

> "Please don't. This is a bad choice to opt it in by default."
>
> I disagree here. I'd prefer Linux to have a good out-of-the-box
> experience (e.g. reach 100G in 4K NVMeOF on Intel servers) with the
> default parameters, especially since Yamin has shown it is beneficial
> / not hurting performance for a variety of use cases. The whole
> concept of DIM is that it adapts to the workload requirements in
> terms of bandwidth and latency.

Well, it's a Mellanox device driver after all.

But do note that by far, the vast majority of users are not saturating
100G of 4K I/O. The absolute vast majority of users are primarily
sensitive to synchronous QD=1 I/O latency, and their workloads are much
more dynamic than the synthetic 100%/50%/0% read mix.

As much as I'm a fan (IIRC I was the one giving a first pass at this),
the DIM default opt-in is not only not beneficial, but potentially
harmful to the majority of users' out-of-the-box experience.

Given that this is fresh code with almost no exposure, and that it was
not tested outside of Yamin running limited performance testing, I
think it would be a mistake to add it as a default opt-in; that can
come as an incremental stage.

Obviously, I cannot tell Mellanox what it should or shouldn't do in
its own device driver, but I just wanted to emphasize that I think
this is a mistake.

> Moreover, net-dim is enabled by default, I don't see why RDMA is
> different.

Very different animals.
On Mon, Jul 01, 2019 at 10:36:41PM -0700, Sagi Grimberg wrote:
> Hey Idan,
>
> > "Please don't. This is a bad choice to opt it in by default."
> >
> > I disagree here. I'd prefer Linux to have a good out-of-the-box
> > experience (e.g. reach 100G in 4K NVMeOF on Intel servers) with the
> > default parameters, especially since Yamin has shown it is
> > beneficial / not hurting performance for a variety of use cases.
> > The whole concept of DIM is that it adapts to the workload
> > requirements in terms of bandwidth and latency.
>
> Well, it's a Mellanox device driver after all.
>
> But do note that by far, the vast majority of users are not saturating
> 100G of 4K I/O. The absolute vast majority of users are primarily
> sensitive to synchronous QD=1 I/O latency, and their workloads are
> much more dynamic than the synthetic 100%/50%/0% read mix.
>
> As much as I'm a fan (IIRC I was the one giving a first pass at this),
> the DIM default opt-in is not only not beneficial, but potentially
> harmful to the majority of users' out-of-the-box experience.
>
> Given that this is fresh code with almost no exposure, and that it was
> not tested outside of Yamin running limited performance testing, I
> think it would be a mistake to add it as a default opt-in; that can
> come as an incremental stage.
>
> Obviously, I cannot tell Mellanox what it should or shouldn't do in
> its own device driver, but I just wanted to emphasize that I think
> this is a mistake.

Hi Sagi,

I'm not sharing your worries about a bad out-of-the-box experience,
for a number of reasons.

First, this code is part of the upstream kernel and it will take time
until users actually start to use it as is, and not as part of some
distro backports or MOFED packages.

Second, Yamin did extensive testing and worked very closely with
Or G., and I have very high confidence in the results of their team
work.

Third (an outcome of the first), the opposite is actually true:
setting this option as the default will give us more time to
fix/adjust the code if needed, before users see any potential
degradation.

> > Moreover, net-dim is enabled by default, I don't see why RDMA is
> > different.
>
> Very different animals.

Yes and no, the logic behind them is the same and both solutions have
the same constraints of throughput vs. latency.

Thanks
> Hi Sagi,
>
> I'm not sharing your worries about a bad out-of-the-box experience,
> for a number of reasons.
>
> First, this code is part of the upstream kernel and it will take time
> until users actually start to use it as is, and not as part of some
> distro backports or MOFED packages.

True, but I am still saying that this feature is damaging sync I/O,
which represents the majority of the users. It might not be an extreme
impact, but it is still a degradation (from the very limited testing I
did this morning, I'm seeing a consistent 5%-10% latency increase for
low-QD workloads, which is consistent with what Yamin reported AFAIR).

But having said that, the call is for you guys to make, as this is a
Mellanox device. I absolutely think that this is useful (as I said
before); I just don't think it's necessarily a good idea to opt it in
by default, given that only a limited set of users would take full
advantage of it while the rest would see a negative impact (even if
it's 10%).

I don't have a hard objection here; I just wanted to give you my
opinion on this because mlx5 is an important driver for RDMA users.

> Second, Yamin did extensive testing and worked very closely with
> Or G., and I have very high confidence in the results of their team
> work.

Has anyone tested other RDMA ULPs? NFS/RDMA or SRP/iSER?

It would be interesting to understand how other subsystems with
different characteristics behave with this.
On Wed, Jul 03, 2019 at 11:56:04AM -0700, Sagi Grimberg wrote:
>
> > Hi Sagi,
> >
> > I'm not sharing your worries about a bad out-of-the-box experience,
> > for a number of reasons.
> >
> > First, this code is part of the upstream kernel and it will take
> > time until users actually start to use it as is, and not as part of
> > some distro backports or MOFED packages.
>
> True, but I am still saying that this feature is damaging sync I/O,
> which represents the majority of the users. It might not be an extreme
> impact, but it is still a degradation (from the very limited testing I
> did this morning, I'm seeing a consistent 5%-10% latency increase for
> low-QD workloads, which is consistent with what Yamin reported AFAIR).
>
> But having said that, the call is for you guys to make, as this is a
> Mellanox device. I absolutely think that this is useful (as I said
> before); I just don't think it's necessarily a good idea to opt it in
> by default, given that only a limited set of users would take full
> advantage of it while the rest would see a negative impact (even if
> it's 10%).
>
> I don't have a hard objection here; I just wanted to give you my
> opinion on this because mlx5 is an important driver for RDMA users.

Your opinion is very valuable to us, and we have started an internal
thread to challenge this "enable by default". It just takes time, and
I prefer to enable this code to get test coverage as wide as possible.

> > Second, Yamin did extensive testing and worked very closely with
> > Or G., and I have very high confidence in the results of their team
> > work.
>
> Has anyone tested other RDMA ULPs? NFS/RDMA or SRP/iSER?
>
> It would be interesting to understand how other subsystems with
> different characteristics behave with this.

Me too, and I'll revert this default if needed.

Thanks
The essence of the "dynamic" in DIM is that it adapts to the workload
running on the cores, so that users don't have to trade off
bandwidth/CPU% against latency with a module parameter they don't know
how to configure. If DIM consistently hurts the latency of
latency-critical workloads, we should debug and fix that. This is
where we should go: the end goal is no configuration, with
out-of-the-box performance in terms of both bandwidth/CPU% and
latency.

We could take several steps in this direction if we are not mature
enough today, but let's define them (e.g. tests on different ULPs).
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index a4c81992267c..d8a8c466d897 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -26,6 +26,40 @@
 #define IB_POLL_FLAGS \
 	(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
 
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+	struct dim *dim = container_of(w, struct dim, work);
+	struct ib_cq *cq = (struct ib_cq *)dim->dim_owner;
+
+	u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+	u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+	dim->state = DIM_START_MEASURE;
+
+	cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static void rdma_dim_init(struct ib_cq *cq)
+{
+	struct dim *dim;
+
+	if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
+	    cq->poll_ctx == IB_POLL_DIRECT)
+		return;
+
+	dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
+	if (!dim)
+		return;
+
+	dim->state = DIM_START_MEASURE;
+	dim->tune_state = DIM_GOING_RIGHT;
+	dim->profile_ix = RDMA_DIM_START_PROFILE;
+	dim->dim_owner = cq;
+	cq->dim = dim;
+
+	INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
+}
+
 static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
 			   int batch)
 {
@@ -98,6 +132,24 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
 	return completed;
 }
 
+static int ib_poll_dim_handler(struct irq_poll *iop, int budget)
+{
+	struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+	struct dim *dim = cq->dim;
+	int completed;
+
+	completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
+	if (completed < budget) {
+		irq_poll_complete(&cq->iop);
+		if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+			irq_poll_sched(&cq->iop);
+	}
+
+	rdma_dim(dim, completed);
+
+	return completed;
+}
+
 static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 {
 	irq_poll_sched(&cq->iop);
@@ -105,14 +157,18 @@ static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
 
 static void ib_cq_poll_work(struct work_struct *work)
 {
-	struct ib_cq *cq = container_of(work, struct ib_cq, work);
+	struct ib_cq *cq = container_of(work, struct ib_cq,
+					work);
 	int completed;
 
 	completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
 				    IB_POLL_BATCH);
+
 	if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
 	    ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
 		queue_work(cq->comp_wq, &cq->work);
+	else if (cq->dim)
+		rdma_dim(cq->dim, completed);
 }
 
 static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -166,6 +222,8 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 	rdma_restrack_set_task(&cq->res, caller);
 	rdma_restrack_kadd(&cq->res);
 
+	rdma_dim_init(cq);
+
 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
 		cq->comp_handler = ib_cq_completion_direct;
@@ -173,7 +231,13 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 	case IB_POLL_SOFTIRQ:
 		cq->comp_handler = ib_cq_completion_softirq;
 
-		irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+		if (cq->dim) {
+			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+				      ib_poll_dim_handler);
+		} else
+			irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ,
+				      ib_poll_handler);
+
 		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 		break;
 	case IB_POLL_WORKQUEUE:
@@ -226,6 +290,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 		WARN_ON_ONCE(1);
 	}
 
+	if (cq->dim)
+		cancel_work_sync(&cq->dim->work);
+	kfree(cq->dim);
 	kfree(cq->wc);
 	rdma_restrack_del(&cq->res);
 	ret = cq->device->ops.destroy_cq(cq, udata);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index abac70ad5c7c..b1b45dbe24a5 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6305,6 +6305,8 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 	    MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
 		mutex_init(&dev->lb.mutex);
 
+	dev->ib_dev.use_cq_dim = true;
+
 	return 0;
 }
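To make the modify_cq() step in ib_cq_rdma_dim_work() concrete: each
DIM profile index names one (comps, usec) moderation pair, and the
work item applies whichever pair the state machine has settled on.
The table below is illustrative only; the real values live in the dim
library's rdma_dim_prof, and these numbers and names are invented for
the example (kernel u16 assumed).

/* Illustrative moderation ladder; values invented for the example,
 * the real table is rdma_dim_prof in the dim library. */
struct example_cq_moder {
	u16 usec;	/* max delay before raising a completion event */
	u16 comps;	/* max completions aggregated under one event */
};

static const struct example_cq_moder example_rdma_dim_prof[] = {
	{  1,  1 },	/* leftmost: lowest latency, most interrupts */
	{  8, 16 },
	{ 16, 32 },
	{ 32, 64 },	/* rightmost: heaviest batching, fewest interrupts */
};

With a ladder like this, rdma_dim() walking right trades added
completion latency (up to usec) for fewer interrupts, which is exactly
the QD=1 latency vs. 100G throughput tension discussed in the thread.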