@@ -1278,6 +1278,18 @@ static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg)
spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
}
+static void mlx4_ib_wire_comp_handler(struct ib_cq *cq, void *arg)
+{ /* CQ completion callback: defers the work by queueing ctx->work on ctx->wi_wq; 'arg' is unused. */
+ unsigned long flags;
+ struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags); /* is_going_down is tested while holding this lock */
+ if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) /* queue nothing when the device is going down or the context is not active */
+ queue_work(ctx->wi_wq, &ctx->work);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
struct mlx4_ib_demux_pv_qp *tun_qp,
int index)
@@ -1986,7 +1998,8 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
cq_size *= 2;
cq_attr.cqe = cq_size;
- ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler,
+ ctx->cq = ib_create_cq(ctx->ib_dev,
+ create_tun ? mlx4_ib_tunnel_comp_handler : mlx4_ib_wire_comp_handler,
NULL, ctx, &cq_attr);
if (IS_ERR(ctx->cq)) {
ret = PTR_ERR(ctx->cq);
@@ -2023,6 +2036,7 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker);
ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq;
+ ctx->wi_wq = to_mdev(ibdev)->sriov.demux[port - 1].wi_wq;
ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
if (ret) {
@@ -2166,7 +2180,7 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
goto err_mcg;
}
- snprintf(name, sizeof name, "mlx4_ibt%d", port);
+ snprintf(name, sizeof(name), "mlx4_ibt%d", port);
ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
if (!ctx->wq) {
pr_err("Failed to create tunnelling WQ for port %d\n", port);
@@ -2174,7 +2188,15 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
goto err_wq;
}
- snprintf(name, sizeof name, "mlx4_ibud%d", port);
+ snprintf(name, sizeof(name), "mlx4_ibwi%d", port);
+ ctx->wi_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+ if (!ctx->wi_wq) {
+ pr_err("Failed to create wire WQ for port %d\n", port);
+ ret = -ENOMEM;
+ goto err_wiwq;
+ }
+
+ snprintf(name, sizeof(name), "mlx4_ibud%d", port);
ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
if (!ctx->ud_wq) {
pr_err("Failed to create up/down WQ for port %d\n", port);
@@ -2185,6 +2207,10 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
return 0;
err_udwq:
+ destroy_workqueue(ctx->wi_wq);
+ ctx->wi_wq = NULL;
+
+err_wiwq:
destroy_workqueue(ctx->wq);
ctx->wq = NULL;
@@ -2232,12 +2258,14 @@ static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx)
ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING;
}
flush_workqueue(ctx->wq);
+ flush_workqueue(ctx->wi_wq);
for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0);
free_pv_object(dev, i, ctx->port);
}
kfree(ctx->tun);
destroy_workqueue(ctx->ud_wq);
+ destroy_workqueue(ctx->wi_wq);
destroy_workqueue(ctx->wq);
}
}
@@ -455,6 +455,7 @@ struct mlx4_ib_demux_pv_ctx {
struct ib_pd *pd;
struct work_struct work;
struct workqueue_struct *wq;
+ struct workqueue_struct *wi_wq;
struct mlx4_ib_demux_pv_qp qp[2];
};
@@ -462,6 +463,7 @@ struct mlx4_ib_demux_ctx {
struct ib_device *ib_dev;
int port;
struct workqueue_struct *wq;
+ struct workqueue_struct *wi_wq;
struct workqueue_struct *ud_wq;
spinlock_t ud_lock;
atomic64_t subnet_prefix;
The mlx4 driver will proxy MAD packets through the PF driver. A VM or an instantiated VF will send its MAD packets to the PF driver using loop-back. The PF driver will be informed by an interrupt, but defer the handling and polling of CQEs to a worker thread running on an ordered work-queue. Consider the following scenario: the VMs will, in close proximity in time, for example due to a network event, send many MAD packets to the PF driver. Let's say there are K VMs, each sending N packets. The interrupt from the first VM will start the worker thread, which will poll N CQEs. A common case here is where the PF driver will multiplex the packets received from the VMs out on the wire QP. But before the wire QP has returned a send CQE and associated interrupt, the other K - 1 VMs have sent their N packets. The PF driver will have to multiplex K * N packets out on the wire QP. But the send-queue on the wire QP has a finite capacity. So, in this scenario, if K * N is larger than the send-queue capacity of the wire QP, we will get MAD packets dropped on the floor with this dynamic debug message: mlx4_ib_multiplex_mad: failed sending GSI to wire on behalf of slave 2 (-11) and this despite the fact that the wire send-queue could have capacity, but the PF driver isn't aware, because the wire send CQEs have not yet been polled. We can also have a similar scenario inbound, with a wire recv-queue larger than the tunnel QP's recv queue. If many remote peers send MAD packets to the very same VM, the tunnel send-queue destined to the VM could overflow. This starvation is fixed by introducing separate work queues for the wire QPs vs. the tunnel QPs. With this fix, using a dual ported HCA, 8 VFs instantiated, we could run cmtime on each of the 18 interfaces towards a similarly configured peer, each cmtime instance with 800 QPs (all in all 14400 QPs) without a single CM packet getting lost. 
Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com> --- drivers/infiniband/hw/mlx4/mad.c | 34 +++++++++++++++++++++++++--- drivers/infiniband/hw/mlx4/mlx4_ib.h | 2 ++ 2 files changed, 33 insertions(+), 3 deletions(-)