Message ID | 20190109201559.3906-1-yuval.shaia@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | hw/pvrdma: Post CQE when receive invalid gid index | expand |
On 1/9/19 10:15 PM, Yuval Shaia wrote: > This error should propagate back to guest. > > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> > --- > hw/rdma/rdma_backend.h | 1 + > hw/rdma/vmw/pvrdma_qp_ops.c | 6 ++++-- > 2 files changed, 5 insertions(+), 2 deletions(-) > > diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h > index a9ba40ae48..5114c90e67 100644 > --- a/hw/rdma/rdma_backend.h > +++ b/hw/rdma/rdma_backend.h > @@ -32,6 +32,7 @@ > #define VENDOR_ERR_INVLKEY 0x207 > #define VENDOR_ERR_MR_SMALL 0x208 > #define VENDOR_ERR_INV_MAD_BUFF 0x209 > +#define VENDOR_ERR_INV_GID_IDX 0x210 > > /* Add definition for QP0 and QP1 as there is no userspace enums for them */ > enum ibv_special_qp_type { > diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c > index 465bee8641..0565eba981 100644 > --- a/hw/rdma/vmw/pvrdma_qp_ops.c > +++ b/hw/rdma/vmw/pvrdma_qp_ops.c > @@ -178,7 +178,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) > sgid = rdma_rm_get_gid(&dev->rdma_dev_res, wqe->hdr.wr.ud.av.gid_index); > if (!sgid) { > pr_dbg("Fail to get gid for idx %d\n", wqe->hdr.wr.ud.av.gid_index); > - return -EIO; > + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); Informing the guest is OK, but are you sure it makes sense to continue sending the other work requests? Is maybe safer to return with error as before? > + continue; > } > pr_dbg("sgid_id=%d, sgid=0x%llx\n", wqe->hdr.wr.ud.av.gid_index, > sgid->global.interface_id); > @@ -189,7 +190,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) > if (sgid_idx <= 0) { > pr_dbg("Fail to get bk sgid_idx for sgid_idx %d\n", > wqe->hdr.wr.ud.av.gid_index); > - return -EIO; > + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); > + continue; Same question here. Thanks, Marcel > } > > if (wqe->hdr.num_sge > dev->dev_attr.max_sge) {
On Sat, Jan 12, 2019 at 05:11:39PM +0200, Marcel Apfelbaum wrote: > > > On 1/9/19 10:15 PM, Yuval Shaia wrote: > > This error should propagate back to guest. > > > > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> > > --- > > hw/rdma/rdma_backend.h | 1 + > > hw/rdma/vmw/pvrdma_qp_ops.c | 6 ++++-- > > 2 files changed, 5 insertions(+), 2 deletions(-) > > > > diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h > > index a9ba40ae48..5114c90e67 100644 > > --- a/hw/rdma/rdma_backend.h > > +++ b/hw/rdma/rdma_backend.h > > @@ -32,6 +32,7 @@ > > #define VENDOR_ERR_INVLKEY 0x207 > > #define VENDOR_ERR_MR_SMALL 0x208 > > #define VENDOR_ERR_INV_MAD_BUFF 0x209 > > +#define VENDOR_ERR_INV_GID_IDX 0x210 > > /* Add definition for QP0 and QP1 as there is no userspace enums for them */ > > enum ibv_special_qp_type { > > diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c > > index 465bee8641..0565eba981 100644 > > --- a/hw/rdma/vmw/pvrdma_qp_ops.c > > +++ b/hw/rdma/vmw/pvrdma_qp_ops.c > > @@ -178,7 +178,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) > > sgid = rdma_rm_get_gid(&dev->rdma_dev_res, wqe->hdr.wr.ud.av.gid_index); > > if (!sgid) { > > pr_dbg("Fail to get gid for idx %d\n", wqe->hdr.wr.ud.av.gid_index); > > - return -EIO; > > + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); > > Informing the guest is OK, but are you sure it makes sense > to continue sending the other work requests? Yes, it is exactly what is expected. RDMA application (or driver) would usually push several work requests to ring (optimally populating the entire ring) and then ring the doorbell. An error in a specific WR does not means all WRs in ring should not be processed. > Is maybe safer to return with error as before? And what about the remaining WR in the ring? any way the device would have to process them on next doorbell. > > > + continue; > > } > > pr_dbg("sgid_id=%d, sgid=0x%llx\n", wqe->hdr.wr.ud.av.gid_index, > > sgid->global.interface_id); > > @@ -189,7 +190,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) > > if (sgid_idx <= 0) { > > pr_dbg("Fail to get bk sgid_idx for sgid_idx %d\n", > > wqe->hdr.wr.ud.av.gid_index); > > - return -EIO; > > + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); > > + continue; > > Same question here. > > Thanks, > Marcel > > } > > if (wqe->hdr.num_sge > dev->dev_attr.max_sge) { >
On 1/13/19 9:19 PM, Yuval Shaia wrote: > On Sat, Jan 12, 2019 at 05:11:39PM +0200, Marcel Apfelbaum wrote: >> >> On 1/9/19 10:15 PM, Yuval Shaia wrote: >>> This error should propagate back to guest. >>> >>> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> >>> --- >>> hw/rdma/rdma_backend.h | 1 + >>> hw/rdma/vmw/pvrdma_qp_ops.c | 6 ++++-- >>> 2 files changed, 5 insertions(+), 2 deletions(-) >>> >>> diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h >>> index a9ba40ae48..5114c90e67 100644 >>> --- a/hw/rdma/rdma_backend.h >>> +++ b/hw/rdma/rdma_backend.h >>> @@ -32,6 +32,7 @@ >>> #define VENDOR_ERR_INVLKEY 0x207 >>> #define VENDOR_ERR_MR_SMALL 0x208 >>> #define VENDOR_ERR_INV_MAD_BUFF 0x209 >>> +#define VENDOR_ERR_INV_GID_IDX 0x210 >>> /* Add definition for QP0 and QP1 as there is no userspace enums for them */ >>> enum ibv_special_qp_type { >>> diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c >>> index 465bee8641..0565eba981 100644 >>> --- a/hw/rdma/vmw/pvrdma_qp_ops.c >>> +++ b/hw/rdma/vmw/pvrdma_qp_ops.c >>> @@ -178,7 +178,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) >>> sgid = rdma_rm_get_gid(&dev->rdma_dev_res, wqe->hdr.wr.ud.av.gid_index); >>> if (!sgid) { >>> pr_dbg("Fail to get gid for idx %d\n", wqe->hdr.wr.ud.av.gid_index); >>> - return -EIO; >>> + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); >> Informing the guest is OK, but are you sure it makes sense >> to continue sending the other work requests? > Yes, it is exactly what is expected. RDMA application (or driver) would > usually push several work requests to ring (optimally populating the entire > ring) and then ring the doorbell. > An error in a specific WR does not means all WRs in ring should not be > processed. Thanks for the explanation. Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com> Thanks, Marcel >> Is maybe safer to return with error as before? > And what about the remaining WR in the ring? any way the device would have > to process them on next doorbell. > >>> + continue; >>> } >>> pr_dbg("sgid_id=%d, sgid=0x%llx\n", wqe->hdr.wr.ud.av.gid_index, >>> sgid->global.interface_id); >>> @@ -189,7 +190,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) >>> if (sgid_idx <= 0) { >>> pr_dbg("Fail to get bk sgid_idx for sgid_idx %d\n", >>> wqe->hdr.wr.ud.av.gid_index); >>> - return -EIO; >>> + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); >>> + continue; >> Same question here. >> >> Thanks, >> Marcel >>> } >>> if (wqe->hdr.num_sge > dev->dev_attr.max_sge) {
Hi Yuval, On 1/9/19 10:15 PM, Yuval Shaia wrote: > This error should propagate back to guest. > > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> > --- > hw/rdma/rdma_backend.h | 1 + > hw/rdma/vmw/pvrdma_qp_ops.c | 6 ++++-- > 2 files changed, 5 insertions(+), 2 deletions(-) > > diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h > index a9ba40ae48..5114c90e67 100644 > --- a/hw/rdma/rdma_backend.h > +++ b/hw/rdma/rdma_backend.h > @@ -32,6 +32,7 @@ > #define VENDOR_ERR_INVLKEY 0x207 > #define VENDOR_ERR_MR_SMALL 0x208 > #define VENDOR_ERR_INV_MAD_BUFF 0x209 > +#define VENDOR_ERR_INV_GID_IDX 0x210 > > /* Add definition for QP0 and QP1 as there is no userspace enums for them */ > enum ibv_special_qp_type { > diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c > index 465bee8641..0565eba981 100644 > --- a/hw/rdma/vmw/pvrdma_qp_ops.c > +++ b/hw/rdma/vmw/pvrdma_qp_ops.c > @@ -178,7 +178,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) > sgid = rdma_rm_get_gid(&dev->rdma_dev_res, wqe->hdr.wr.ud.av.gid_index); > if (!sgid) { > pr_dbg("Fail to get gid for idx %d\n", wqe->hdr.wr.ud.av.gid_index); > - return -EIO; > + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); There may be a problem here, comp_ctx may be uninitialized at this point. I see that comp_ctx gets initalized after this call: /* Prepare CQE */ comp_ctx = g_malloc(sizeof(CompHandlerCtx)); comp_ctx->dev = dev; What do you think? Thanks, Marcel > + continue; > } > pr_dbg("sgid_id=%d, sgid=0x%llx\n", wqe->hdr.wr.ud.av.gid_index, > sgid->global.interface_id); > @@ -189,7 +190,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) > if (sgid_idx <= 0) { > pr_dbg("Fail to get bk sgid_idx for sgid_idx %d\n", > wqe->hdr.wr.ud.av.gid_index); > - return -EIO; > + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); > + continue; > } > > if (wqe->hdr.num_sge > dev->dev_attr.max_sge) {
On Fri, Jan 18, 2019 at 03:55:36PM +0200, Marcel Apfelbaum wrote: > Hi Yuval, > > On 1/9/19 10:15 PM, Yuval Shaia wrote: > > This error should propagate back to guest. > > > > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> > > --- > > hw/rdma/rdma_backend.h | 1 + > > hw/rdma/vmw/pvrdma_qp_ops.c | 6 ++++-- > > 2 files changed, 5 insertions(+), 2 deletions(-) > > > > diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h > > index a9ba40ae48..5114c90e67 100644 > > --- a/hw/rdma/rdma_backend.h > > +++ b/hw/rdma/rdma_backend.h > > @@ -32,6 +32,7 @@ > > #define VENDOR_ERR_INVLKEY 0x207 > > #define VENDOR_ERR_MR_SMALL 0x208 > > #define VENDOR_ERR_INV_MAD_BUFF 0x209 > > +#define VENDOR_ERR_INV_GID_IDX 0x210 > > /* Add definition for QP0 and QP1 as there is no userspace enums for them */ > > enum ibv_special_qp_type { > > diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c > > index 465bee8641..0565eba981 100644 > > --- a/hw/rdma/vmw/pvrdma_qp_ops.c > > +++ b/hw/rdma/vmw/pvrdma_qp_ops.c > > @@ -178,7 +178,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) > > sgid = rdma_rm_get_gid(&dev->rdma_dev_res, wqe->hdr.wr.ud.av.gid_index); > > if (!sgid) { > > pr_dbg("Fail to get gid for idx %d\n", wqe->hdr.wr.ud.av.gid_index); > > - return -EIO; > > + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); > > There may be a problem here, comp_ctx may be uninitialized at this point. > I see that comp_ctx gets initalized after this call: > > /* Prepare CQE */ > comp_ctx = g_malloc(sizeof(CompHandlerCtx)); > comp_ctx->dev = dev; > > > What do you think? > > Thanks, > Marcel Applying this patch on top of upstream make sense. Problem start because you probably applied Li Qiang's patch "[PATCH v2] hw: pvrdma: fix memory leak in error path" which moves the initialization of comp_ctx to be right after all checks are done. More than that, accepting this patch makes Li Qiang's patch redundant since the cleanup of comp_ctx will be taken care by the usual process of 'completion' (see the callback function pvrdma_qp_ops_comp_handler). Li Qiang, can you confirm? Yuval > > > + continue; > > } > > pr_dbg("sgid_id=%d, sgid=0x%llx\n", wqe->hdr.wr.ud.av.gid_index, > > sgid->global.interface_id); > > @@ -189,7 +190,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) > > if (sgid_idx <= 0) { > > pr_dbg("Fail to get bk sgid_idx for sgid_idx %d\n", > > wqe->hdr.wr.ud.av.gid_index); > > - return -EIO; > > + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); > > + continue; > > } > > if (wqe->hdr.num_sge > dev->dev_attr.max_sge) { >
On 1/18/19 6:23 PM, Yuval Shaia wrote: > On Fri, Jan 18, 2019 at 03:55:36PM +0200, Marcel Apfelbaum wrote: >> Hi Yuval, >> >> On 1/9/19 10:15 PM, Yuval Shaia wrote: >>> This error should propagate back to guest. >>> >>> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> >>> --- >>> hw/rdma/rdma_backend.h | 1 + >>> hw/rdma/vmw/pvrdma_qp_ops.c | 6 ++++-- >>> 2 files changed, 5 insertions(+), 2 deletions(-) >>> >>> diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h >>> index a9ba40ae48..5114c90e67 100644 >>> --- a/hw/rdma/rdma_backend.h >>> +++ b/hw/rdma/rdma_backend.h >>> @@ -32,6 +32,7 @@ >>> #define VENDOR_ERR_INVLKEY 0x207 >>> #define VENDOR_ERR_MR_SMALL 0x208 >>> #define VENDOR_ERR_INV_MAD_BUFF 0x209 >>> +#define VENDOR_ERR_INV_GID_IDX 0x210 >>> /* Add definition for QP0 and QP1 as there is no userspace enums for them */ >>> enum ibv_special_qp_type { >>> diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c >>> index 465bee8641..0565eba981 100644 >>> --- a/hw/rdma/vmw/pvrdma_qp_ops.c >>> +++ b/hw/rdma/vmw/pvrdma_qp_ops.c >>> @@ -178,7 +178,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) >>> sgid = rdma_rm_get_gid(&dev->rdma_dev_res, wqe->hdr.wr.ud.av.gid_index); >>> if (!sgid) { >>> pr_dbg("Fail to get gid for idx %d\n", wqe->hdr.wr.ud.av.gid_index); >>> - return -EIO; >>> + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); >> There may be a problem here, comp_ctx may be uninitialized at this point. >> I see that comp_ctx gets initalized after this call: >> >> /* Prepare CQE */ >> comp_ctx = g_malloc(sizeof(CompHandlerCtx)); >> comp_ctx->dev = dev; >> >> >> What do you think? >> >> Thanks, >> Marcel > Applying this patch on top of upstream make sense. > Problem start because you probably applied Li Qiang's patch "[PATCH v2] hw: > pvrdma: fix memory leak in error path" which moves the initialization of > comp_ctx to be right after all checks are done. > > More than that, accepting this patch makes Li Qiang's patch redundant since > the cleanup of comp_ctx will be taken care by the usual process of > 'completion' (see the callback function pvrdma_qp_ops_comp_handler). > Li Qiang, can you confirm? Sounds right to me. I'll add Reported-by tag and keep the coverity info. Thanks, Marcel > Yuval > >>> + continue; >>> } >>> pr_dbg("sgid_id=%d, sgid=0x%llx\n", wqe->hdr.wr.ud.av.gid_index, >>> sgid->global.interface_id); >>> @@ -189,7 +190,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) >>> if (sgid_idx <= 0) { >>> pr_dbg("Fail to get bk sgid_idx for sgid_idx %d\n", >>> wqe->hdr.wr.ud.av.gid_index); >>> - return -EIO; >>> + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); >>> + continue; >>> } >>> if (wqe->hdr.num_sge > dev->dev_attr.max_sge) {
diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h index a9ba40ae48..5114c90e67 100644 --- a/hw/rdma/rdma_backend.h +++ b/hw/rdma/rdma_backend.h @@ -32,6 +32,7 @@ #define VENDOR_ERR_INVLKEY 0x207 #define VENDOR_ERR_MR_SMALL 0x208 #define VENDOR_ERR_INV_MAD_BUFF 0x209 +#define VENDOR_ERR_INV_GID_IDX 0x210 /* Add definition for QP0 and QP1 as there is no userspace enums for them */ enum ibv_special_qp_type { diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c index 465bee8641..0565eba981 100644 --- a/hw/rdma/vmw/pvrdma_qp_ops.c +++ b/hw/rdma/vmw/pvrdma_qp_ops.c @@ -178,7 +178,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) sgid = rdma_rm_get_gid(&dev->rdma_dev_res, wqe->hdr.wr.ud.av.gid_index); if (!sgid) { pr_dbg("Fail to get gid for idx %d\n", wqe->hdr.wr.ud.av.gid_index); - return -EIO; + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); + continue; } pr_dbg("sgid_id=%d, sgid=0x%llx\n", wqe->hdr.wr.ud.av.gid_index, sgid->global.interface_id); @@ -189,7 +190,8 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle) if (sgid_idx <= 0) { pr_dbg("Fail to get bk sgid_idx for sgid_idx %d\n", wqe->hdr.wr.ud.av.gid_index); - return -EIO; + complete_with_error(VENDOR_ERR_INV_GID_IDX, comp_ctx); + continue; } if (wqe->hdr.num_sge > dev->dev_attr.max_sge) {
This error should propagate back to guest. Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com> --- hw/rdma/rdma_backend.h | 1 + hw/rdma/vmw/pvrdma_qp_ops.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-)