Message ID | 20200414111720.1789168-1-haakon.bugge@oracle.com (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | [for-rc] RDMA/cm: Do not send REJ when remote_id is unknown | expand |
On Tue, Apr 14, 2020 at 01:17:20PM +0200, Håkon Bugge wrote: > In cm_destroy_id(), when the cm_id state is IB_CM_REQ_SENT or > IB_CM_MRA_REQ_RCVD, an attempt is made to send a REJ with > IB_CM_REJ_TIMEOUT as the reject reason. Yes, this causes the remote to destroy the half completed connection, for instance if the path is unidirectional. > However, in said states, we have no remote_id. For the REQ_SENT case, > we simply haven't received anything from our peer, Which is correct, the spec covers this in Table 108 which describes the remote communication ID as '0 if REJecting due to REP timeout and no MRA was received' This is implemented in cm_acquire_rejected_id(), assuming it isn't buggy. > for the MRA_REQ_RCVD case, the cm_rma_handler() doesn't pick up the > remote_id. This seems like a bug. It would be appropriate to store the remote id when getting a MRA and set it in cm_format_rej() if a MRA has been rx'd It also seems like a bug that cm_acquire_rejected_id() does not check the remote_comm_id if it is not zero. And for this reason it also seems unwise that cm_alloc_id_priv() will allocate 0 cm_id's, as that value appears to have special meaning, oh and it is unwise to use 0 with cm_acquire_id(). Tsk. The CM_MSG_RESPONSE_REQ path looks kind of wrong too.. > Therefore, it is no reason to send this REJ, since it simply will be > tossed at the peer's CM layer (if it reaches it). If running in CX-3 > virtualized and having the pr_debug enabled in the mlx4_ib driver, we > will see: > > mlx4_ib_demux_cm_handler: Couldn't find an entry for pv_cm_id 0x0 This seems to be a bug in mlx4. The pv layer should be duplicating how cm_acquire_rejected_id() works Something like this for the cm parts - what do you think? diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 4794113ecd596c..fb384bf60b6f02 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -592,20 +592,35 @@ static void cm_free_id(__be32 local_id) xa_erase_irq(&cm.local_id_table, cm_local_id(local_id)); } -static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) +/* + * If we know the message is related to a REQ then there is no remote_id set, so + * it should not be checked. The state should be in IB_CM_REQ_SENT, + * IB_CM_SIDR_REQ_SENT or IB_CM_MRA_REQ_RCVD and the caller should check this. + */ +static struct cm_id_private *cm_acquire_req(__be32 local_id) { struct cm_id_private *cm_id_priv; rcu_read_lock(); cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id)); - if (!cm_id_priv || cm_id_priv->id.remote_id != remote_id || - !refcount_inc_not_zero(&cm_id_priv->refcount)) + if (!cm_id_priv || !refcount_inc_not_zero(&cm_id_priv->refcount)) cm_id_priv = NULL; rcu_read_unlock(); return cm_id_priv; } +static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) +{ + struct cm_id_private *cm_id_priv = cm_acquire_req(local_id); + + if (READ_ONCE(cm_id_priv->id.remote_id) != remote_id) { + cm_deref_id(cm_id_priv); + return NULL; + } + return cm_id_priv; +} + /* * Trivial helpers to strip endian annotation and compare; the * endianness doesn't actually matter since we just need a stable @@ -1856,6 +1871,10 @@ static void cm_format_rej(struct cm_rej_msg *rej_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REP); break; + case IB_CM_MRA_REQ_RCVD: + IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg, + be32_to_cpu(cm_id_priv->id.remote_id)); + fallthrough; default: IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(cm_id_priv->id.local_id)); @@ -2409,8 +2428,8 @@ static int cm_rep_handler(struct cm_work *work) struct cm_timewait_info *timewait_info; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id( - cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); + cm_id_priv = cm_acquire_req( + cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg))); if (!cm_id_priv) { cm_dup_rep_handler(work); pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, @@ -2991,7 +3010,8 @@ static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) remote_id = cpu_to_be32(IBA_GET(CM_REJ_LOCAL_COMM_ID, rej_msg)); - if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT) { + if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT && + IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg) == 0) { spin_lock_irq(&cm.lock); timewait_info = cm_find_remote_id( *((__be64 *)IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg)), @@ -3005,9 +3025,8 @@ static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) spin_unlock_irq(&cm.lock); } else if (IBA_GET(CM_REJ_MESSAGE_REJECTED, rej_msg) == CM_MSG_RESPONSE_REQ) - cm_id_priv = cm_acquire_id( - cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), - 0); + cm_id_priv = cm_acquire_req( + cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg))); else cm_id_priv = cm_acquire_id( cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), @@ -3171,9 +3190,8 @@ static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) { switch (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg)) { case CM_MSG_RESPONSE_REQ: - return cm_acquire_id( - cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)), - 0); + return cm_acquire_req( + cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg))); case CM_MSG_RESPONSE_REP: case CM_MSG_RESPONSE_OTHER: return cm_acquire_id( @@ -3211,6 +3229,8 @@ static int cm_mra_handler(struct cm_work *work) cm_id_priv->msg, timeout)) goto out; cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; + cm_id_priv->id.remote_id = + IBA_GET(CM_MRA_LOCAL_COMM_ID, mra_msg); break; case IB_CM_REP_SENT: if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != @@ -3769,8 +3789,8 @@ static int cm_sidr_rep_handler(struct cm_work *work) sidr_rep_msg = (struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad; - cm_id_priv = cm_acquire_id( - cpu_to_be32(IBA_GET(CM_SIDR_REP_REQUESTID, sidr_rep_msg)), 0); + cm_id_priv = cm_acquire_req( + cpu_to_be32(IBA_GET(CM_SIDR_REP_REQUESTID, sidr_rep_msg))); if (!cm_id_priv) return -EINVAL; /* Unmatched reply. */
> On 14 Apr 2020, at 22:40, Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Tue, Apr 14, 2020 at 01:17:20PM +0200, Håkon Bugge wrote: >> In cm_destroy_id(), when the cm_id state is IB_CM_REQ_SENT or >> IB_CM_MRA_REQ_RCVD, an attempt is made to send a REJ with >> IB_CM_REJ_TIMEOUT as the reject reason. > > Yes, this causes the remote to destroy the half completed connection, > for instance if the path is unidirectional. > >> However, in said states, we have no remote_id. For the REQ_SENT case, >> we simply haven't received anything from our peer, > > Which is correct, the spec covers this in Table 108 which describes > the remote communication ID as '0 if REJecting due to REP timeout and > no MRA was received' Yes, the spec has the phrase for a REJ due toa timeout: "The recipient of a REJ message with this reason code must use this CA GUID to identify the sender, as it is possible that the Remote Communication ID in the REJ message may not be valid." [1] > This is implemented in cm_acquire_rejected_id(), assuming it isn't > buggy. > >> for the MRA_REQ_RCVD case, the cm_rma_handler() doesn't pick up the >> remote_id. > > This seems like a bug. It would be appropriate to store the remote id > when getting a MRA and set it in cm_format_rej() if a MRA has been rx'd > > It also seems like a bug that cm_acquire_rejected_id() does not check > the remote_comm_id if it is not zero. > > And for this reason it also seems unwise that cm_alloc_id_priv() will > allocate 0 cm_id's, as that value appears to have special meaning, oh > and it is unwise to use 0 with cm_acquire_id(). Tsk. > > The CM_MSG_RESPONSE_REQ path looks kind of wrong too.. > >> Therefore, it is no reason to send this REJ, since it simply will be >> tossed at the peer's CM layer (if it reaches it). If running in CX-3 >> virtualized and having the pr_debug enabled in the mlx4_ib driver, we >> will see: >> >> mlx4_ib_demux_cm_handler: Couldn't find an entry for pv_cm_id 0x0 > > This seems to be a bug in mlx4. The pv layer should be duplicating how > cm_acquire_rejected_id() works Looks like this pv layer missed (as I did ;-)) [1] above. > Something like this for the cm parts - what do you think? > > diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c > index 4794113ecd596c..fb384bf60b6f02 100644 > --- a/drivers/infiniband/core/cm.c > +++ b/drivers/infiniband/core/cm.c > @@ -592,20 +592,35 @@ static void cm_free_id(__be32 local_id) > xa_erase_irq(&cm.local_id_table, cm_local_id(local_id)); > } > > -static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) > +/* > + * If we know the message is related to a REQ then there is no remote_id set, so > + * it should not be checked. The state should be in IB_CM_REQ_SENT, > + * IB_CM_SIDR_REQ_SENT or IB_CM_MRA_REQ_RCVD and the caller should check this. > + */ > +static struct cm_id_private *cm_acquire_req(__be32 local_id) > { > struct cm_id_private *cm_id_priv; > > rcu_read_lock(); > cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id)); > - if (!cm_id_priv || cm_id_priv->id.remote_id != remote_id || > - !refcount_inc_not_zero(&cm_id_priv->refcount)) > + if (!cm_id_priv || !refcount_inc_not_zero(&cm_id_priv->refcount)) > cm_id_priv = NULL; > rcu_read_unlock(); > > return cm_id_priv; > } > > +static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) > +{ > + struct cm_id_private *cm_id_priv = cm_acquire_req(local_id); > + > + if (READ_ONCE(cm_id_priv->id.remote_id) != remote_id) { Hmm, what if cm_id_priv is NULL? The rest looks good to me, but I would like to backport it to the version I am familiar with and test it. Thxs, Håkon > + cm_deref_id(cm_id_priv); > + return NULL; > + } > + return cm_id_priv; > +} > + > /* > * Trivial helpers to strip endian annotation and compare; the > * endianness doesn't actually matter since we just need a stable > @@ -1856,6 +1871,10 @@ static void cm_format_rej(struct cm_rej_msg *rej_msg, > be32_to_cpu(cm_id_priv->id.local_id)); > IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REP); > break; > + case IB_CM_MRA_REQ_RCVD: > + IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg, > + be32_to_cpu(cm_id_priv->id.remote_id)); > + fallthrough; > default: > IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, > be32_to_cpu(cm_id_priv->id.local_id)); > @@ -2409,8 +2428,8 @@ static int cm_rep_handler(struct cm_work *work) > struct cm_timewait_info *timewait_info; > > rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; > - cm_id_priv = cm_acquire_id( > - cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0); > + cm_id_priv = cm_acquire_req( > + cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg))); > if (!cm_id_priv) { > cm_dup_rep_handler(work); > pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, > @@ -2991,7 +3010,8 @@ static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) > > remote_id = cpu_to_be32(IBA_GET(CM_REJ_LOCAL_COMM_ID, rej_msg)); > > - if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT) { > + if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT && > + IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg) == 0) { > spin_lock_irq(&cm.lock); > timewait_info = cm_find_remote_id( > *((__be64 *)IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg)), > @@ -3005,9 +3025,8 @@ static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) > spin_unlock_irq(&cm.lock); > } else if (IBA_GET(CM_REJ_MESSAGE_REJECTED, rej_msg) == > CM_MSG_RESPONSE_REQ) > - cm_id_priv = cm_acquire_id( > - cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), > - 0); > + cm_id_priv = cm_acquire_req( > + cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg))); > else > cm_id_priv = cm_acquire_id( > cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)), > @@ -3171,9 +3190,8 @@ static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg) > { > switch (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg)) { > case CM_MSG_RESPONSE_REQ: > - return cm_acquire_id( > - cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)), > - 0); > + return cm_acquire_req( > + cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg))); > case CM_MSG_RESPONSE_REP: > case CM_MSG_RESPONSE_OTHER: > return cm_acquire_id( > @@ -3211,6 +3229,8 @@ static int cm_mra_handler(struct cm_work *work) > cm_id_priv->msg, timeout)) > goto out; > cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD; > + cm_id_priv->id.remote_id = > + IBA_GET(CM_MRA_LOCAL_COMM_ID, mra_msg); > break; > case IB_CM_REP_SENT: > if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) != > @@ -3769,8 +3789,8 @@ static int cm_sidr_rep_handler(struct cm_work *work) > > sidr_rep_msg = (struct cm_sidr_rep_msg *) > work->mad_recv_wc->recv_buf.mad; > - cm_id_priv = cm_acquire_id( > - cpu_to_be32(IBA_GET(CM_SIDR_REP_REQUESTID, sidr_rep_msg)), 0); > + cm_id_priv = cm_acquire_req( > + cpu_to_be32(IBA_GET(CM_SIDR_REP_REQUESTID, sidr_rep_msg))); > if (!cm_id_priv) > return -EINVAL; /* Unmatched reply. */ >
On Thu, Apr 16, 2020 at 03:24:15PM +0200, Håkon Bugge wrote: > > > > On 14 Apr 2020, at 22:40, Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > On Tue, Apr 14, 2020 at 01:17:20PM +0200, Håkon Bugge wrote: > >> In cm_destroy_id(), when the cm_id state is IB_CM_REQ_SENT or > >> IB_CM_MRA_REQ_RCVD, an attempt is made to send a REJ with > >> IB_CM_REJ_TIMEOUT as the reject reason. > > > > Yes, this causes the remote to destroy the half completed connection, > > for instance if the path is unidirectional. > > > >> However, in said states, we have no remote_id. For the REQ_SENT case, > >> we simply haven't received anything from our peer, > > > > Which is correct, the spec covers this in Table 108 which describes > > the remote communication ID as '0 if REJecting due to REP timeout and > > no MRA was received' > > Yes, the spec has the phrase for a REJ due toa timeout: "The > recipient of a REJ message with this reason code must use this CA > GUID to identify the sender, as it is possible that the Remote > Communication ID in the REJ message may not be valid." [1] It is a bit confusing, should we use the remote_id if not 0 after MRA? Seems like a reasonable thing to do... > > +static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) > > +{ > > + struct cm_id_private *cm_id_priv = cm_acquire_req(local_id); > > + > > + if (READ_ONCE(cm_id_priv->id.remote_id) != remote_id) { > > Hmm, what if cm_id_priv is NULL? Right, it should return NULL, woops > The rest looks good to me, but I would like to backport it to the > version I am familiar with and test it. You might need a lot of backporting at this point :) Jason
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 4794113ecd59..ed80e5b56e2b 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1058,10 +1058,6 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT, - &cm_id_priv->id.device->node_guid, - sizeof(cm_id_priv->id.device->node_guid), - NULL, 0); break; case IB_CM_REQ_RCVD: if (err == -ENOMEM) {
In cm_destroy_id(), when the cm_id state is IB_CM_REQ_SENT or IB_CM_MRA_REQ_RCVD, an attempt is made to send a REJ with IB_CM_REJ_TIMEOUT as the reject reason. However, in said states, we have no remote_id. For the REQ_SENT case, we simply haven't received anything from our peer, for the MRA_REQ_RCVD case, the cm_rma_handler() doesn't pick up the remote_id. Therefore, it is no reason to send this REJ, since it simply will be tossed at the peer's CM layer (if it reaches it). If running in CX-3 virtualized and having the pr_debug enabled in the mlx4_ib driver, we will see: mlx4_ib_demux_cm_handler: Couldn't find an entry for pv_cm_id 0x0 because the proxy de-muxer is unable to determine which slave this MAD packet should be sent to. Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com> --- drivers/infiniband/core/cm.c | 4 ---- 1 file changed, 4 deletions(-)