diff mbox

[v4,09/14] IB/cm: Expose BTH P_Key in CM and SIDR request events

Message ID 1438267826-32155-10-git-send-email-haggaie@mellanox.com (mailing list archive)
State Accepted
Headers show

Commit Message

Haggai Eran July 30, 2015, 2:50 p.m. UTC
The rdma_cm module will later use the P_Key from the BTH to de-mux
requests.

See discussion at:
  http://www.spinics.net/lists/netdev/msg336067.html

Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Liran Liss <liranl@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
---
 drivers/infiniband/core/cm.c | 20 ++++++++++++++++++++
 include/rdma/ib_cm.h         |  6 ++++++
 2 files changed, 26 insertions(+)

Comments

Sagi Grimberg Aug. 30, 2015, 6:23 p.m. UTC | #1
On 7/30/2015 5:50 PM, Haggai Eran wrote:
> The rdma_cm module will later use the P_Key from the BTH to de-mux
> requests.
>
> See discussion at:
>    http://www.spinics.net/lists/netdev/msg336067.html

I've been hitting errors with srp target with this series applied.

Not sure if this series exposes a bug in ib_srpt or breaks it at
this point, so I just thought I'd send it out in the meantime...

Looks like for some reason cm_get_bth_pkey got pkey_index of 0xffff
instead of 0 (working on the default pkey 0xffff at entry 0).

I have modified the srp initiator, but I doubt that my changes are
related at the moment, since I didn't modify the channel
establishment at all.

log:
infiniband mlx5_0: ib_cm: Couldn't retrieve pkey for incoming request 
(port 1, pkey index 65535). -22
ib_srpt Received SRP_LOGIN_REQ with i_port_id 0x0:0x2c90300ed0960, 
t_port_id 0x2c90300ed0950:0x2c90300ed0950 and it_iu_len 260 on port 1 
(guid=0xfe80000000000000:0x2c90300ed0950)
ib_srpt Session : kernel thread ib_srpt_compl (PID 8584) started
infiniband mlx5_0: ib_cm: Couldn't retrieve pkey for incoming request 
(port 1, pkey index 65535). -22
ib_srpt Received SRP_LOGIN_REQ with i_port_id 0x0:0x2c90300ed0960, 
t_port_id 0x2c90300ed0950:0x2c90300ed0950 and it_iu_len 260 on port 1 
(guid=0xfe80000000000000:0x2c90300ed0950)
ib_srpt Session : kernel thread ib_srpt_compl (PID 8585) started
mlx5_0:dump_cqe:238:(pid 8584): dump error cqe
00000000 00000000 00000000 00000000
00000000 00000000 00000000 00000000
0000002b 00000000 00000000 00000000
00000000 94003004 0000002c 0000b8e0
ib_srpt receiving failed for idx 0 with status 4
0000:04:00.0:poll_health:151:(pid 0): device's health compromised
assert_var[0] 0x00000094
assert_var[1] 0x00000000
assert_var[2] 0x00000000
assert_var[3] 0x00000000
assert_var[4] 0x00000000
assert_exit_ptr 0x0061d35c
assert_callra 0x0067a5f4
fw_ver 0xa0641900
hw_id 0x000001ff
irisc_index 2
synd 0x1: firmware internal error
ext_sync 0x0000
0000:04:00.0:health_care:76:(pid 7943): handling bad device here
ib_srpt Received DREQ and sent DREP for session 
0x00000000000000000002c90300ed0960.
ib_srpt Received DREQ and sent DREP for session 
0x00000000000000000002c90300ed0960.
ib_srpt Received IB TimeWait exit for cm_id ffff88046d1fb200.
ib_srpt Received IB TimeWait exit for cm_id ffff880454ffa000.
ib_srpt Session 0x00000000000000000002c90300ed0960: kernel thread 
ib_srpt_compl (PID 8585) stopped

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Haggai Eran Aug. 31, 2015, 6:50 a.m. UTC | #2
On 30/08/2015 21:23, Sagi Grimberg wrote:
> 
> Looks like for some reason cm_get_bth_pkey got pkey_index of 0xffff
> instead of 0 (working on the default pkey 0xffff at entry 0).

It looks like the mlx5 driver doesn't interpret the completion format
correctly. It takes a field defined in the programmer reference manual
as pkey, and interprets it as pkey_index [1].

> log:
> infiniband mlx5_0: ib_cm: Couldn't retrieve pkey for incoming request (port 1, pkey index 65535). -22
> ib_srpt Received SRP_LOGIN_REQ with i_port_id 0x0:0x2c90300ed0960, t_port_id 0x2c90300ed0950:0x2c90300ed0950 and it_iu_len 260 on port 1 (guid=0xfe80000000000000:0x2c90300ed0950)
> ib_srpt Session : kernel thread ib_srpt_compl (PID 8584) started
> infiniband mlx5_0: ib_cm: Couldn't retrieve pkey for incoming request (port 1, pkey index 65535). -22
> ib_srpt Received SRP_LOGIN_REQ with i_port_id 0x0:0x2c90300ed0960, t_port_id 0x2c90300ed0950:0x2c90300ed0950 and it_iu_len 260 on port 1 (guid=0xfe80000000000000:0x2c90300ed0950)
> ib_srpt Session : kernel thread ib_srpt_compl (PID 8585) started
> mlx5_0:dump_cqe:238:(pid 8584): dump error cqe
> 00000000 00000000 00000000 00000000
> 00000000 00000000 00000000 00000000
> 0000002b 00000000 00000000 00000000
> 00000000 94003004 0000002c 0000b8e0
> ib_srpt receiving failed for idx 0 with status 4
> 0000:04:00.0:poll_health:151:(pid 0): device's health compromised
> assert_var[0] 0x00000094
> assert_var[1] 0x00000000
> assert_var[2] 0x00000000
> assert_var[3] 0x00000000
> assert_var[4] 0x00000000
> assert_exit_ptr 0x0061d35c
> assert_callra 0x0067a5f4
> fw_ver 0xa0641900
> hw_id 0x000001ff
> irisc_index 2
> synd 0x1: firmware internal error
> ext_sync 0x0000
> 0000:04:00.0:health_care:76:(pid 7943): handling bad device here
> ib_srpt Received DREQ and sent DREP for session 0x00000000000000000002c90300ed0960.
> ib_srpt Received DREQ and sent DREP for session 0x00000000000000000002c90300ed0960.
> ib_srpt Received IB TimeWait exit for cm_id ffff88046d1fb200.
> ib_srpt Received IB TimeWait exit for cm_id ffff880454ffa000.
> ib_srpt Session 0x00000000000000000002c90300ed0960: kernel thread ib_srpt_compl (PID 8585) stopped

I don't know how that can cause all the other errors though.

Haggai

[1]
http://lxr.free-electrons.com/source/drivers/infiniband/hw/mlx5/cq.c?v=4.1#L230
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sagi Grimberg Aug. 31, 2015, 7:41 a.m. UTC | #3
On 8/31/2015 9:50 AM, Haggai Eran wrote:
> On 30/08/2015 21:23, Sagi Grimberg wrote:
>>
>> Looks like for some reason cm_get_bth_pkey got pkey_index of 0xffff
>> instead of 0 (working on the default pkey 0xffff at entry 0).
>
> It looks like the mlx5 driver doesn't interpret the completion format
> correctly. It takes a field defined in the programmer reference manual
> as pkey, and interprets it as pkey_index [1].

You're right! I wonder how this ever used to work (and it did...).
So the driver needs to look up a pkey_index on each GSI packet?

>
>> log:
>> infiniband mlx5_0: ib_cm: Couldn't retrieve pkey for incoming request (port 1, pkey index 65535). -22
>> ib_srpt Received SRP_LOGIN_REQ with i_port_id 0x0:0x2c90300ed0960, t_port_id 0x2c90300ed0950:0x2c90300ed0950 and it_iu_len 260 on port 1 (guid=0xfe80000000000000:0x2c90300ed0950)
>> ib_srpt Session : kernel thread ib_srpt_compl (PID 8584) started
>> infiniband mlx5_0: ib_cm: Couldn't retrieve pkey for incoming request (port 1, pkey index 65535). -22
>> ib_srpt Received SRP_LOGIN_REQ with i_port_id 0x0:0x2c90300ed0960, t_port_id 0x2c90300ed0950:0x2c90300ed0950 and it_iu_len 260 on port 1 (guid=0xfe80000000000000:0x2c90300ed0950)
>> ib_srpt Session : kernel thread ib_srpt_compl (PID 8585) started
>> mlx5_0:dump_cqe:238:(pid 8584): dump error cqe
>> 00000000 00000000 00000000 00000000
>> 00000000 00000000 00000000 00000000
>> 0000002b 00000000 00000000 00000000
>> 00000000 94003004 0000002c 0000b8e0
>> ib_srpt receiving failed for idx 0 with status 4
>> 0000:04:00.0:poll_health:151:(pid 0): device's health compromised
>> assert_var[0] 0x00000094
>> assert_var[1] 0x00000000
>> assert_var[2] 0x00000000
>> assert_var[3] 0x00000000
>> assert_var[4] 0x00000000
>> assert_exit_ptr 0x0061d35c
>> assert_callra 0x0067a5f4
>> fw_ver 0xa0641900
>> hw_id 0x000001ff
>> irisc_index 2
>> synd 0x1: firmware internal error
>> ext_sync 0x0000
>> 0000:04:00.0:health_care:76:(pid 7943): handling bad device here
>> ib_srpt Received DREQ and sent DREP for session 0x00000000000000000002c90300ed0960.
>> ib_srpt Received DREQ and sent DREP for session 0x00000000000000000002c90300ed0960.
>> ib_srpt Received IB TimeWait exit for cm_id ffff88046d1fb200.
>> ib_srpt Received IB TimeWait exit for cm_id ffff880454ffa000.
>> ib_srpt Session 0x00000000000000000002c90300ed0960: kernel thread ib_srpt_compl (PID 8585) stopped
>
> I don't know how that can cause all the other errors though.

Me neither...
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index fa3d3e755127..d2b2c83f0076 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1404,6 +1404,24 @@  static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
 	}
 }
 
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+	struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+	u8 port_num = work->port->port_num;
+	u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+	u16 pkey;
+	int ret;
+
+	ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+	if (ret) {
+		dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
+				     port_num, pkey_index, ret);
+		return 0;
+	}
+
+	return pkey;
+}
+
 static void cm_format_req_event(struct cm_work *work,
 				struct cm_id_private *cm_id_priv,
 				struct ib_cm_id *listen_id)
@@ -1414,6 +1432,7 @@  static void cm_format_req_event(struct cm_work *work,
 	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
 	param = &work->cm_event.param.req_rcvd;
 	param->listen_id = listen_id;
+	param->bth_pkey = cm_get_bth_pkey(work);
 	param->port = cm_id_priv->av.port->port_num;
 	param->primary_path = &work->path[0];
 	if (req_msg->alt_local_lid)
@@ -3105,6 +3124,7 @@  static void cm_format_sidr_req_event(struct cm_work *work,
 	param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
 	param->listen_id = listen_id;
 	param->service_id = sidr_req_msg->service_id;
+	param->bth_pkey = cm_get_bth_pkey(work);
 	param->port = work->port->port_num;
 	work->cm_event.private_data = &sidr_req_msg->private_data;
 }
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 9cc496e1f2ad..e3f48632e237 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -113,6 +113,10 @@  struct ib_cm_id;
 
 struct ib_cm_req_event_param {
 	struct ib_cm_id		*listen_id;
+
+	/* P_Key that was used by the GMP's BTH header */
+	u16			bth_pkey;
+
 	u8			port;
 
 	struct ib_sa_path_rec	*primary_path;
@@ -224,6 +228,8 @@  struct ib_cm_apr_event_param {
 struct ib_cm_sidr_req_event_param {
 	struct ib_cm_id		*listen_id;
 	__be64			service_id;
+	/* P_Key that was used by the GMP's BTH header */
+	u16			bth_pkey;
 	u8			port;
 	u16			pkey;
 };