diff mbox

[01/28] ibtrs: add header shared between ibtrs_client and ibtrs_server

Message ID 1490352343-20075-2-git-send-email-jinpu.wangl@profitbricks.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jinpu Wang March 24, 2017, 10:45 a.m. UTC
From: Jack Wang <jinpu.wang@profitbricks.com>

Signed-off-by: Jack Wang <jinpu.wang@profitbricks.com>
Signed-off-by: Kleber Souza <kleber.souza@profitbricks.com>
Signed-off-by: Danil Kipnis <danil.kipnis@profitbricks.com>
Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
---
 include/rdma/ibtrs.h | 514 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 514 insertions(+)
 create mode 100644 include/rdma/ibtrs.h

Comments

Johannes Thumshirn March 24, 2017, 12:35 p.m. UTC | #1
On Fri, Mar 24, 2017 at 11:45:16AM +0100, Jack Wang wrote:
> From: Jack Wang <jinpu.wang@profitbricks.com>
> 
> Signed-off-by: Jack Wang <jinpu.wang@profitbricks.com>
> Signed-off-by: Kleber Souza <kleber.souza@profitbricks.com>
> Signed-off-by: Danil Kipnis <danil.kipnis@profitbricks.com>
> Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
> ---

[...]

> +
> +#define XX(a) case (a): return #a

Please, no macros with return in them, and XX isn't very descriptive
either.

[...]

> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
> +{
> +	switch (opcode) {
> +	XX(IB_WC_SEND);
> +	XX(IB_WC_RDMA_WRITE);
> +	XX(IB_WC_RDMA_READ);
> +	XX(IB_WC_COMP_SWAP);
> +	XX(IB_WC_FETCH_ADD);
> +	/* recv-side); inbound completion */
> +	XX(IB_WC_RECV);
> +	XX(IB_WC_RECV_RDMA_WITH_IMM);
> +	default: return "IB_WC_OPCODE_UNKNOWN";
> +	}
> +}

How about:

struct {
	char *name;
	enum ib_wc_opcode opcode;
} ib_wc_opcode_table[] = {
	{ stringyfy(IB_WC_SEND), IB_WC_SEND },
	{ stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE },
	{ stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ }
	{ stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP },
	{ stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD },
	{ stringyfy(IB_WC_RECV), IB_WC_RECV },
	{ stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM },
	{ NULL, 0 },
};

/*
 * ib_wc_opcode_str() - Return the enumerator name of a work-completion opcode.
 * @opcode:	opcode to look up in ib_wc_opcode_table
 *
 * Returns the name stored in the first table entry whose opcode equals
 * @opcode, or "IB_WC_OPCODE_UNKNOWN" when no entry matches.
 */
static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
{
	size_t i;	/* unsigned, to match the type of ARRAY_SIZE() */

	for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++) {
		if (ib_wc_opcode_table[i].opcode == opcode)
			return ib_wc_opcode_table[i].name;
	}

	return "IB_WC_OPCODE_UNKNOWN";
}


[...]

> +/**
> + * struct ibtrs_msg_hdr - Common header of all IBTRS messages
> + * @type:	Message type, valid values see: enum ibtrs_msg_types
> + * @tsize:	Total size of transferred data
> + *
> + * Don't move the first 8 padding bytes! It's a workaround for a kernel bug.
> + * See IBNBD-610 for details

What about resolving the kernel bug instead of making workarounds?

> + *
> + * DO NOT CHANGE!
> + */
> +struct ibtrs_msg_hdr {
> +	u8			__padding1;
> +	u8			type;
> +	u16			__padding2;
> +	u32			tsize;
> +};

[...]
Jinpu Wang March 24, 2017, 12:54 p.m. UTC | #2
>> +
>> +#define XX(a) case (a): return #a
>
> please no macros with retun in them and XX isn't quite too descriptive as
> well.
>
> [...]
>
>> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
>> +{
>> +     switch (opcode) {
>> +     XX(IB_WC_SEND);
>> +     XX(IB_WC_RDMA_WRITE);
>> +     XX(IB_WC_RDMA_READ);
>> +     XX(IB_WC_COMP_SWAP);
>> +     XX(IB_WC_FETCH_ADD);
>> +     /* recv-side); inbound completion */
>> +     XX(IB_WC_RECV);
>> +     XX(IB_WC_RECV_RDMA_WITH_IMM);
>> +     default: return "IB_WC_OPCODE_UNKNOWN";
>> +     }
>> +}
>
> How about:
>
> struct {
>         char *name;
>         enum ib_wc_opcode opcode;
> } ib_wc_opcode_table[] = {
>         { stringyfy(IB_WC_SEND), IB_WC_SEND },
>         { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE },
>         { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ }
>         { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP },
>         { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD },
>         { stringyfy(IB_WC_RECV), IB_WC_RECV },
>         { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM },
>         { NULL, 0 },
> };
>
> static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
> {
>         int i;
>
>         for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++)
>                 if (ib_wc_opcode_table[i].opcode == opcode)
>                         return ib_wc_opcode_table[i].name;
>
>         return "IB_WC_OPCODE_UNKNOWN";
> }
>
Looks nice, might be better to put it into ib_verbs.h?

>
> [...]
>
>> +/**
>> + * struct ibtrs_msg_hdr - Common header of all IBTRS messages
>> + * @type:    Message type, valid values see: enum ibtrs_msg_types
>> + * @tsize:   Total size of transferred data
>> + *
>> + * Don't move the first 8 padding bytes! It's a workaround for a kernel bug.
>> + * See IBNBD-610 for details
>
> What about resolving the kernel bug instead of making workarounds?
I tried to send a patch upstream, but it was rejected by Sean.
http://www.spinics.net/lists/linux-rdma/msg22381.html

>
>> + *
>> + * DO NOT CHANGE!
>> + */
>> +struct ibtrs_msg_hdr {
>> +     u8                      __padding1;
>> +     u8                      type;
>> +     u16                     __padding2;
>> +     u32                     tsize;
>> +};
>
> [...]
>
> --
> Johannes Thumshirn                                          Storage
> jthumshirn@suse.de                                +49 911 74053 689
> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: Felix Imendörffer, Jane Smithard, Graham Norton
> HRB 21284 (AG Nürnberg)
> Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Thanks Johannes for review.
Johannes Thumshirn March 24, 2017, 2:31 p.m. UTC | #3
On Fri, Mar 24, 2017 at 01:54:04PM +0100, Jinpu Wang wrote:
> >> +
> >> +#define XX(a) case (a): return #a
> >
> > please no macros with retun in them and XX isn't quite too descriptive as
> > well.
> >
> > [...]
> >
> >> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
> >> +{
> >> +     switch (opcode) {
> >> +     XX(IB_WC_SEND);
> >> +     XX(IB_WC_RDMA_WRITE);
> >> +     XX(IB_WC_RDMA_READ);
> >> +     XX(IB_WC_COMP_SWAP);
> >> +     XX(IB_WC_FETCH_ADD);
> >> +     /* recv-side); inbound completion */
> >> +     XX(IB_WC_RECV);
> >> +     XX(IB_WC_RECV_RDMA_WITH_IMM);
> >> +     default: return "IB_WC_OPCODE_UNKNOWN";
> >> +     }
> >> +}
> >
> > How about:
> >
> > struct {
> >         char *name;
> >         enum ib_wc_opcode opcode;
> > } ib_wc_opcode_table[] = {
> >         { stringyfy(IB_WC_SEND), IB_WC_SEND },
> >         { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE },
> >         { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ }
> >         { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP },
> >         { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD },
> >         { stringyfy(IB_WC_RECV), IB_WC_RECV },
> >         { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM },
> >         { NULL, 0 },
> > };
> >
> > static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
> > {
> >         int i;
> >
> >         for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++)
> >                 if (ib_wc_opcode_table[i].opcode == opcode)
> >                         return ib_wc_opcode_table[i].name;
> >
> >         return "IB_WC_OPCODE_UNKNOWN";
> > }
> >
> Looks nice, might be better to put it into ib_verbs.h?

Probably yes, as are your kvec functions for lib/iov_iter.c

[...]

> > What about resolving the kernel bug instead of making workarounds?
> I tried to send a patch upsteam, but was rejected by Sean.
> http://www.spinics.net/lists/linux-rdma/msg22381.html
> 

I don't see a NACK in this thread.

From http://www.spinics.net/lists/linux-rdma/msg22410.html:
"The port space (which maps to the service ID) needs to be included as part of
the check that determines the format of the private data, and not simply the
address family." 

After such a statement I would have expected to see a v2 of the patch with the
above comment addressed.

Byte,
	Johannes
Jinpu Wang March 24, 2017, 2:35 p.m. UTC | #4
On Fri, Mar 24, 2017 at 3:31 PM, Johannes Thumshirn <jthumshirn@suse.de> wrote:
> On Fri, Mar 24, 2017 at 01:54:04PM +0100, Jinpu Wang wrote:
>> >> +
>> >> +#define XX(a) case (a): return #a
>> >
>> > please no macros with retun in them and XX isn't quite too descriptive as
>> > well.
>> >
>> > [...]
>> >
>> >> +static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
>> >> +{
>> >> +     switch (opcode) {
>> >> +     XX(IB_WC_SEND);
>> >> +     XX(IB_WC_RDMA_WRITE);
>> >> +     XX(IB_WC_RDMA_READ);
>> >> +     XX(IB_WC_COMP_SWAP);
>> >> +     XX(IB_WC_FETCH_ADD);
>> >> +     /* recv-side); inbound completion */
>> >> +     XX(IB_WC_RECV);
>> >> +     XX(IB_WC_RECV_RDMA_WITH_IMM);
>> >> +     default: return "IB_WC_OPCODE_UNKNOWN";
>> >> +     }
>> >> +}
>> >
>> > How about:
>> >
>> > struct {
>> >         char *name;
>> >         enum ib_wc_opcode opcode;
>> > } ib_wc_opcode_table[] = {
>> >         { stringyfy(IB_WC_SEND), IB_WC_SEND },
>> >         { stringyfy(IB_WC_RDMA_WRITE), IB_WC_RDMA_WRITE },
>> >         { stringyfy(IB_WC_RDMA_READ ), IB_WC_RDMA_READ }
>> >         { stringyfy(IB_WC_COMP_SWAP), IB_WC_COMP_SWAP },
>> >         { stringyfy(IB_WC_FETCH_ADD), IB_WC_FETCH_ADD },
>> >         { stringyfy(IB_WC_RECV), IB_WC_RECV },
>> >         { stringyfy(IB_WC_RECV_RDMA_WITH_IMM), IB_WC_RECV_RDMA_WITH_IMM },
>> >         { NULL, 0 },
>> > };
>> >
>> > static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
>> > {
>> >         int i;
>> >
>> >         for (i = 0; i < ARRAY_SIZE(ib_wc_opcode_table); i++)
>> >                 if (ib_wc_opcode_table[i].opcode == opcode)
>> >                         return ib_wc_opcode_table[i].name;
>> >
>> >         return "IB_WC_OPCODE_UNKNOWN";
>> > }
>> >
>> Looks nice, might be better to put it into ib_verbs.h?
>
> Probably yes, as are your kvec functions for lib/iov_iter.c
Thanks, will do in next round!

>
> [...]
>
>> > What about resolving the kernel bug instead of making workarounds?
>> I tried to send a patch upsteam, but was rejected by Sean.
>> http://www.spinics.net/lists/linux-rdma/msg22381.html
>>
>
> I don't see a NACK in this thread.
>
> From http://www.spinics.net/lists/linux-rdma/msg22410.html:
> "The port space (which maps to the service ID) needs to be included as part of
> the check that determines the format of the private data, and not simply the
> address family."
>
> After such a state I would have expected to see a v2 of the patch with above
> comment addressed.
I might have been busy with other stuff at that time; I will check again and
revisit the bug.

>
> Byte,
>         Johannes
> --
> Johannes Thumshirn                                          Storage
> jthumshirn@suse.de                                +49 911 74053 689
> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: Felix Imendörffer, Jane Smithard, Graham Norton
> HRB 21284 (AG Nürnberg)
> Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

Regards,
diff mbox

Patch

diff --git a/include/rdma/ibtrs.h b/include/rdma/ibtrs.h
new file mode 100644
index 0000000..4fc572b
--- /dev/null
+++ b/include/rdma/ibtrs.h
@@ -0,0 +1,514 @@ 
+/*
+ * InfiniBand Transport Layer
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Pen <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
+ * 3. Neither the names of the above-listed copyright holders nor the names
+ *    of any contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
+ *
+ */
+
+#ifndef __IBTRS_H
+#define __IBTRS_H
+
+#include <linux/uio.h>
+#include <linux/types.h>
+#include <linux/uuid.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <linux/list.h>
+#include <linux/dma-direction.h>
+#include <rdma/ib_verbs.h>
+#include <linux/time.h>
+#include <linux/ktime.h>
+#include <linux/timekeeping.h>
+
+#define IBTRS_SERVER_PORT 1234
+#define WC_ARRAY_SIZE 16
+#define IB_APM_TIMEOUT 16 /* 4.096 usec * 2^16 ~= 268 msec */
+
+#define USR_MSG_CNT 64
+#define USR_CON_BUF_SIZE (USR_MSG_CNT * 2) /* double bufs for ACK's */
+
+#define DEFAULT_HEARTBEAT_TIMEOUT_MS 20000
+#define MIN_HEARTBEAT_TIMEOUT_MS 5000
+#define HEARTBEAT_INTV_MS 500
+#define HEARTBEAT_INTV_JIFFIES msecs_to_jiffies(HEARTBEAT_INTV_MS)
+
+#define MIN_RTR_CNT 1
+#define MAX_RTR_CNT 7
+
+/*
+ * With the current size of the tag allocated on the client, 4K is the maximum
+ * number of tags we can allocate. (see IBNBD-2321)
+ * This number is also used on the client to allocate the IU for the user
+ * connection to receive the RDMA addresses from the server.
+ */
+#define MAX_SESS_QUEUE_DEPTH 4096
+
+#define XX(a) case (a): return #a
+
+#define IBTRS_ADDRLEN sizeof("ipv6:[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx]")
+
+/* Return the name of @opcode, or "IB_WC_OPCODE_UNKNOWN" if it is not listed. */
+static inline const char *ib_wc_opcode_str(enum ib_wc_opcode opcode)
+{
+	switch (opcode) {
+	case IB_WC_SEND: return "IB_WC_SEND";
+	case IB_WC_RDMA_WRITE: return "IB_WC_RDMA_WRITE";
+	case IB_WC_RDMA_READ: return "IB_WC_RDMA_READ";
+	case IB_WC_COMP_SWAP: return "IB_WC_COMP_SWAP";
+	case IB_WC_FETCH_ADD: return "IB_WC_FETCH_ADD";
+	/* recv-side: inbound completion */
+	case IB_WC_RECV: return "IB_WC_RECV";
+	case IB_WC_RECV_RDMA_WITH_IMM: return "IB_WC_RECV_RDMA_WITH_IMM";
+	default: return "IB_WC_OPCODE_UNKNOWN";
+	}
+}
+
+struct ib_session {
+	struct ib_pd		*pd;
+	struct ib_mr		*mr;
+	struct ib_event_handler	event_handler;
+};
+
+struct ibtrs_ib_path {
+	union ib_gid    p_sgid;
+	union ib_gid    p_dgid;
+};
+
+struct ib_con {
+	struct ib_qp		*qp ____cacheline_aligned;
+	struct ib_cq		*cq ____cacheline_aligned;
+	struct ib_send_wr	beacon;
+	struct rdma_cm_id	*cm_id;
+	struct ibtrs_ib_path    pri_path;
+	struct ibtrs_ib_path   cur_path;
+	char			*addr;
+	char			*hostname;
+};
+
+struct ibtrs_iu {
+	struct list_head        list;
+	dma_addr_t              dma_addr;
+	void                    *buf;
+	size_t                  size;
+	enum dma_data_direction direction;
+	bool			is_msg;
+	u32			tag;
+};
+
+struct ibtrs_heartbeat {
+	atomic64_t	send_ts_ms;
+	atomic64_t	recv_ts_ms;
+	u32		timeout_ms;
+	u32		warn_timeout_ms;
+	char		*addr;
+	char		*hostname;
+};
+
+#define IBTRS_VERSION 2
+#define IBTRS_UUID_SIZE 16
+#define IO_MSG_SIZE 24
+#define IB_IMM_SIZE_BITS 32
+
+#define GCC_DIAGNOSTIC_AWARE ((__GNUC__ > 6))
+#if GCC_DIAGNOSTIC_AWARE
+#pragma GCC diagnostic push
+#pragma GCC diagnostic warning "-Wpadded"
+#endif
+
+/**
+ * enum ibtrs_msg_types - IBTRS message types. DO NOT REMOVE OR REORDER!!!
+ * @IBTRS_MSG_SESS_OPEN:	Client requests new session on Server
+ * @IBTRS_MSG_SESS_OPEN_RESP:	Server informs Client about session parameters
+ * @IBTRS_MSG_CON_OPEN:		Client requests new connection to server
+ * @IBTRS_MSG_RDMA_WRITE:	Client writes data per RDMA to Server
+ * @IBTRS_MSG_REQ_RDMA_WRITE:	Client requests data transfer per RDMA
+ * @IBTRS_MSG_USER:		Data transfer per InfiniBand message
+ * @IBTRS_MSG_ERROR:		Fatal Error happened
+ * @IBTRS_MSG_SESS_INFO:	Client requests about session info
+ */
+enum ibtrs_msg_types {
+	IBTRS_MSG_SESS_OPEN,
+	IBTRS_MSG_SESS_OPEN_RESP,
+	IBTRS_MSG_CON_OPEN,
+	IBTRS_MSG_RDMA_WRITE,
+	IBTRS_MSG_REQ_RDMA_WRITE,
+	IBTRS_MSG_USER,
+	IBTRS_MSG_ERROR,
+	IBTRS_MSG_SESS_INFO,
+};
+
+/**
+ * struct ibtrs_msg_hdr - Common header of all IBTRS messages
+ * @type:	Message type, valid values see: enum ibtrs_msg_types
+ * @tsize:	Total size of transferred data
+ *
+ * Don't move the leading padding member (__padding1)! It's a workaround for a
+ * kernel bug. See IBNBD-610 for details
+ *
+ * DO NOT CHANGE!
+ */
+struct ibtrs_msg_hdr {
+	u8			__padding1;
+	u8			type;
+	u16			__padding2;
+	u32			tsize;
+};
+
+#define IBTRS_HDR_LEN sizeof(struct ibtrs_msg_hdr)
+
+/**
+ * struct ibtrs_msg_sess_open - Opens a new session between client and server
+ * @hdr:	message header
+ * @uuid:	client host identifier, unique until module reload
+ * @ver:	IBTRS protocol version
+ * @con_cnt:	number of connections in this session
+ * @reserved:	reserved fields for future usage, 28 bytes is maximum for
+ *		all IPv6/IPv4 session (NOTE(review): array below is 30 bytes)
+ *
+ * DO NOT CHANGE members before ver.
+ */
+struct ibtrs_msg_sess_open {
+	struct ibtrs_msg_hdr	hdr;
+	u8			uuid[IBTRS_UUID_SIZE];
+	u8			ver;
+	u8			con_cnt;
+	u8			reserved[30];
+};
+
+/**
+ * struct ibtrs_msg_sess_info - Message carrying the client host name
+ * @hdr:		message header
+ * @hostname:		client host name
+ */
+struct ibtrs_msg_sess_info {
+	struct ibtrs_msg_hdr	hdr;
+	u8                      hostname[MAXHOSTNAMELEN];
+};
+
+#define MSG_SESS_INFO_SIZE sizeof(struct ibtrs_msg_sess_info)
+
+/*
+ *  Data Layout in RDMA-Bufs:
+ *
+ * +---------RDMA-BUF--------+
+ * |         Slice N         |
+ * | +---------------------+ |
+ * | |      I/O data       | |
+ * | |---------------------| |
+ * | |      IBNBD MSG      | |
+ * | |---------------------| |
+ * | |      IBTRS MSG      | |
+ * | +---------------------+ |
+ * +-------------------------+
+ * |        Slice N+1        |
+ * | +---------------------+ |
+ * | |      I/O data       | |
+ * | |---------------------| |
+ * | |      IBNBD MSG      | |
+ * | |---------------------| |
+ * | |      IBTRS MSG      | |
+ * | +---------------------+ |
+ * +-------------------------+
+ */
+
+#define IBTRS_MSG_RESV_LEN 128
+/**
+ * struct ibtrs_msg_sess_open_resp - Server's response to %IBTRS_MSG_SESS_OPEN
+ * @hdr:	message header
+ * @ver:	IBTRS protocol version
+ * @cnt:	Number of rdma addresses in this message
+ * @rkey:	remote key to allow client to access buffers
+ * @hostname:	hostname of local host
+ * @reserved:	reserved fields for future usage
+ * @max_inflight_msg: max inflight messages (queue-depth) in this session
+ * @max_io_size: max io size server supports
+ * @max_req_size: max InfiniBand message size server supports
+ * @addr:	rdma addresses of buffers
+ *
+ * DO NOT CHANGE members before ver.
+ */
+struct ibtrs_msg_sess_open_resp {
+	struct ibtrs_msg_hdr	hdr;
+	u8			ver;
+	u8			__padding1;
+	u16			cnt;
+	u32			rkey;
+	u8                      hostname[MAXHOSTNAMELEN];
+	u8			reserved[IBTRS_MSG_RESV_LEN];
+	u16			max_inflight_msg;
+	u32			max_io_size;
+	u32			max_req_size;
+	u64			addr[];
+};
+
+#define IBTRS_MSG_SESS_OPEN_RESP_LEN(cnt) /* bytes for cnt addr[] slots */ \
+	(sizeof(struct ibtrs_msg_sess_open_resp) + sizeof(u64) * (cnt))
+/**
+ * struct ibtrs_msg_con_open - Opens a new connection between client and server
+ * @hdr:		message header
+ * @uuid:		client host identifier, unique until module reload
+ */
+struct ibtrs_msg_con_open {
+	struct ibtrs_msg_hdr	hdr;
+	u8			uuid[IBTRS_UUID_SIZE];
+};
+
+/**
+ * struct ibtrs_msg_user - Data exchanged as an InfiniBand message
+ * @hdr:		message header
+ * @payl:		Payload from the user module
+ */
+struct ibtrs_msg_user {
+	struct ibtrs_msg_hdr	hdr;
+	u8			payl[];
+};
+
+/**
+ * struct ibtrs_sg_desc - RDMA-Buffer entry description
+ * @addr:	Address of RDMA destination buffer
+ * @key:	Authorization rkey to write to the buffer
+ * @len:	Size of the buffer
+ */
+struct ibtrs_sg_desc {
+	u64			addr;
+	u32			key;
+	u32			len;
+};
+
+#define IBTRS_SG_DESC_LEN sizeof(struct ibtrs_sg_desc)
+
+/**
+ * struct ibtrs_msg_req_rdma_write - RDMA data transfer request from client
+ * @hdr:		message header
+ * @sg_cnt:		number of @desc entries
+ * @desc:		RDMA buffers where the server can write the result to
+ */
+struct ibtrs_msg_req_rdma_write {
+	struct ibtrs_msg_hdr	hdr;
+	u32			__padding;
+	u32			sg_cnt;
+	struct ibtrs_sg_desc    desc[];
+};
+
+/**
+ * struct ibtrs_msg_rdma_write - Message transferred to server with RDMA-Write
+ * @hdr:		message header
+ */
+struct ibtrs_msg_rdma_write {
+	struct ibtrs_msg_hdr	hdr;
+};
+
+/**
+ * struct ibtrs_msg_error - Error message
+ * @hdr:		message header
+ * @errno:		Errno number describing the error
+ */
+struct ibtrs_msg_error {
+	struct ibtrs_msg_hdr	hdr;
+	s32			errno;
+	u32			__padding;
+};
+
+#if GCC_DIAGNOSTIC_AWARE
+#pragma GCC diagnostic pop
+#endif
+
+int ibtrs_validate_message(u16 queue_depth, const void *hdr);
+
+void fill_ibtrs_msg_sess_open(struct ibtrs_msg_sess_open *msg, u8 con_cnt,
+			      const uuid_le *uuid);
+
+void fill_ibtrs_msg_con_open(struct ibtrs_msg_con_open *msg,
+			     const uuid_le *uuid);
+
+void fill_ibtrs_msg_sess_info(struct ibtrs_msg_sess_info *msg,
+			      const char *hostname);
+
+void ibtrs_heartbeat_set_send_ts(struct ibtrs_heartbeat *h);
+void ibtrs_set_last_heartbeat(struct ibtrs_heartbeat *h);
+u64 ibtrs_last_heartbeat_diff_ms(const struct ibtrs_heartbeat *h);
+u64 ibtrs_heartbeat_send_ts_diff_ms(const struct ibtrs_heartbeat *h);
+
+void ibtrs_set_heartbeat_timeout(struct ibtrs_heartbeat *h, u32 timeout_ms);
+
+void ibtrs_heartbeat_warn(const struct ibtrs_heartbeat *h);
+
+bool ibtrs_heartbeat_timeout_is_expired(const struct ibtrs_heartbeat *h);
+
+u32 ibtrs_heartbeat_get_send_delay(const struct ibtrs_heartbeat *h);
+u32 ibtrs_heartbeat_get_check_delay(const struct ibtrs_heartbeat *h);
+void ibtrs_iu_put(struct list_head *iu_list, struct ibtrs_iu *iu);
+struct ibtrs_iu *ibtrs_iu_get(struct list_head *iu_list);
+
+struct ibtrs_iu *ibtrs_iu_alloc(u32 tag, size_t size, gfp_t t,
+				struct ib_device *dev,
+				enum dma_data_direction, bool is_msg);
+
+void ibtrs_iu_free(struct ibtrs_iu *iu, enum dma_data_direction dir,
+		   struct ib_device *dev);
+
+int ibtrs_write_empty_imm(struct ib_qp *qp, u32 imm_data,
+			  enum ib_send_flags flags);
+
+int ibtrs_post_send(struct ib_qp *qp, struct ib_mr *mr, struct ibtrs_iu *iu,
+		    u32 size);
+
+int ib_post_rdma_write_imm(struct ib_qp *qp, struct ib_sge *sge,
+			   unsigned int num_sge, u32 rkey, u64 rdma_addr,
+			   u64 wr_id, u32 imm_data, enum ib_send_flags flags);
+
+int ib_post_rdma_write(struct ib_qp *qp, struct ib_sge *sge,
+		       unsigned int num_sge, u32 rkey, u64 rdma_addr,
+		       u64 wr_id);
+int post_beacon(struct ib_con *con);
+/**
+ * ib_session_init() - Create a new IB session
+ */
+int ib_session_init(struct ib_device *dev, struct ib_session *session);
+
+/**
+ * ib_con_init() - initialize and add a ib_con to the session
+ * @con:	&ib_con to initialize
+ * @session:	session the &ib_con is added to
+ * @ctx:	CQ context, returned to the user via completion handler
+ *
+ * Returns 0 on success otherwise a negative errno code
+ */
+int ib_con_init(struct ib_con *con, struct rdma_cm_id *cm_id,
+		u32 max_send_sge,
+		ib_comp_handler comp_handler, void *ctx, int cq_vector,
+		u16 cq_size, u16 wr_queue_size, struct ib_session *session);
+
+int ibtrs_request_cq_notifications(struct ib_con *con);
+
+void ib_con_destroy(struct ib_con *con);
+
+/**
+ * ib_session_destroy() - Free a session
+ * The corresponding &ib_con must have been freed before.
+ */
+void ib_session_destroy(struct ib_session *session);
+
+int ib_get_max_wr_queue_size(struct ib_device *dev);
+
+int ibtrs_addr_to_str(const struct sockaddr_storage *addr, char *buf,
+		      size_t len);
+
+int ibtrs_heartbeat_timeout_validate(int timeout);
+
+/**
+ * kvec_length() - Sum the iov_len of the first @nr entries of @vec.
+ */
+static inline size_t kvec_length(const struct kvec *vec, size_t nr)
+{
+	size_t total = 0;
+
+	while (nr--)
+		total += (vec++)->iov_len;
+	return total;
+}
+
+/**
+ * copy_from_kvec() - Gather @copy bytes from @vec into @data (no bounds check).
+ */
+static inline void copy_from_kvec(void *data, const struct kvec *vec,
+				  size_t copy)
+{
+	size_t chunk;
+
+	for (; copy; vec++) {
+		chunk = min(vec->iov_len, copy);
+		memcpy(data, vec->iov_base, chunk);
+		data += chunk;
+		copy -= chunk;
+	}
+}
+
+static inline u64 timespec_to_ms(const struct timespec *ts)
+{
+	return div_s64(timespec_to_ns(ts), NSEC_PER_MSEC); /* 32-bit safe div */
+}
+
+u64 timediff_cur_ms(u64 cur_ms);
+
+void *ibtrs_malloc(size_t size);
+void *ibtrs_zalloc(size_t size);
+
+#define STAT_STORE_FUNC(store, reset) \
+static ssize_t store##_store(struct kobject *kobj, \
+			    struct kobj_attribute *attr, \
+			    const char *buf, size_t count) \
+{ \
+	int ret = -EINVAL; \
+	struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session, \
+						  kobj_stats); \
+\
+	if (sysfs_streq(buf, "1")) \
+		ret = reset(sess, true); \
+	else if (sysfs_streq(buf, "0"))\
+		ret = reset(sess, false); \
+	if (ret) \
+		return ret; \
+\
+	return count; \
+}
+
+#define STAT_SHOW_FUNC(show, print) \
+static ssize_t show##_show(struct kobject *kobj, \
+			   struct kobj_attribute *attr, \
+			   char *page) \
+{ \
+	struct ibtrs_session *sess = container_of(kobj, struct ibtrs_session, \
+						  kobj_stats); \
+\
+	return print(sess, page, PAGE_SIZE); \
+}
+
+#define STAT_ATTR(stat, print, reset) \
+STAT_STORE_FUNC(stat, reset) \
+STAT_SHOW_FUNC(stat, print) \
+static struct kobj_attribute stat##_attr = \
+		__ATTR(stat, 0644, \
+		       stat##_show, \
+		       stat##_store)
+
+#endif /*__IBTRS_H*/