diff mbox

[RFC,1/2] libibverbs: Allow 3rd party extensions to verb routines

Message ID 1828884A29C6694DAF28B7E6B8A82373012B5B@ORSMSX101.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Hefty, Sean June 4, 2011, 12:34 a.m. UTC
In order to support OFED or vendor specific calls, define a
generic extension mechanism.  This allows OFED, an RDMA vendor,
or another registered 3rd party (for example, the librdmacm)
to define RDMA extensions.

Users which make use extensions are aware that they are not
only using an extended call, but are given information regarding
how widely the extension by be supported.  Support for extended
functions, data structures, and enums are defined.

Extensions are referenced by name.  There is an assumption that
extension names are prefixed relative to the supporting party.
Until an extension has been incorporated into libibverbs, it
should be defined in an appropriate external header file.

For example, OFA could provide a header file with their definition
for XRC extensions.  A partial view of such a header file might
look something similar to:

#ifndef OFA_XRC_H
#define OFA_XRC_H
#include <infiniband/verbs.h>

#define OFA_XRC_OPS "ofa-xrc"

/* Extend IBV_QP_TYPE for XRC */
#define OFA_QPT_XRC ((enum ibv_qp_type) \
	(IBV_EXTENSION_OFA << IBV_EXTENSION_BASE_SHIFT) + 6)

struct ofa_xrcd {
	struct ibv_context *context;
};

struct ofa_xrc_ops {
	struct ofa_xrcd * (*open_xrcd)(struct ibv_context *context,
					inf fd, int oflags);
	int		* (*close_xrcd)(struct ofa_xrcd *xrcd);
	/* other functions left as exercise to the reader */
};

#endif /* OFA_XRC_H */

Driver libraries that support extensions are given a new
registration call, ibv_register_device_ext().  Use of this call
indicates to libibverbs that the library allocates extended
versions of struct ibv_device and struct ibv_context.

The following new APIs are added to libibverbs to applications
to use to determine if an extension is supported and to obtain the
extended function calls.

ibv_have_ext_ops - returns true if an extension is supported
ibv_get_device_ext_ops - return extended operations for a device
ibv_get_ext_ops - return extended operations for an open context

To maintain backwards compatibility with existing applications,
internally, the library uses the last byte of the device name
to record if the device was registered with extension support.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
---
Compile tested only at this point.  I'm still working on writing
an XRC sample program.

 include/infiniband/driver.h |    1 +
 include/infiniband/verbs.h  |   40 +++++++++++++++++++++++++++++++++++++++-
 src/device.c                |   18 ++++++++++++++++++
 src/ibverbs.h               |   18 ++++++++++++++++++
 src/init.c                  |   17 ++++++++++++++++-
 src/libibverbs.map          |    5 +++++
 src/verbs.c                 |    9 +++++++++
 7 files changed, 106 insertions(+), 2 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Ira Weiny June 4, 2011, 4:39 a.m. UTC | #1
I'm not quite sure I understand the use for "enum ibv_extension_type".  Where/how is this used?

Ira

On Jun 3, 2011, at 5:34 PM, Hefty, Sean wrote:

> In order to support OFED or vendor specific calls, define a
> generic extension mechanism.  This allows OFED, an RDMA vendor,
> or another registered 3rd party (for example, the librdmacm)
> to define RDMA extensions.
> 
> Users which make use extensions are aware that they are not
> only using an extended call, but are given information regarding
> how widely the extension by be supported.  Support for extended
> functions, data structures, and enums are defined.
> 
> Extensions are referenced by name.  There is an assumption that
> extension names are prefixed relative to the supporting party.
> Until an extension has been incorporated into libibverbs, it
> should be defined in an appropriate external header file.
> 
> For example, OFA could provide a header file with their definition
> for XRC extensions.  A partial view of such a header file might
> look something similar to:
> 
> #ifndef OFA_XRC_H
> #define OFA_XRC_H
> #include <infiniband/verbs.h>
> 
> #define OFA_XRC_OPS "ofa-xrc"
> 
> /* Extend IBV_QP_TYPE for XRC */
> #define OFA_QPT_XRC ((enum ibv_qp_type) \
> 	(IBV_EXTENSION_OFA << IBV_EXTENSION_BASE_SHIFT) + 6)
> 
> struct ofa_xrcd {
> 	struct ibv_context *context;
> };
> 
> struct ofa_xrc_ops {
> 	struct ofa_xrcd * (*open_xrcd)(struct ibv_context *context,
> 					inf fd, int oflags);
> 	int		* (*close_xrcd)(struct ofa_xrcd *xrcd);
> 	/* other functions left as exercise to the reader */
> };
> 
> #endif /* OFA_XRC_H */
> 
> Driver libraries that support extensions are given a new
> registration call, ibv_register_device_ext().  Use of this call
> indicates to libibverbs that the library allocates extended
> versions of struct ibv_device and struct ibv_context.
> 
> The following new APIs are added to libibverbs to applications
> to use to determine if an extension is supported and to obtain the
> extended function calls.
> 
> ibv_have_ext_ops - returns true if an extension is supported
> ibv_get_device_ext_ops - return extended operations for a device
> ibv_get_ext_ops - return extended operations for an open context
> 
> To maintain backwards compatibility with existing applications,
> internally, the library uses the last byte of the device name
> to record if the device was registered with extension support.
> 
> Signed-off-by: Sean Hefty <sean.hefty@intel.com>
> ---
> Compile tested only at this point.  I'm still working on writing
> an XRC sample program.
> 
> include/infiniband/driver.h |    1 +
> include/infiniband/verbs.h  |   40 +++++++++++++++++++++++++++++++++++++++-
> src/device.c                |   18 ++++++++++++++++++
> src/ibverbs.h               |   18 ++++++++++++++++++
> src/init.c                  |   17 ++++++++++++++++-
> src/libibverbs.map          |    5 +++++
> src/verbs.c                 |    9 +++++++++
> 7 files changed, 106 insertions(+), 2 deletions(-)
> 
> diff --git a/include/infiniband/driver.h b/include/infiniband/driver.h
> index 9a81416..e48abfd 100644
> --- a/include/infiniband/driver.h
> +++ b/include/infiniband/driver.h
> @@ -57,6 +57,7 @@ typedef struct ibv_device *(*ibv_driver_init_func)(const char *uverbs_sys_path,
> 						   int abi_version);
> 
> void ibv_register_driver(const char *name, ibv_driver_init_func init_func);
> +void ibv_register_driver_ext(const char *name, ibv_driver_init_func init_func);
> int ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd,
> 			size_t cmd_size, struct ibv_get_context_resp *resp,
> 			size_t resp_size);
> diff --git a/include/infiniband/verbs.h b/include/infiniband/verbs.h
> index 0f1cb2e..b82cd3a 100644
> --- a/include/infiniband/verbs.h
> +++ b/include/infiniband/verbs.h
> @@ -55,6 +55,15 @@
> 
> BEGIN_C_DECLS
> 
> +enum ibv_extension_type {
> +	IBV_EXTENSION_COMMON,
> +	IBV_EXTENSION_VENDOR,
> +	IBV_EXTENSION_OFA,
> +	IBV_EXTENSION_RDMA_CM
> +};
> +#define IBV_EXTENSION_BASE_SHIFT 24
> +#define IBV_EXTENSION_MASK 0xFF000000
> +
> union ibv_gid {
> 	uint8_t			raw[16];
> 	struct {
> @@ -92,7 +101,8 @@ enum ibv_device_cap_flags {
> 	IBV_DEVICE_SYS_IMAGE_GUID	= 1 << 11,
> 	IBV_DEVICE_RC_RNR_NAK_GEN	= 1 << 12,
> 	IBV_DEVICE_SRQ_RESIZE		= 1 << 13,
> -	IBV_DEVICE_N_NOTIFY_CQ		= 1 << 14
> +	IBV_DEVICE_N_NOTIFY_CQ		= 1 << 14,
> +	IBV_DEVICE_EXTENSIONS		= 1 << (IBV_EXTENSION_BASE_SHIFT - 1)
> };
> 
> enum ibv_atomic_cap {
> @@ -623,6 +633,13 @@ struct ibv_device {
> 	char			dev_path[IBV_SYSFS_PATH_MAX];
> 	/* Path to infiniband class device in sysfs */
> 	char			ibdev_path[IBV_SYSFS_PATH_MAX];
> +
> +	/* Following fields only available if device supports extensions */
> +	void		       *private;
> +	int			(*have_ext_ops)(struct ibv_device *device,
> +						const char *ext_name);
> +	void *			(*get_device_ext_ops)(struct ibv_device *device,
> +						      const char *ext_name);
> };
> 
> struct ibv_context_ops {
> @@ -691,6 +708,11 @@ struct ibv_context {
> 	int			num_comp_vectors;
> 	pthread_mutex_t		mutex;
> 	void		       *abi_compat;
> +
> +	/* Following fields only available if device supports extensions */
> +	void		       *private;
> +	void *			(*get_ext_ops)(struct ibv_context *context,
> +					       const char *ext_name);
> };
> 
> /**
> @@ -724,6 +746,17 @@ const char *ibv_get_device_name(struct ibv_device *device);
> uint64_t ibv_get_device_guid(struct ibv_device *device);
> 
> /**
> + * ibv_have_ext_ops - Return true if device supports the requested
> + * extended operations.
> + */
> +int ibv_have_ext_ops(struct ibv_device *device, const char *name);
> +
> +/**
> + * ibv_get_device_ext_ops - Return extended operations.
> + */
> +void *ibv_get_device_ext_ops(struct ibv_device *device, const char *name);
> +
> +/**
>  * ibv_open_device - Initialize device for use
>  */
> struct ibv_context *ibv_open_device(struct ibv_device *device);
> @@ -734,6 +767,11 @@ struct ibv_context *ibv_open_device(struct ibv_device *device);
> int ibv_close_device(struct ibv_context *context);
> 
> /**
> + * ibv_get_ext_ops - Return extended operations.
> + */
> +void *ibv_get_ext_ops(struct ibv_context *context, const char *name);
> +
> +/**
>  * ibv_get_async_event - Get next async event
>  * @event: Pointer to use to return async event
>  *
> diff --git a/src/device.c b/src/device.c
> index 185f4a6..78d9d35 100644
> --- a/src/device.c
> +++ b/src/device.c
> @@ -181,6 +181,24 @@ int __ibv_close_device(struct ibv_context *context)
> }
> default_symver(__ibv_close_device, ibv_close_device);
> 
> +int __ibv_have_ext_ops(struct ibv_device *device, const char *name)
> +{
> +	if (!ibv_get_ext_support(device))
> +		return ENOSYS;
> +
> +	return device->have_ext_ops(device, name);
> +}
> +default_symver(__ibv_have_ext_ops, ibv_have_ext_ops);
> +
> +void *__ibv_get_device_ext_ops(struct ibv_device *device, const char *name)
> +{
> +	if (!ibv_get_ext_support(device) || !device->get_device_ext_ops)
> +		return NULL;
> +
> +	return device->get_device_ext_ops(device, name);
> +}
> +default_symver(__ibv_get_device_ext_ops, ibv_get_device_ext_ops);
> +
> int __ibv_get_async_event(struct ibv_context *context,
> 			  struct ibv_async_event *event)
> {
> diff --git a/src/ibverbs.h b/src/ibverbs.h
> index 6a6e3c8..33bdee2 100644
> --- a/src/ibverbs.h
> +++ b/src/ibverbs.h
> @@ -35,6 +35,7 @@
> #define IB_VERBS_H
> 
> #include <pthread.h>
> +#include <string.h>
> 
> #include <infiniband/driver.h>
> 
> @@ -102,4 +103,21 @@ HIDDEN int ibverbs_init(struct ibv_device ***list);
> 		(cmd)->response  = (uintptr_t) (out);			\
> 	} while (0)
> 
> +/*
> + * Support for extended operations is recorded at the end of
> + * the name character array.  This way we don't need to query
> + * for the device capabilities with every call.
> + */
> +static inline int ibv_get_ext_support(struct ibv_device *device)
> +{
> +	return device->name[IBV_SYSFS_NAME_MAX - 1];
> +}
> +
> +static inline void ibv_set_ext_support(struct ibv_device *device,
> +				       int ext_supported)
> +{
> +	if (strlen(device->name) < IBV_SYSFS_NAME_MAX - 1)
> +		device->name[IBV_SYSFS_NAME_MAX - 1] = (char) ext_supported;
> +}
> +
> #endif /* IB_VERBS_H */
> diff --git a/src/init.c b/src/init.c
> index 4f0130e..419ab31 100644
> --- a/src/init.c
> +++ b/src/init.c
> @@ -71,6 +71,7 @@ struct ibv_driver {
> 	const char	       *name;
> 	ibv_driver_init_func	init_func;
> 	struct ibv_driver      *next;
> +	int			ext_support;
> };
> 
> static struct ibv_sysfs_dev *sysfs_dev_list;
> @@ -153,7 +154,8 @@ static int find_sysfs_devs(void)
> 	return ret;
> }
> 
> -void ibv_register_driver(const char *name, ibv_driver_init_func init_func)
> +static void __ibv_register_driver(const char *name, ibv_driver_init_func init_func,
> +				  int ext_support)
> {
> 	struct ibv_driver *driver;
> 
> @@ -166,6 +168,7 @@ void ibv_register_driver(const char *name, ibv_driver_init_func init_func)
> 	driver->name      = name;
> 	driver->init_func = init_func;
> 	driver->next      = NULL;
> +	driver->ext_support = ext_support;
> 
> 	if (tail_driver)
> 		tail_driver->next = driver;
> @@ -174,6 +177,16 @@ void ibv_register_driver(const char *name, ibv_driver_init_func init_func)
> 	tail_driver = driver;
> }
> 
> +void ibv_register_driver(const char *name, ibv_driver_init_func init_func)
> +{
> +	__ibv_register_driver(name, init_func, 0);
> +}
> +
> +void ibv_register_driver_ext(const char *name, ibv_driver_init_func init_func)
> +{
> +	__ibv_register_driver(name, init_func, 1);
> +}
> +
> static void load_driver(const char *name)
> {
> 	char *so_name;
> @@ -368,6 +381,8 @@ static struct ibv_device *try_driver(struct ibv_driver *driver,
> 	strcpy(dev->name,       sysfs_dev->ibdev_name);
> 	strcpy(dev->ibdev_path, sysfs_dev->ibdev_path);
> 
> +	ibv_set_ext_support(dev, driver->ext_support);
> +
> 	return dev;
> }
> 
> diff --git a/src/libibverbs.map b/src/libibverbs.map
> index 1827da0..422e07f 100644
> --- a/src/libibverbs.map
> +++ b/src/libibverbs.map
> @@ -96,4 +96,9 @@ IBVERBS_1.1 {
> 		ibv_port_state_str;
> 		ibv_event_type_str;
> 		ibv_wc_status_str;
> +
> +		ibv_register_driver_ext;
> +		ibv_have_ext_ops;
> +		ibv_get_device_ext_ops;
> +		ibv_get_ext_ops;
> } IBVERBS_1.0;
> diff --git a/src/verbs.c b/src/verbs.c
> index ba3c0a4..a34a784 100644
> --- a/src/verbs.c
> +++ b/src/verbs.c
> @@ -76,6 +76,15 @@ enum ibv_rate mult_to_ibv_rate(int mult)
> 	}
> }
> 
> +void *__ibv_get_ext_ops(struct ibv_context *context, const char *name)
> +{
> +	if (!ibv_get_ext_support(context->device) || !context->get_ext_ops)
> +		return NULL;
> +
> +	return context->get_ext_ops(context, name);
> +}
> +default_symver(__ibv_get_ext_ops, ibv_get_ext_ops);
> +
> int __ibv_query_device(struct ibv_context *context,
> 		       struct ibv_device_attr *device_attr)
> {
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hefty, Sean June 5, 2011, 1:47 p.m. UTC | #2
> I'm not quite sure I understand the use for "enum ibv_extension_type".  Where/how is this used?

The intent is that this allows users to extend existing enums without conflicts.  For example,

> > /* Extend IBV_QP_TYPE for XRC */
> > #define OFA_QPT_XRC ((enum ibv_qp_type) \
> > 	(IBV_EXTENSION_OFA << IBV_EXTENSION_BASE_SHIFT) + 6)

This defines a new QP type for XRC.  This new QP type is only usable as part of an OFA specific extension.  When XRC QP types are added directly to libibverbs, it receives a new number because there's no guarantee that the upstream version of XRC support will match what OFA published.  (Obviously it's too late to handle XRC QPs in this way.)

The trade-off is that the upper X number of bits (8 in the patch) of most enums end up being reserved for extension use.  The benefit is that existing functions, like ibv_create_qp, can support extensions, versus duplicating functions and structures.

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise June 6, 2011, 4:30 p.m. UTC | #3
On 06/03/2011 07:34 PM, Hefty, Sean wrote:
> In order to support OFED or vendor specific calls, define a
> generic extension mechanism.  This allows OFED, an RDMA vendor,
> or another registered 3rd party (for example, the librdmacm)
> to define RDMA extensions.
>

Will this mechanism allow an RDMA provider driver to export a new qp-related operation for use internally bit the 
supporting provider library?  IE Not exposes to the RDMA application, but an internal interface between the library and 
driver.  I have need for this with the T4 driver.

> Users which make use extensions are aware that they are not
> only using an extended call, but are given information regarding
> how widely the extension by be supported.

Can you expand on the above sentence?  I don't get the "how widely supported" angle?

Steve.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roland Dreier June 6, 2011, 6:07 p.m. UTC | #4
On Mon, Jun 6, 2011 at 9:30 AM, Steve Wise <swise@opengridcomputing.com> wrote:
> Will this mechanism allow an RDMA provider driver to export a new qp-related
> operation for use internally bit the supporting provider library?  IE Not
> exposes to the RDMA application, but an internal interface between the
> library and driver.  I have need for this with the T4 driver.

Not sure I follow this... how specifically would this work?  Why does the
userspace library need help to talk to the kernel driver?

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise June 6, 2011, 6:28 p.m. UTC | #5
On 06/06/2011 01:07 PM, Roland Dreier wrote:
> On Mon, Jun 6, 2011 at 9:30 AM, Steve Wise<swise@opengridcomputing.com>  wrote:
>> Will this mechanism allow an RDMA provider driver to export a new qp-related
>> operation for use internally bit the supporting provider library?  IE Not
>> exposes to the RDMA application, but an internal interface between the
>> library and driver.  I have need for this with the T4 driver.
> Not sure I follow this... how specifically would this work?  Why does the
> userspace library need help to talk to the kernel driver?
>
>   - R.

I'm investigating ways to support a kernel mode "ring the QP doorbell" for user mode QPs.  This can allow optimization 
and coalescing of db-credits to improve the ring rate for large amounts of qps.  Currently for experimentation, I hacked 
this by posting a special send WR (with opcode 0xdeadbeef :) ) which ends up calling my db ring function, but I need to 
come up with an acceptable solution.   I was thinking about maybe enhancing the modify_qp verb, but perhaps this new 
extension proposal is a better way? I was also pondering some sort of provider-specific ioctl.  But this is really 
internal to the provider lib and driver.  I just don't want to have to implement a full character device interface for 
this since the uverbs interface provides this already.

Steve.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hefty, Sean June 6, 2011, 6:28 p.m. UTC | #6
> > Will this mechanism allow an RDMA provider driver to export a new qp-
> related operation for use internally bit the
> > supporting provider library?  IE Not exposes to the RDMA application,
> but an internal interface between the library
> > and driver.  I have need for this with the T4 driver.
> 
> Sorry about my bad English:
> 
> "internally bit the" should be "internally by the".
> 
> And "IE Not exposes" should be "IE Not exposed".

I don't fully understand this request.  The idea is that libibverbs does not change as new extensions are added by providers, and that there is only 1 version of libibverbs (from Roland's tree).  This does not try to extend the kernel interfaces in any way.  If kernel patches are required, my thinking is that the provider library should communicate directly with the patched kernel. 

That said, libibverbs _could_ obtain some sort of non-published interface to a provider and make use of it.  Roland would need to accept any such patches.

Btw, adding the ibv_extension_mask to kernel commands (ib_user_verbs_cmd_*), rather than simply taking the next value, should help avoid breaking the ABI when dealing with patched kernels.

> >> Users which make use extensions are aware that they are not
> >> only using an extended call, but are given information regarding
> >> how widely the extension by be supported.
> >
> > Can you expand on the above sentence?  I don't get the "how widely
> supported" angle?

The idea is that the extension name indicates if it's common to verbs, specific to a vendor, or supported by some external group, such as OFA.  E.g. you can have vendor specific XRC ops, OFA XRC ops, or ibverbs XRC ops.

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise June 6, 2011, 6:35 p.m. UTC | #7
On 06/06/2011 01:28 PM, Hefty, Sean wrote:
>>> Will this mechanism allow an RDMA provider driver to export a new qp-
>> related operation for use internally bit the
>>> supporting provider library?  IE Not exposes to the RDMA application,
>> but an internal interface between the library
>>> and driver.  I have need for this with the T4 driver.
>> Sorry about my bad English:
>>
>> "internally bit the" should be "internally by the".
>>
>> And "IE Not exposes" should be "IE Not exposed".
> I don't fully understand this request.  The idea is that libibverbs does not change as new extensions are added by providers, and that there is only 1 version of libibverbs (from Roland's tree).  This does not try to extend the kernel interfaces in any way.  If kernel patches are required, my thinking is that the provider library should communicate directly with the patched kernel.
>
> That said, libibverbs _could_ obtain some sort of non-published interface to a provider and make use of it.  Roland would need to accept any such patches.
>
> Btw, adding the ibv_extension_mask to kernel commands (ib_user_verbs_cmd_*), rather than simply taking the next value, should help avoid breaking the ABI when dealing with patched kernels.
>

See my answer to Roland's question as to what I'm trying to do.  I guess your proposal isn't what I'm needing...


>>>> Users which make use extensions are aware that they are not
>>>> only using an extended call, but are given information regarding
>>>> how widely the extension by be supported.
>>> Can you expand on the above sentence?  I don't get the "how widely
>> supported" angle?
> The idea is that the extension name indicates if it's common to verbs, specific to a vendor, or supported by some external group, such as OFA.  E.g. you can have vendor specific XRC ops, OFA XRC ops, or ibverbs XRC ops.
>
> - Sean


I see.  Thanks.

Steve.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe June 9, 2011, 4:55 p.m. UTC | #8
On Sat, Jun 04, 2011 at 12:34:46AM +0000, Hefty, Sean wrote:
> In order to support OFED or vendor specific calls, define a
> generic extension mechanism.  This allows OFED, an RDMA vendor,
> or another registered 3rd party (for example, the librdmacm)
> to define RDMA extensions.

After looking at this a bit I'm a bit concerned that your first
extension goes around and requires edits to the various structs
anyhow, which sort of defeats the purpose of making extensions, IMHO..

Do you think such changes will be required so often that this is just
extra complexity? For instance, could you implement the multicast self
loop flag feature using an extension ?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hefty, Sean June 9, 2011, 6:59 p.m. UTC | #9
> After looking at this a bit I'm a bit concerned that your first
> extension goes around and requires edits to the various structs
> anyhow, which sort of defeats the purpose of making extensions, IMHO..

My thinking is that extensions mainly provide:

- a way for a vendor or organization to provide add-on functionality to
  an application (so we don't break things like the xrc patches did)
- a mechanism for libibverbs to obtain additional functionality from a
  provider library (where existing ibv_context_ops are insufficient)

Once a feature has been added directly to libibverbs, I would treat it slightly differently.  For example, I've since modified libibverbs to export direct APIs for opening/closing xrcd's and creating extended srq's.  An app doesn't need to call ibv_get_ext_ops() directly for xrc qp's; however, it would make use of the define for IBV_XRC_OPS if it wanted to support older versions of libibverbs.

Changing ibv_send_wr, ibv_qp_init_attr, ibv_srq, ibv_qp, etc. seemed like a much better alternative for providing XRC support than introducing an entire new set of structures and APIs.

> Do you think such changes will be required so often that this is just
> extra complexity? For instance, could you implement the multicast self
> loop flag feature using an extension ?

Based on history, I'd anticipate a small number of extensions provided by OFA or a vendor: tag matching? off-loaded MPI collective operations?  who knows...

Certain ibverbs APIs more easily support extensions that others.  Adding XRC QP support to the existing APIs without breaking existing apps is fairly easy, since the qp_type indicates if extended attributes are available.  XRC SRQs required a new API.  Supporting the multicast self loop flag is definitely doable; it's really just a matter of how easy it would be for an application to use it. :)

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/infiniband/driver.h b/include/infiniband/driver.h
index 9a81416..e48abfd 100644
--- a/include/infiniband/driver.h
+++ b/include/infiniband/driver.h
@@ -57,6 +57,7 @@  typedef struct ibv_device *(*ibv_driver_init_func)(const char *uverbs_sys_path,
 						   int abi_version);
 
 void ibv_register_driver(const char *name, ibv_driver_init_func init_func);
+void ibv_register_driver_ext(const char *name, ibv_driver_init_func init_func);
 int ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd,
 			size_t cmd_size, struct ibv_get_context_resp *resp,
 			size_t resp_size);
diff --git a/include/infiniband/verbs.h b/include/infiniband/verbs.h
index 0f1cb2e..b82cd3a 100644
--- a/include/infiniband/verbs.h
+++ b/include/infiniband/verbs.h
@@ -55,6 +55,15 @@ 
 
 BEGIN_C_DECLS
 
+enum ibv_extension_type {
+	IBV_EXTENSION_COMMON,
+	IBV_EXTENSION_VENDOR,
+	IBV_EXTENSION_OFA,
+	IBV_EXTENSION_RDMA_CM
+};
+#define IBV_EXTENSION_BASE_SHIFT 24
+#define IBV_EXTENSION_MASK 0xFF000000
+
 union ibv_gid {
 	uint8_t			raw[16];
 	struct {
@@ -92,7 +101,8 @@  enum ibv_device_cap_flags {
 	IBV_DEVICE_SYS_IMAGE_GUID	= 1 << 11,
 	IBV_DEVICE_RC_RNR_NAK_GEN	= 1 << 12,
 	IBV_DEVICE_SRQ_RESIZE		= 1 << 13,
-	IBV_DEVICE_N_NOTIFY_CQ		= 1 << 14
+	IBV_DEVICE_N_NOTIFY_CQ		= 1 << 14,
+	IBV_DEVICE_EXTENSIONS		= 1 << (IBV_EXTENSION_BASE_SHIFT - 1)
 };
 
 enum ibv_atomic_cap {
@@ -623,6 +633,13 @@  struct ibv_device {
 	char			dev_path[IBV_SYSFS_PATH_MAX];
 	/* Path to infiniband class device in sysfs */
 	char			ibdev_path[IBV_SYSFS_PATH_MAX];
+
+	/* Following fields only available if device supports extensions */
+	void		       *private;
+	int			(*have_ext_ops)(struct ibv_device *device,
+						const char *ext_name);
+	void *			(*get_device_ext_ops)(struct ibv_device *device,
+						      const char *ext_name);
 };
 
 struct ibv_context_ops {
@@ -691,6 +708,11 @@  struct ibv_context {
 	int			num_comp_vectors;
 	pthread_mutex_t		mutex;
 	void		       *abi_compat;
+
+	/* Following fields only available if device supports extensions */
+	void		       *private;
+	void *			(*get_ext_ops)(struct ibv_context *context,
+					       const char *ext_name);
 };
 
 /**
@@ -724,6 +746,17 @@  const char *ibv_get_device_name(struct ibv_device *device);
 uint64_t ibv_get_device_guid(struct ibv_device *device);
 
 /**
+ * ibv_have_ext_ops - Return true if device supports the requested
+ * extended operations.
+ */
+int ibv_have_ext_ops(struct ibv_device *device, const char *name);
+
+/**
+ * ibv_get_device_ext_ops - Return extended operations.
+ */
+void *ibv_get_device_ext_ops(struct ibv_device *device, const char *name);
+
+/**
  * ibv_open_device - Initialize device for use
  */
 struct ibv_context *ibv_open_device(struct ibv_device *device);
@@ -734,6 +767,11 @@  struct ibv_context *ibv_open_device(struct ibv_device *device);
 int ibv_close_device(struct ibv_context *context);
 
 /**
+ * ibv_get_ext_ops - Return extended operations.
+ */
+void *ibv_get_ext_ops(struct ibv_context *context, const char *name);
+
+/**
  * ibv_get_async_event - Get next async event
  * @event: Pointer to use to return async event
  *
diff --git a/src/device.c b/src/device.c
index 185f4a6..78d9d35 100644
--- a/src/device.c
+++ b/src/device.c
@@ -181,6 +181,24 @@  int __ibv_close_device(struct ibv_context *context)
 }
 default_symver(__ibv_close_device, ibv_close_device);
 
+int __ibv_have_ext_ops(struct ibv_device *device, const char *name)
+{
+	if (!ibv_get_ext_support(device))
+		return ENOSYS;
+
+	return device->have_ext_ops(device, name);
+}
+default_symver(__ibv_have_ext_ops, ibv_have_ext_ops);
+
+void *__ibv_get_device_ext_ops(struct ibv_device *device, const char *name)
+{
+	if (!ibv_get_ext_support(device) || !device->get_device_ext_ops)
+		return NULL;
+
+	return device->get_device_ext_ops(device, name);
+}
+default_symver(__ibv_get_device_ext_ops, ibv_get_device_ext_ops);
+
 int __ibv_get_async_event(struct ibv_context *context,
 			  struct ibv_async_event *event)
 {
diff --git a/src/ibverbs.h b/src/ibverbs.h
index 6a6e3c8..33bdee2 100644
--- a/src/ibverbs.h
+++ b/src/ibverbs.h
@@ -35,6 +35,7 @@ 
 #define IB_VERBS_H
 
 #include <pthread.h>
+#include <string.h>
 
 #include <infiniband/driver.h>
 
@@ -102,4 +103,21 @@  HIDDEN int ibverbs_init(struct ibv_device ***list);
 		(cmd)->response  = (uintptr_t) (out);			\
 	} while (0)
 
+/*
+ * Support for extended operations is recorded at the end of
+ * the name character array.  This way we don't need to query
+ * for the device capabilities with every call.
+ */
+static inline int ibv_get_ext_support(struct ibv_device *device)
+{
+	return device->name[IBV_SYSFS_NAME_MAX - 1];
+}
+
+static inline void ibv_set_ext_support(struct ibv_device *device,
+				       int ext_supported)
+{
+	if (strlen(device->name) < IBV_SYSFS_NAME_MAX - 1)
+		device->name[IBV_SYSFS_NAME_MAX - 1] = (char) ext_supported;
+}
+
 #endif /* IB_VERBS_H */
diff --git a/src/init.c b/src/init.c
index 4f0130e..419ab31 100644
--- a/src/init.c
+++ b/src/init.c
@@ -71,6 +71,7 @@  struct ibv_driver {
 	const char	       *name;
 	ibv_driver_init_func	init_func;
 	struct ibv_driver      *next;
+	int			ext_support;
 };
 
 static struct ibv_sysfs_dev *sysfs_dev_list;
@@ -153,7 +154,8 @@  static int find_sysfs_devs(void)
 	return ret;
 }
 
-void ibv_register_driver(const char *name, ibv_driver_init_func init_func)
+static void __ibv_register_driver(const char *name, ibv_driver_init_func init_func,
+				  int ext_support)
 {
 	struct ibv_driver *driver;
 
@@ -166,6 +168,7 @@  void ibv_register_driver(const char *name, ibv_driver_init_func init_func)
 	driver->name      = name;
 	driver->init_func = init_func;
 	driver->next      = NULL;
+	driver->ext_support = ext_support;
 
 	if (tail_driver)
 		tail_driver->next = driver;
@@ -174,6 +177,16 @@  void ibv_register_driver(const char *name, ibv_driver_init_func init_func)
 	tail_driver = driver;
 }
 
+void ibv_register_driver(const char *name, ibv_driver_init_func init_func)
+{
+	__ibv_register_driver(name, init_func, 0);
+}
+
+void ibv_register_driver_ext(const char *name, ibv_driver_init_func init_func)
+{
+	__ibv_register_driver(name, init_func, 1);
+}
+
 static void load_driver(const char *name)
 {
 	char *so_name;
@@ -368,6 +381,8 @@  static struct ibv_device *try_driver(struct ibv_driver *driver,
 	strcpy(dev->name,       sysfs_dev->ibdev_name);
 	strcpy(dev->ibdev_path, sysfs_dev->ibdev_path);
 
+	ibv_set_ext_support(dev, driver->ext_support);
+
 	return dev;
 }
 
diff --git a/src/libibverbs.map b/src/libibverbs.map
index 1827da0..422e07f 100644
--- a/src/libibverbs.map
+++ b/src/libibverbs.map
@@ -96,4 +96,9 @@  IBVERBS_1.1 {
 		ibv_port_state_str;
 		ibv_event_type_str;
 		ibv_wc_status_str;
+
+		ibv_register_driver_ext;
+		ibv_have_ext_ops;
+		ibv_get_device_ext_ops;
+		ibv_get_ext_ops;
 } IBVERBS_1.0;
diff --git a/src/verbs.c b/src/verbs.c
index ba3c0a4..a34a784 100644
--- a/src/verbs.c
+++ b/src/verbs.c
@@ -76,6 +76,15 @@  enum ibv_rate mult_to_ibv_rate(int mult)
 	}
 }
 
+void *__ibv_get_ext_ops(struct ibv_context *context, const char *name)
+{
+	if (!ibv_get_ext_support(context->device) || !context->get_ext_ops)
+		return NULL;
+
+	return context->get_ext_ops(context, name);
+}
+default_symver(__ibv_get_ext_ops, ibv_get_ext_ops);
+
 int __ibv_query_device(struct ibv_context *context,
 		       struct ibv_device_attr *device_attr)
 {