diff mbox

[for-next,V2,02/22] IB/core: change pkey table lookups to support full and partial membership for the same pkey

Message ID 1343983258-6268-3-git-send-email-jackm@dev.mellanox.co.il (mailing list archive)
State Accepted, archived
Delegated to: Roland Dreier
Headers show

Commit Message

jackm Aug. 3, 2012, 8:40 a.m. UTC
Enhance the cached and non-cached pkey table lookups to enable limited and full
members of the same pkey to co-exist in the pkey table.

This is necessary for SRIOV to allow for a scheme where some guests would have the full
membership pkey in their virtual pkey table, where other guests on the same hypervisor
would have the limited one. In that sense, its an extension of the IBTA model for
non virtualized nodes.

To accomplish this, we need both the limited and full membership pkeys to be present
in the master's (hypervisor physical port) pkey table.

The algorithm for supporting pkey tables which contain both the limited and the full
membership versions of the same pkey works as follows:

When scanning the pkey table for a 15 bit pkey:

A. If there is a full member version of that pkey anywhere
    in the table, return its index (even if a limited-member
    version of the pkey exists earlier in the table).

B. If the full member version is not in the table,
    but the limited-member version is in the table,
    return the index of the limited pkey.

Signed-off-by: Liran Liss <liranl@mellanox.com>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/infiniband/core/cache.c  |   14 +++++++++++---
 drivers/infiniband/core/device.c |   17 +++++++++++++----
 2 files changed, 24 insertions(+), 7 deletions(-)

Comments

Doug Ledford Sept. 11, 2012, 4:52 p.m. UTC | #1
On 8/3/2012 4:40 AM, Jack Morgenstein wrote:
> Enhance the cached and non-cached pkey table lookups to enable limited and full
> members of the same pkey to co-exist in the pkey table.
> 
> This is necessary for SRIOV to allow for a scheme where some guests would have the full
> membership pkey in their virtual pkey table, where other guests on the same hypervisor
> would have the limited one. In that sense, its an extension of the IBTA model for
> non virtualized nodes.

OK, maybe I'm not getting something, but I'm curious why we always pick
the full pkey in preference to the partial pkey.  Shouldn't we pick the
pkey that's appropriate for the vHCA sending the message?

Also, given the rule of least surprise, don't you think it would be best
to rename this function ib_find_cached_full_or_parital_pkey and in your
next patch instead of naming it ib_find_exact_pkey just call that one
ib_find_cached_pkey?

> To accomplish this, we need both the limited and full membership pkeys to be present
> in the master's (hypervisor physical port) pkey table.
> 
> The algorithm for supporting pkey tables which contain both the limited and the full
> membership versions of the same pkey works as follows:
> 
> When scanning the pkey table for a 15 bit pkey:
> 
> A. If there is a full member version of that pkey anywhere
>     in the table, return its index (even if a limited-member
>     version of the pkey exists earlier in the table).
> 
> B. If the full member version is not in the table,
>     but the limited-member version is in the table,
>     return the index of the limited pkey.
> 
> Signed-off-by: Liran Liss <liranl@mellanox.com>
> Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> ---
>  drivers/infiniband/core/cache.c  |   14 +++++++++++---
>  drivers/infiniband/core/device.c |   17 +++++++++++++----
>  2 files changed, 24 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
> index 9353992..0f2f2b7 100644
> --- a/drivers/infiniband/core/cache.c
> +++ b/drivers/infiniband/core/cache.c
> @@ -167,6 +167,7 @@ int ib_find_cached_pkey(struct ib_device *device,
>  	unsigned long flags;
>  	int i;
>  	int ret = -ENOENT;
> +	int partial_ix = -1;
>  
>  	if (port_num < start_port(device) || port_num > end_port(device))
>  		return -EINVAL;
> @@ -179,10 +180,17 @@ int ib_find_cached_pkey(struct ib_device *device,
>  
>  	for (i = 0; i < cache->table_len; ++i)
>  		if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
> -			*index = i;
> -			ret = 0;
> -			break;
> +			if (cache->table[i] & 0x8000) {
> +				*index = i;
> +				ret = 0;
> +				break;
> +			} else
> +				partial_ix = i;
>  		}
> +	if (ret && partial_ix >= 0) {
> +		*index = partial_ix;
> +		ret = 0;
> +	}
>  
>  	read_unlock_irqrestore(&device->cache.lock, flags);
>  
> diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> index e711de4..a645c68 100644
> --- a/drivers/infiniband/core/device.c
> +++ b/drivers/infiniband/core/device.c
> @@ -707,18 +707,27 @@ int ib_find_pkey(struct ib_device *device,
>  {
>  	int ret, i;
>  	u16 tmp_pkey;
> +	int partial_ix = -1;
>  
>  	for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
>  		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
>  		if (ret)
>  			return ret;
> -
>  		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
> -			*index = i;
> -			return 0;
> +			/* if there is full-member pkey take it.*/
> +			if (tmp_pkey & 0x8000) {
> +				*index = i;
> +				return 0;
> +			}
> +			if (partial_ix < 0)
> +				partial_ix = i;
>  		}
>  	}
> -
> +	/*no full-member, if exists take the limited*/
> +	if (partial_ix >= 0) {
> +		*index = partial_ix;
> +		return 0;
> +	}
>  	return -ENOENT;
>  }
>  EXPORT_SYMBOL(ib_find_pkey);
>
jackm Sept. 12, 2012, 7:56 a.m. UTC | #2
On Tuesday 11 September 2012 19:52, Doug Ledford wrote:
> On 8/3/2012 4:40 AM, Jack Morgenstein wrote:
> > Enhance the cached and non-cached pkey table lookups to enable limited and full
> > members of the same pkey to co-exist in the pkey table.
> > 
> > This is necessary for SRIOV to allow for a scheme where some guests would have the full
> > membership pkey in their virtual pkey table, where other guests on the same hypervisor
> > would have the limited one. In that sense, its an extension of the IBTA model for
> > non virtualized nodes.
> 
> OK, maybe I'm not getting something, but I'm curious why we always pick
> the full pkey in preference to the partial pkey.
> 
The pkey model that we are working with is the following:
- guests do NOT have both limited and full membership forms of a given pkey;
  they have either the limited member or the full member

- The guests will see a virtualized pkey table, where pkeys appearing in that table
  are "cherry-picked" by the administrator from the underlying physical (device) pkey
  table contents.  The slot (or index) occupied by a mapped pkey in the virtual table
  is also determined by the administrator.
  The Hypervisor (primary phys function or PPF), which sees the physical pkey table used
  for the mappings, will therefor have both forms of a given pkey -- with the limited form
  mapped to some slot in a guest's pkey table, and the full form mapped to some slot in
  a different guest's pkey table.

- The only reason that a pkey table would contain both full and limited membership versions
  of a given pkey is to support SRIOV guest pkey paravirtualization.  In "native" mode, there
  is no reason to have both forms in a given pkey table. (The driver clearly reflects this
  in its original implementation of "ib_find_cached_pkey" -- it returns the index of the first
  15-bit pkey it finds in the table, and assumes that these 15 bits are unique in the table).

Thus, on guests ib_find_cached_pkey will operate as before, since the 15-bit pkey in the guest
table will be unique.

On the Hypervisor, however, we assume that if both versions of the pkey are in its pkey table,
then for its own infiniband operation (as opposed to performing its pkey virtualizing function),
it should operate with the highest membership type in its table for a given 15-bit pkey.

> Shouldn't we pick the
> pkey that's appropriate for the vHCA sending the message?

We do. When QPs on the guests are created, the modify-qp commands are not executed on the guest,
but rather are passed to the PPF for processing. The PPF replaces the guest-provided virtual pkey-index
value with the appropriate physical pkey-index value.  See procedure "update_pkey_index" in file
resource_tracker.c, and all the places it is called (i.e., in the wrapper functions for the various
modify-qp firmware commands).

> Also, given the rule of least surprise, don't you think it would be best
> to rename this function ib_find_cached_full_or_parital_pkey and in your
> next patch instead of naming it ib_find_exact_pkey just call that one
> ib_find_cached_pkey?
The naming of "ib_find_cached_pkey" refers to the 15-bit pkey value. I also
did not want to change all the places that "ib_find_cached_pkey" is used, to
call "ib_find_cached_full_or_partial_pkey" instead.

I also think it is more confusing to use the old name "ib_find_cached_pkey" in
a completely new (and incompatible) way.

Maybe I can change "ib_find_exact_pkey" to "ib_find_16_bit_pkey" ?

In this case, "ib_find_cached_pkey" will be backwards compatible and will find the
appropriate 15-bit pkey of either form (with the proviso that if both forms of the
pkey are in the table, it will return the full membership pkey index).

The ib_find_16_bit_pkey() looks for the exact match, and returns error if it does not find
it (-- specifically, if it is looking for one membership form, and only the other form is in
the table, it will return error).

> > To accomplish this, we need both the limited and full membership pkeys to be present
> > in the master's (hypervisor physical port) pkey table.
> > 
> > The algorithm for supporting pkey tables which contain both the limited and the full
> > membership versions of the same pkey works as follows:
> > 
> > When scanning the pkey table for a 15 bit pkey:
> > 
> > A. If there is a full member version of that pkey anywhere
> >     in the table, return its index (even if a limited-member
> >     version of the pkey exists earlier in the table).
> > 
> > B. If the full member version is not in the table,
> >     but the limited-member version is in the table,
> >     return the index of the limited pkey.
> > 
> > Signed-off-by: Liran Liss <liranl@mellanox.com>
> > Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
> > Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford Sept. 12, 2012, 4:48 p.m. UTC | #3
On 09/12/2012 03:56 AM, Jack Morgenstein wrote:
> On the Hypervisor, however, we assume that if both versions of the pkey are in its pkey table,
> then for its own infiniband operation (as opposed to performing its pkey virtualizing function),
> it should operate with the highest membership type in its table for a given 15-bit pkey.

That's what I was looking for.  So, how can you know this assumption is
correct?  It seems to me that if someone wanted to restrict membership
of the hypervisor as part of a security lockdown, then give full
membership to a guest because that guest is some high security, single
task guest, then this assumption would break things (the user would be
able to assign the full membership key to the guest OK, but regardless
of how they wanted the hypervisor to be subscribed to that particular
pkey, it would always get the full membership from the guest).

>> Shouldn't we pick the
>> pkey that's appropriate for the vHCA sending the message?
> 
> We do. When QPs on the guests are created, the modify-qp commands are not executed on the guest,
> but rather are passed to the PPF for processing. The PPF replaces the guest-provided virtual pkey-index
> value with the appropriate physical pkey-index value.  See procedure "update_pkey_index" in file
> resource_tracker.c, and all the places it is called (i.e., in the wrapper functions for the various
> modify-qp firmware commands).

That's fine for the guest, but I don't see how this solves the
assumption issue.  My concern is that I could see a valid scenario that
a user might want to implement for security reasons that I think makes
your assumption above incorrect.
jackm Sept. 13, 2012, 7:35 a.m. UTC | #4
On Wednesday 12 September 2012 19:48, Doug Ledford wrote:
> > On the Hypervisor, however, we assume that if both versions of the pkey are in its pkey table,
> > then for its own infiniband operation (as opposed to performing its pkey virtualizing function),
> > it should operate with the highest membership type in its table for a given 15-bit pkey.
> 
> That's what I was looking for.  So, how can you know this assumption is
> correct?  It seems to me that if someone wanted to restrict membership
> of the hypervisor as part of a security lockdown, then give full
> membership to a guest because that guest is some high security, single
> task guest, then this assumption would break things (the user would be
> able to assign the full membership key to the guest OK, but regardless
> of how they wanted the hypervisor to be subscribed to that particular
> pkey, it would always get the full membership from the guest).
> 
This issue, unfortunately, opens up a real "can of worms".
ib_find_cached_pkey() is used by the CM in determining the (15-bit) p_key
to be used for the connection (although 16-bit pkeys are placed in the
CM_REQ message (see IB Spec 1.2.1, table 99, page 667). The REQ handler on
the remote side finds the index in its pkey table which contains the
(15-bit) pkey enclosed in the REQ message.  This index is then used
when creating the local RC qp as its pkey_index.
AFAIK, no check is performed regarding compatible membership forms (i.e.,
that at least one of the two sides has the full membership form of the
15-bit pkey) -- it is the network administrator's responsibility to see
that the pkey configurations are correct.

Up to now, the ib_core driver has assumed that only 1 membership form per
15-bit pkey is contained in the local pkey table.

Now that both forms may exist in the Hypervisor's table, we do have a problem.
1. We cannot depend on ib_find_cached_pkey simply finding the first 15-bit pkey
   which matches its search, since we cannot depend on the order of full vs limited
   membership forms appearing in the pkey table (the order is not controllable -- see
   the example at the end of this post).
2. We therefore need a consistent policy for preferentially retrieving either the full
   or the partial member for a given 15-bit pkey.
3. This policy must be configurable by the administrator per host, per HCA.

The problems that I see:
1. What if the Hypervisor needs to have limited membership for some connections, but
   full membership for others?  Such a split policy is complex to implement -- it would
   require specifying PER PKEY whether the preferential return should be full, or should
   be limited.
2. What if the Hypervisor has several HCAs, and wishes a different policy for each HCA?

There may be more problems as well.  Liran?

In any event, the issue of multiple pkey forms in a given pkey table is actually separate
from SRIOV (it's just that SRIOV needs multiple forms, so the issue came up).
Solving it will require changes in the ib core driver.

I don't yet have a concrete proposal to fix this at present.  Any ideas would be appreciated.

-Jack

P.S. We have the same issue in procedure ib_find_pkey(), also in this patch, in core/device.c .
This procedure is used in lots of places: ipoib, core/multicast, sa_query.

I seem to recall that there were problems with IPoIB when partial membership pkeys are used.
Liran, do you recall something?

======================================
It is also impossible to demand that the pkey table be ordered so that if both limited
and full pkeys are present, then the full membership value always appears first.

A simple example:
  1. Fill the entire pkey table with full-membership pkeys. Say pkey A and pkey B are both
     in the table, and that pkey A appears first.

  2. Delete full-membership pkey A, and add the limited-membership value of Pkey B. This
     new limited-pkey-B will be entered into the pkey table by OpenSM in the position just
     vacated by the deleted pkey A -- it is the only available slot!.

  3. The Limited-membership pkey-B value will therefore unavoidably appear in the table
     before the full-membership pkey-B value.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Sept. 13, 2012, 8:18 a.m. UTC | #5
On 13/09/2012 10:35, Jack Morgenstein wrote:
> I seem to recall that there were problems with IPoIB when partial membership pkeys are used.

There are some issues in the overall solution, since ARPs sent over the 
broadcast group reach
also nodes with partial membership their HCA generated pkey violation 
errors which is a bit annoying,
the IBTA has been dealing with this for a while, not sure where it stands.

But, all in all, the approach taken by IPoIB is to always use the full 
form of a pkey e.g 0x8001
and not 0x0001 when dealing with the core helper functions and within 
ipoib internally.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Sept. 13, 2012, 8:20 a.m. UTC | #6
On 13/09/2012 10:35, Jack Morgenstein wrote:
> I seem to recall that there were problems with IPoIB when partial membership pkeys are used.

There are some issues in the overall solution, since ARPs sent over the 
broadcast group reach
also nodes with partial membership their HCA generated pkey violation 
errors which is a bit annoying,
the IBTA has been dealing with this for a while, not sure where it 
stands. I'm not aware to other issues.

All in all, the approach taken by IPoIB is to always use the full form 
of a pkey e.g 0x8001
and not 0x0001 when dealing with the core helper functions and within 
ipoib internally and
externally  -- e.g in the sysfs (and also in the rtnl link ops I'm 
adding now) interfaces - of add/delete child and
show the pkey.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Sept. 13, 2012, 3:53 p.m. UTC | #7
On 11/09/2012 19:52, Doug Ledford wrote:
> On 8/3/2012 4:40 AM, Jack Morgenstein wrote:
>> >Enhance the cached and non-cached pkey table lookups to enable limited and full
>> >members of the same pkey to co-exist in the pkey table.
>> >
>> >This is necessary for SRIOV to allow for a scheme where some guests would have the full
>> >membership pkey in their virtual pkey table, where other guests on the same hypervisor
>> >would have the limited one. In that sense, its an extension of the IBTA model for
>> >non virtualized nodes.
> OK, maybe I'm not getting something, but I'm curious why we always pick
> the full pkey in preference to the partial pkey.  Shouldn't we pick the
> pkey that's appropriate for the vHCA sending the message?
>
> Also, given the rule of least surprise, don't you think it would be best
> to rename this function ib_find_cached_full_or_parital_pkey and in your
> next patch instead of naming it ib_find_exact_pkey just call that one
> ib_find_cached_pkey?
>

reply from Liran, our architect:

The physical PKey table can contain both full and partial memberships of 
the same Pkey.
This is needed to serve 2 VFs that are granted access to the same PKey, 
albeit with different membership types.

Indeed, there is no reason that the virtual PKey table of any *single* 
VF will contain both membership types.However, the PF PKey table 
certainly can because it mirrors the physical Pkey table.This patch is 
required for ULPs running on top of the PF.

--Liran
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Sept. 13, 2012, 4 p.m. UTC | #8
On 13/09/2012 18:53, Or Gerlitz wrote:
> The physical PKey table can contain both full and partial memberships 
> of the same Pkey.
> This is needed to serve 2 VFs that are granted access to the same 
> PKey, albeit with different membership types. 

Example use case -- RDMA or IPoIB network storage (file/block) server 
running on a VM
and clients  running on other VMs. Some of the client VMs may be located 
on the
same hypervisor node as the server, and we want the clients to use limited
partition such that they can't talk to each other on the storage partition.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 9353992..0f2f2b7 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -167,6 +167,7 @@  int ib_find_cached_pkey(struct ib_device *device,
 	unsigned long flags;
 	int i;
 	int ret = -ENOENT;
+	int partial_ix = -1;
 
 	if (port_num < start_port(device) || port_num > end_port(device))
 		return -EINVAL;
@@ -179,10 +180,17 @@  int ib_find_cached_pkey(struct ib_device *device,
 
 	for (i = 0; i < cache->table_len; ++i)
 		if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
-			*index = i;
-			ret = 0;
-			break;
+			if (cache->table[i] & 0x8000) {
+				*index = i;
+				ret = 0;
+				break;
+			} else
+				partial_ix = i;
 		}
+	if (ret && partial_ix >= 0) {
+		*index = partial_ix;
+		ret = 0;
+	}
 
 	read_unlock_irqrestore(&device->cache.lock, flags);
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index e711de4..a645c68 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -707,18 +707,27 @@  int ib_find_pkey(struct ib_device *device,
 {
 	int ret, i;
 	u16 tmp_pkey;
+	int partial_ix = -1;
 
 	for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
 		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
 		if (ret)
 			return ret;
-
 		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
-			*index = i;
-			return 0;
+			/* if there is full-member pkey take it.*/
+			if (tmp_pkey & 0x8000) {
+				*index = i;
+				return 0;
+			}
+			if (partial_ix < 0)
+				partial_ix = i;
 		}
 	}
-
+	/*no full-member, if exists take the limited*/
+	if (partial_ix >= 0) {
+		*index = partial_ix;
+		return 0;
+	}
 	return -ENOENT;
 }
 EXPORT_SYMBOL(ib_find_pkey);