diff mbox

[rdma-next,11/29] IB/rxe: Allocation pool for RDMA objects

Message ID 1464886657-14258-12-git-send-email-monis@mellanox.com (mailing list archive)
State Superseded
Headers show

Commit Message

Moni Shoua June 2, 2016, 4:57 p.m. UTC
Manage and allocate pool of objects with given limit on number of
elements.  Gets parameters from rxe_type_info. Pool elements are
allocated out of a slab cache.  Objects that are using this facility
are: PD, QP, SRQ, CQ, MR, FMR, MW, etc.

Signed-off-by: Kamal Heib <kamalh@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Haggai Eran <haggaie@mellanox.com>
---
 drivers/infiniband/hw/rxe/rxe_pool.c | 510 +++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/rxe/rxe_pool.h | 164 +++++++++++
 2 files changed, 674 insertions(+)
 create mode 100644 drivers/infiniband/hw/rxe/rxe_pool.c
 create mode 100644 drivers/infiniband/hw/rxe/rxe_pool.h

Comments

Steve Wise June 15, 2016, 4:17 p.m. UTC | #1
> Manage and allocate pool of objects with given limit on number of
> elements.  Gets parameters from rxe_type_info. Pool elements are
> allocated out of a slab cache.  Objects that are using this facility
> are: PD, QP, SRQ, CQ, MR, FMR, MW, etc.
> 
> Signed-off-by: Kamal Heib <kamalh@mellanox.com>
> Signed-off-by: Amir Vadai <amirv@mellanox.com>
> Signed-off-by: Moni Shoua <monis@mellanox.com>
> Reviewed-by: Haggai Eran <haggaie@mellanox.com>
> ---
>  drivers/infiniband/hw/rxe/rxe_pool.c | 510
> +++++++++++++++++++++++++++++++++++
>  drivers/infiniband/hw/rxe/rxe_pool.h | 164 +++++++++++
>  2 files changed, 674 insertions(+)
>  create mode 100644 drivers/infiniband/hw/rxe/rxe_pool.c
>  create mode 100644 drivers/infiniband/hw/rxe/rxe_pool.h
> 
> diff --git a/drivers/infiniband/hw/rxe/rxe_pool.c
> b/drivers/infiniband/hw/rxe/rxe_pool.c
> new file mode 100644
> index 0000000..5a7da6b
> --- /dev/null
> +++ b/drivers/infiniband/hw/rxe/rxe_pool.c
> @@ -0,0 +1,510 @@
> +/*
> + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
> + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *	   Redistribution and use in source and binary forms, with or
> + *	   without modification, are permitted provided that the following
> + *	   conditions are met:
> + *
> + *		- Redistributions of source code must retain the above
> + *		  copyright notice, this list of conditions and the following
> + *		  disclaimer.
> + *
> + *		- Redistributions in binary form must reproduce the above
> + *		  copyright notice, this list of conditions and the following
> + *		  disclaimer in the documentation and/or other materials
> + *		  provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#include "rxe.h"
> +#include "rxe_loc.h"
> +
> +/* info about object pools
> + * note that mr, fmr and mw share a single index space
> + * so that one can map an lkey to the correct type of object
> + */
> +struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
> +	[RXE_TYPE_UC] = {
> +		.name		= "uc",
> +		.size		= sizeof(struct rxe_ucontext),
> +	},
> +	[RXE_TYPE_PD] = {
> +		.name		= "pd",
> +		.size		= sizeof(struct rxe_pd),
> +	},
> +	[RXE_TYPE_AH] = {
> +		.name		= "ah",
> +		.size		= sizeof(struct rxe_ah),
> +		.flags		= RXE_POOL_ATOMIC,
> +	},
> +	[RXE_TYPE_SRQ] = {
> +		.name		= "srq",
> +		.size		= sizeof(struct rxe_srq),
> +		.flags		= RXE_POOL_INDEX,
> +		.min_index	= RXE_MIN_SRQ_INDEX,
> +		.max_index	= RXE_MAX_SRQ_INDEX,
> +	},
> +	[RXE_TYPE_QP] = {
> +		.name		= "qp",
> +		.size		= sizeof(struct rxe_qp),
> +		.cleanup	= rxe_qp_cleanup,
> +		.flags		= RXE_POOL_INDEX,
> +		.min_index	= RXE_MIN_QP_INDEX,
> +		.max_index	= RXE_MAX_QP_INDEX,
> +	},
> +	[RXE_TYPE_CQ] = {
> +		.name		= "cq",
> +		.size		= sizeof(struct rxe_cq),
> +		.cleanup	= rxe_cq_cleanup,
> +	},
> +	[RXE_TYPE_MR] = {
> +		.name		= "mr",
> +		.size		= sizeof(struct rxe_mem),
> +		.cleanup	= rxe_mem_cleanup,
> +		.flags		= RXE_POOL_INDEX,
> +		.max_index	= RXE_MAX_MR_INDEX,
> +		.min_index	= RXE_MIN_MR_INDEX,
> +	},
> +	[RXE_TYPE_FMR] = {
> +		.name		= "fmr",
> +		.size		= sizeof(struct rxe_mem),
> +		.cleanup	= rxe_mem_cleanup,
> +		.flags		= RXE_POOL_INDEX,
> +		.max_index	= RXE_MAX_FMR_INDEX,
> +		.min_index	= RXE_MIN_FMR_INDEX,
> +	},
> +	[RXE_TYPE_MW] = {
> +		.name		= "mw",
> +		.size		= sizeof(struct rxe_mem),
> +		.flags		= RXE_POOL_INDEX,
> +		.max_index	= RXE_MAX_MW_INDEX,
> +		.min_index	= RXE_MIN_MW_INDEX,
> +	},
> +	[RXE_TYPE_MC_GRP] = {
> +		.name		= "mc_grp",
> +		.size		= sizeof(struct rxe_mc_grp),
> +		.cleanup	= rxe_mc_cleanup,
> +		.flags		= RXE_POOL_KEY,
> +		.key_offset	= offsetof(struct rxe_mc_grp, mgid),
> +		.key_size	= sizeof(union ib_gid),
> +	},
> +	[RXE_TYPE_MC_ELEM] = {
> +		.name		= "mc_elem",
> +		.size		= sizeof(struct rxe_mc_elem),
> +		.flags		= RXE_POOL_ATOMIC,
> +	},
> +};
> +

Perhaps the above slab names should be prefixed with "rxe-"?  EG: "rxe-cq",
"rxe-qp", etc.  They look very non-descriptive when looking at them in slabinfo:

[root@stevo2 linux-2.6]# head /proc/slabinfo
slabinfo - version: 2.1
# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>
: tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs>
<num_slabs> <sharedavail>
uc                     0      0    352   11    1 : tunables   54   27    8 :
slabdata      0      0      0
cq                     6     34    240   17    1 : tunables  120   60    8 :
slabdata      2      2      0
qp                     4      6   1920    2    1 : tunables   24   12    8 :
slabdata      3      3      0
pd                     4     36    112   36    1 : tunables  120   60    8 :
slabdata      1      1      0


Also when using rxe with nvmf (among other bugs I'm chasing down), when you
unload ib_rxe, there are allocated cq objects left in the slab cache:

kmem_cache_destroy cq: Slab cache still has objects
CPU: 5 PID: 4147 Comm: rmmod Tainted: G            E   4.7.0-rc2-nvmf-all.3+rxe+
#51
Hardware name: Supermicro X9DR3-F/X9DR3-F, BIOS 3.2a 07/09/2015
 0000000000000000 ffff881007b33d98 ffffffff812d1359 ffff88103bd65098
 ffffea00378983d0 ffff881007b33da8 ffff881078b32240 ffff881007b33df8
 ffffffff8117619c ffff881007b33da8 ffff881007b33da8 ffff88103bec6960
Call Trace:
 [<ffffffff812d1359>] dump_stack+0x51/0x78
 [<ffffffff8117619c>] kmem_cache_destroy+0x12c/0x150
 [<ffffffffa04c62cc>] rxe_cache_exit+0x1c/0x40 [ib_rxe]
 [<ffffffffa04ce4cf>] rxe_module_exit+0x13/0x23 [ib_rxe]
 [<ffffffff810e1d15>] SyS_delete_module+0x185/0x1d0
 [<ffffffff8100278e>] ? syscall_trace_enter_phase2+0x6e/0x190
 [<ffffffff81002915>] ? syscall_trace_enter+0x65/0x70
 [<ffffffff81002d4d>] do_syscall_64+0x6d/0x160
 [<ffffffff8161857c>] entry_SYSCALL64_slow_path+0x25/0x25
rxe: unloaded

Steve.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise June 15, 2016, 4:45 p.m. UTC | #2
> Also when using rxe with nvmf (among other bugs I'm chasing down), when you
> unload ib_rxe, there are allocated cq objects left in the the slab cache:
> 

I see the same thing running iser over rxe.  So I think somehow cq objects are
getting leaked in rxe...

Steve.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Moni Shoua June 15, 2016, 4:52 p.m. UTC | #3
Hi Steve
Thanks for the review

>
> Perhaps the above slab names should be prefixed with "rxe-"?  EG: "rxe-cq",
> "rxe-qp", etc.  They look very non-descriptive when looking at them in slabinfo:
>
Agree. I'll change in next series

> [root@stevo2 linux-2.6]# head /proc/slabinfo
> slabinfo - version: 2.1
> # name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>
> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs>
> <num_slabs> <sharedavail>
> uc                     0      0    352   11    1 : tunables   54   27    8 :
> slabdata      0      0      0
> cq                     6     34    240   17    1 : tunables  120   60    8 :
> slabdata      2      2      0
> qp                     4      6   1920    2    1 : tunables   24   12    8 :
> slabdata      3      3      0
> pd                     4     36    112   36    1 : tunables  120   60    8 :
> slabdata      1      1      0
>
>
> Also when using rxe with nvmf (among other bugs I'm chasing down), when you
> unload ib_rxe, there are allocated cq objects left in the the slab cache:
>
Do you have a simple reproducing scenario?
I'm not sure I'll be able to fix it before I submit the next series, but
I'll try to investigate it right after.

> kmem_cache_destroy cq: Slab cache still has objects
> CPU: 5 PID: 4147 Comm: rmmod Tainted: G            E   4.7.0-rc2-nvmf-all.3+rxe+
> #51
> Hardware name: Supermicro X9DR3-F/X9DR3-F, BIOS 3.2a 07/09/2015
>  0000000000000000 ffff881007b33d98 ffffffff812d1359 ffff88103bd65098
>  ffffea00378983d0 ffff881007b33da8 ffff881078b32240 ffff881007b33df8
>  ffffffff8117619c ffff881007b33da8 ffff881007b33da8 ffff88103bec6960
> Call Trace:
>  [<ffffffff812d1359>] dump_stack+0x51/0x78
>  [<ffffffff8117619c>] kmem_cache_destroy+0x12c/0x150
>  [<ffffffffa04c62cc>] rxe_cache_exit+0x1c/0x40 [ib_rxe]
>  [<ffffffffa04ce4cf>] rxe_module_exit+0x13/0x23 [ib_rxe]
>  [<ffffffff810e1d15>] SyS_delete_module+0x185/0x1d0
>  [<ffffffff8100278e>] ? syscall_trace_enter_phase2+0x6e/0x190
>  [<ffffffff81002915>] ? syscall_trace_enter+0x65/0x70
>  [<ffffffff81002d4d>] do_syscall_64+0x6d/0x160
>  [<ffffffff8161857c>] entry_SYSCALL64_slow_path+0x25/0x25
> rxe: unloaded
>
> Steve.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Moni Shoua June 15, 2016, 4:54 p.m. UTC | #4
>
> I see the same thing running iser over rxe.  So I think somehow cq objects are
> getting leaked in rxe...
>
OK, I'll try to see if I can reproduce with iser
thanks
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
yonatan cohen June 19, 2016, 3:01 p.m. UTC | #5
On 6/15/2016 7:45 PM, Steve Wise wrote:
>> Also when using rxe with nvmf (among other bugs I'm chasing down), when you
>> unload ib_rxe, there are allocated cq objects left in the the slab cache:
>>
>
> I see the same thing running iser over rxe.  So I think somehow cq objects are
> getting leaked in rxe...
>

Hello Steve,
Can you please supply the test you used over iser to reproduce this?
I've tried:
fio --rw=randread --bs=64k --numjobs=4 --iodepth=8 --runtime=30 
--time_based --loops=1 --ioengine=libaio --direct=1 --invalidate=1 
--fsync_on_close=1 --randrepeat=1 --norandommap --exitall --name task1 
--filename=/dev/sdb
but failed to reproduce.
thanks, yonatan.

> Steve.
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Steve Wise June 20, 2016, 9:18 p.m. UTC | #6
> On 6/15/2016 7:45 PM, Steve Wise wrote:
> >> Also when using rxe with nvmf (among other bugs I'm chasing down), when you
> >> unload ib_rxe, there are allocated cq objects left in the the slab cache:
> >>
> >
> > I see the same thing running iser over rxe.  So I think somehow cq objects
are
> > getting leaked in rxe...
> >
> 
> Hello Steve,
> can you please supply the test you used over iser to reproduce this ?
> ive tried :
> fio --rw=randread --bs=64k --numjobs=4 --iodepth=8 --runtime=30
> --time_based --loops=1 --ioengine=libaio --direct=1 --invalidate=1
> --fsync_on_close=1 --randrepeat=1 --norandommap --exitall --name task1
> --filename=/dev/sdb
> but failed to reproduce.
> thanks, yonatan.

Hey Yonatan,

So far I haven't been able to reproduce this over iser again.  I'll email if I
get a reliable reproducer. 

Steve.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/hw/rxe/rxe_pool.c b/drivers/infiniband/hw/rxe/rxe_pool.c
new file mode 100644
index 0000000..5a7da6b
--- /dev/null
+++ b/drivers/infiniband/hw/rxe/rxe_pool.c
@@ -0,0 +1,510 @@ 
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *	   Redistribution and use in source and binary forms, with or
+ *	   without modification, are permitted provided that the following
+ *	   conditions are met:
+ *
+ *		- Redistributions of source code must retain the above
+ *		  copyright notice, this list of conditions and the following
+ *		  disclaimer.
+ *
+ *		- Redistributions in binary form must reproduce the above
+ *		  copyright notice, this list of conditions and the following
+ *		  disclaimer in the documentation and/or other materials
+ *		  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* info about object pools
+ * note that mr, fmr and mw share a single index space
+ * so that one can map an lkey to the correct type of object
+ *
+ * .name is also handed to kmem_cache_create() by rxe_cache_init(), so it
+ * is what shows up in /proc/slabinfo; the "rxe-" prefix keeps the caches
+ * identifiable there instead of appearing as bare "cq", "qp", ...
+ */
+struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
+	[RXE_TYPE_UC] = {
+		.name		= "rxe-uc",
+		.size		= sizeof(struct rxe_ucontext),
+	},
+	[RXE_TYPE_PD] = {
+		.name		= "rxe-pd",
+		.size		= sizeof(struct rxe_pd),
+	},
+	[RXE_TYPE_AH] = {
+		.name		= "rxe-ah",
+		.size		= sizeof(struct rxe_ah),
+		.flags		= RXE_POOL_ATOMIC,
+	},
+	[RXE_TYPE_SRQ] = {
+		.name		= "rxe-srq",
+		.size		= sizeof(struct rxe_srq),
+		.flags		= RXE_POOL_INDEX,
+		.min_index	= RXE_MIN_SRQ_INDEX,
+		.max_index	= RXE_MAX_SRQ_INDEX,
+	},
+	[RXE_TYPE_QP] = {
+		.name		= "rxe-qp",
+		.size		= sizeof(struct rxe_qp),
+		.cleanup	= rxe_qp_cleanup,
+		.flags		= RXE_POOL_INDEX,
+		.min_index	= RXE_MIN_QP_INDEX,
+		.max_index	= RXE_MAX_QP_INDEX,
+	},
+	[RXE_TYPE_CQ] = {
+		.name		= "rxe-cq",
+		.size		= sizeof(struct rxe_cq),
+		.cleanup	= rxe_cq_cleanup,
+	},
+	[RXE_TYPE_MR] = {
+		.name		= "rxe-mr",
+		.size		= sizeof(struct rxe_mem),
+		.cleanup	= rxe_mem_cleanup,
+		.flags		= RXE_POOL_INDEX,
+		.max_index	= RXE_MAX_MR_INDEX,
+		.min_index	= RXE_MIN_MR_INDEX,
+	},
+	[RXE_TYPE_FMR] = {
+		.name		= "rxe-fmr",
+		.size		= sizeof(struct rxe_mem),
+		.cleanup	= rxe_mem_cleanup,
+		.flags		= RXE_POOL_INDEX,
+		.max_index	= RXE_MAX_FMR_INDEX,
+		.min_index	= RXE_MIN_FMR_INDEX,
+	},
+	[RXE_TYPE_MW] = {
+		.name		= "rxe-mw",
+		.size		= sizeof(struct rxe_mem),
+		.flags		= RXE_POOL_INDEX,
+		.max_index	= RXE_MAX_MW_INDEX,
+		.min_index	= RXE_MIN_MW_INDEX,
+	},
+	[RXE_TYPE_MC_GRP] = {
+		.name		= "rxe-mc_grp",
+		.size		= sizeof(struct rxe_mc_grp),
+		.cleanup	= rxe_mc_cleanup,
+		.flags		= RXE_POOL_KEY,
+		.key_offset	= offsetof(struct rxe_mc_grp, mgid),
+		.key_size	= sizeof(union ib_gid),
+	},
+	[RXE_TYPE_MC_ELEM] = {
+		.name		= "rxe-mc_elem",
+		.size		= sizeof(struct rxe_mc_elem),
+		.flags		= RXE_POOL_ATOMIC,
+	},
+};
+
+/* name recorded in rxe_type_info for the object type this pool holds */
+static inline char *pool_name(struct rxe_pool *pool)
+{
+	struct rxe_type_info *info = &rxe_type_info[pool->type];
+
+	return info->name;
+}
+
+/* slab cache recorded in rxe_type_info for the pool's object type */
+static inline struct kmem_cache *pool_cache(struct rxe_pool *pool)
+{
+	struct rxe_type_info *info = &rxe_type_info[pool->type];
+
+	return info->cache;
+}
+
+/* object type of the pool that owns the pool entry @arg points to */
+static inline enum rxe_elem_type rxe_type(void *arg)
+{
+	return ((struct rxe_pool_entry *)arg)->pool->type;
+}
+
+/* Create one slab cache per object type listed in rxe_type_info.
+ *
+ * Returns 0 on success. If a cache cannot be created, every cache that
+ * was created before the failure is destroyed again and -ENOMEM is
+ * returned.
+ */
+int rxe_cache_init(void)
+{
+	int err;
+	int i;
+	size_t size;
+	struct rxe_type_info *type;
+
+	for (i = 0; i < RXE_NUM_TYPES; i++) {
+		type = &rxe_type_info[i];
+		size = ALIGN(type->size, RXE_POOL_ALIGN);
+		type->cache = kmem_cache_create(type->name, size,
+				RXE_POOL_ALIGN,
+				RXE_POOL_CACHE_FLAGS, NULL);
+		if (!type->cache) {
+			pr_err("Unable to init kmem cache for %s\n",
+			       type->name);
+			err = -ENOMEM;
+			goto err1;
+		}
+	}
+
+	return 0;
+
+err1:
+	/* entries 0..i-1 hold the caches created before the failure; look
+	 * the entry up again on every pass rather than reusing the pointer
+	 * to the entry that failed, otherwise those caches are leaked
+	 */
+	while (--i >= 0) {
+		type = &rxe_type_info[i];
+		kmem_cache_destroy(type->cache);
+		type->cache = NULL;
+	}
+
+	return err;
+}
+
+/* destroy the slab cache of every object type and clear its pointer */
+void rxe_cache_exit(void)
+{
+	struct rxe_type_info *info = rxe_type_info;
+	struct rxe_type_info *end = rxe_type_info + RXE_NUM_TYPES;
+
+	for (; info < end; info++) {
+		kmem_cache_destroy(info->cache);
+		info->cache = NULL;
+	}
+}
+
+/* Record the index range [min, max] in the pool and allocate a cleared
+ * bitmap with one bit per index in that range.
+ *
+ * Returns 0 on success, -EINVAL when the range holds fewer indices than
+ * pool->max_elem, or -ENOMEM when the bitmap cannot be allocated.
+ */
+static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
+{
+	u32 num_indices = max - min + 1;
+	size_t bytes;
+
+	if (num_indices < pool->max_elem) {
+		pr_warn("not enough indices for max_elem\n");
+		return -EINVAL;
+	}
+
+	pool->max_index = max;
+	pool->min_index = min;
+
+	bytes = BITS_TO_LONGS(num_indices) * sizeof(long);
+	pool->table = kmalloc(bytes, GFP_KERNEL);
+	if (!pool->table) {
+		pr_warn("no memory for bit table\n");
+		return -ENOMEM;
+	}
+
+	pool->table_size = bytes;
+	bitmap_zero(pool->table, num_indices);
+
+	return 0;
+}
+
+/* Initialise @pool to hold at most @max_elem objects of @type for device
+ * @rxe, taking the per-type parameters from rxe_type_info.
+ *
+ * Returns 0 on success or the error from rxe_pool_init_index(); on error
+ * the pool state is left at its zeroed value and is not set to valid.
+ */
+int rxe_pool_init(
+	struct rxe_dev		*rxe,
+	struct rxe_pool		*pool,
+	enum rxe_elem_type	type,
+	unsigned		max_elem)
+{
+	struct rxe_type_info	*info = &rxe_type_info[type];
+	int			err;
+
+	memset(pool, 0, sizeof(*pool));
+
+	pool->rxe		= rxe;
+	pool->type		= type;
+	pool->max_elem		= max_elem;
+	pool->elem_size		= ALIGN(info->size, RXE_POOL_ALIGN);
+	pool->flags		= info->flags;
+	pool->tree		= RB_ROOT;
+	pool->cleanup		= info->cleanup;
+
+	atomic_set(&pool->num_elem, 0);
+	kref_init(&pool->ref_cnt);
+	spin_lock_init(&pool->pool_lock);
+
+	if (info->flags & RXE_POOL_INDEX) {
+		err = rxe_pool_init_index(pool, info->max_index,
+					  info->min_index);
+		if (err)
+			return err;
+	}
+
+	if (info->flags & RXE_POOL_KEY) {
+		pool->key_offset = info->key_offset;
+		pool->key_size = info->key_size;
+	}
+
+	pool->state = rxe_pool_valid;
+
+	return 0;
+}
+
+/* kref release callback for a pool: mark the pool invalid and free the
+ * index bitmap
+ */
+static void rxe_pool_release(struct kref *kref)
+{
+	struct rxe_pool *pool;
+
+	pool = container_of(kref, struct rxe_pool, ref_cnt);
+
+	pool->state = rxe_pool_invalid;
+	kfree(pool->table);
+}
+
+/* drop one pool reference; the last put runs rxe_pool_release() */
+static void rxe_pool_put(struct rxe_pool *pool)
+{
+	kref_put(&pool->ref_cnt, rxe_pool_release);
+}
+
+/* Mark the pool invalid (rxe_alloc() and the lookup helpers then return
+ * NULL), warn if elements are still allocated, and drop one pool
+ * reference. Always returns 0.
+ */
+int rxe_pool_cleanup(struct rxe_pool *pool)
+{
+	unsigned long irq_flags;
+	int outstanding;
+
+	spin_lock_irqsave(&pool->pool_lock, irq_flags);
+	pool->state = rxe_pool_invalid;
+	outstanding = atomic_read(&pool->num_elem);
+	if (outstanding > 0)
+		pr_warn("%s pool destroyed with unfree'd elem\n",
+			pool_name(pool));
+	spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+
+	rxe_pool_put(pool);
+
+	return 0;
+}
+
+/* Pick a free index: search the bitmap from the position of the previous
+ * allocation, fall back to a search from the start if nothing is free
+ * beyond it, mark the bit used and return it offset by pool->min_index.
+ */
+static u32 alloc_index(struct rxe_pool *pool)
+{
+	u32 nbits = pool->max_index - pool->min_index + 1;
+	u32 bit = find_next_zero_bit(pool->table, nbits, pool->last);
+
+	if (bit >= nbits)
+		bit = find_first_zero_bit(pool->table, nbits);
+
+	set_bit(bit, pool->table);
+	pool->last = bit;
+
+	return pool->min_index + bit;
+}
+
+/* Link @new into the pool's rb tree, ordered by index. If an element with
+ * the same index is already present, a warning is printed and @new is not
+ * inserted.
+ */
+static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new)
+{
+	struct rb_node **slot = &pool->tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*slot) {
+		struct rxe_pool_entry *cur;
+
+		parent = *slot;
+		cur = rb_entry(parent, struct rxe_pool_entry, node);
+
+		if (cur->index == new->index) {
+			pr_warn("element already exists!\n");
+			return;
+		}
+
+		slot = (cur->index > new->index) ?
+			&parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, slot);
+	rb_insert_color(&new->node, &pool->tree);
+}
+
+/* Link @new into the pool's rb tree, ordered by memcmp() of the key bytes
+ * found at pool->key_offset. If an element with an equal key is already
+ * present, a warning is printed and @new is not inserted.
+ */
+static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new)
+{
+	struct rb_node **slot = &pool->tree.rb_node;
+	struct rb_node *parent = NULL;
+	u8 *new_key = (u8 *)new + pool->key_offset;
+
+	while (*slot) {
+		struct rxe_pool_entry *cur;
+		int order;
+
+		parent = *slot;
+		cur = rb_entry(parent, struct rxe_pool_entry, node);
+		order = memcmp((u8 *)cur + pool->key_offset, new_key,
+			       pool->key_size);
+
+		if (!order) {
+			pr_warn("key already exists!\n");
+			return;
+		}
+
+		slot = (order > 0) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, slot);
+	rb_insert_color(&new->node, &pool->tree);
+}
+
+/* Copy pool->key_size bytes from @key into the element at the pool's key
+ * offset and insert the element into the pool's rb tree, all under
+ * pool_lock.
+ */
+void rxe_add_key(void *arg, void *key)
+{
+	struct rxe_pool_entry *entry = arg;
+	struct rxe_pool *owner = entry->pool;
+	unsigned long irq_flags;
+	u8 *dst;
+
+	spin_lock_irqsave(&owner->pool_lock, irq_flags);
+	dst = (u8 *)entry + owner->key_offset;
+	memcpy(dst, key, owner->key_size);
+	insert_key(owner, entry);
+	spin_unlock_irqrestore(&owner->pool_lock, irq_flags);
+}
+
+/* remove a keyed element from its pool's rb tree, under pool_lock */
+void rxe_drop_key(void *arg)
+{
+	struct rxe_pool_entry *entry = arg;
+	struct rxe_pool *owner = entry->pool;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&owner->pool_lock, irq_flags);
+	rb_erase(&entry->node, &owner->tree);
+	spin_unlock_irqrestore(&owner->pool_lock, irq_flags);
+}
+
+/* Give the element a free index from its pool and insert it into the
+ * pool's rb tree, both under pool_lock.
+ */
+void rxe_add_index(void *arg)
+{
+	struct rxe_pool_entry *entry = arg;
+	struct rxe_pool *owner = entry->pool;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&owner->pool_lock, irq_flags);
+	entry->index = alloc_index(owner);
+	insert_index(owner, entry);
+	spin_unlock_irqrestore(&owner->pool_lock, irq_flags);
+}
+
+/* Release the element's index in the pool bitmap and remove the element
+ * from the pool's rb tree, both under pool_lock.
+ */
+void rxe_drop_index(void *arg)
+{
+	struct rxe_pool_entry *entry = arg;
+	struct rxe_pool *owner = entry->pool;
+	unsigned long irq_flags;
+	u32 bit;
+
+	spin_lock_irqsave(&owner->pool_lock, irq_flags);
+	bit = entry->index - owner->min_index;
+	clear_bit(bit, owner->table);
+	rb_erase(&entry->node, &owner->tree);
+	spin_unlock_irqrestore(&owner->pool_lock, irq_flags);
+}
+
+/* Allocate a zeroed object from the pool's slab cache.
+ *
+ * On success the element holds one reference on the pool and one on the
+ * rxe device, counts towards pool->num_elem, and is returned with its own
+ * refcount initialised. Returns NULL if the pool is not valid, if
+ * pool->max_elem would be exceeded, or if the slab allocation fails; in
+ * the latter two cases the count and both references are given back.
+ */
+void *rxe_alloc(struct rxe_pool *pool)
+{
+	struct rxe_pool_entry *elem;
+	unsigned long flags;
+
+	might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));
+
+	spin_lock_irqsave(&pool->pool_lock, flags);
+	if (pool->state != rxe_pool_valid) {
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+		return NULL;
+	}
+	kref_get(&pool->ref_cnt);
+	spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+	kref_get(&pool->rxe->ref_cnt);
+
+	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
+		goto out_undo;
+
+	elem = kmem_cache_zalloc(pool_cache(pool),
+				 (pool->flags & RXE_POOL_ATOMIC) ?
+				 GFP_ATOMIC : GFP_KERNEL);
+	/* the allocation can fail, so do not touch elem before checking it */
+	if (!elem)
+		goto out_undo;
+
+	elem->pool = pool;
+	kref_init(&elem->ref_cnt);
+
+	return elem;
+
+out_undo:
+	atomic_dec(&pool->num_elem);
+	rxe_dev_put(pool->rxe);
+	rxe_pool_put(pool);
+	return NULL;
+}
+
+/* kref release callback for a pool element: run the pool's cleanup hook
+ * if it has one, return the element to its slab cache, then drop the
+ * element count, the device reference and the pool reference.
+ */
+void rxe_elem_release(struct kref *kref)
+{
+	struct rxe_pool_entry *entry;
+	struct rxe_pool *owner;
+
+	entry = container_of(kref, struct rxe_pool_entry, ref_cnt);
+	owner = entry->pool;
+
+	if (owner->cleanup)
+		owner->cleanup(entry);
+
+	kmem_cache_free(pool_cache(owner), entry);
+	atomic_dec(&owner->num_elem);
+	rxe_dev_put(owner->rxe);
+	rxe_pool_put(owner);
+}
+
+/* Look up the element with the given index in the pool's rb tree.
+ *
+ * Returns the element with an extra reference taken on it, or NULL if
+ * the pool is not valid or no element has that index.
+ */
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
+{
+	struct rxe_pool_entry *found = NULL;
+	struct rb_node *cur;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&pool->pool_lock, irq_flags);
+
+	if (pool->state == rxe_pool_valid) {
+		cur = pool->tree.rb_node;
+
+		while (cur) {
+			struct rxe_pool_entry *entry =
+				rb_entry(cur, struct rxe_pool_entry, node);
+
+			if (entry->index == index) {
+				kref_get(&entry->ref_cnt);
+				found = entry;
+				break;
+			}
+
+			cur = (entry->index > index) ?
+				cur->rb_left : cur->rb_right;
+		}
+	}
+
+	spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+	return found;
+}
+
+/* Look up the element whose key bytes equal @key in the pool's rb tree.
+ *
+ * Returns the element with an extra reference taken on it, or NULL if
+ * the pool is not valid or no element has that key.
+ */
+void *rxe_pool_get_key(struct rxe_pool *pool, void *key)
+{
+	struct rxe_pool_entry *found = NULL;
+	struct rb_node *cur;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&pool->pool_lock, irq_flags);
+
+	if (pool->state == rxe_pool_valid) {
+		cur = pool->tree.rb_node;
+
+		while (cur) {
+			struct rxe_pool_entry *entry =
+				rb_entry(cur, struct rxe_pool_entry, node);
+			int order = memcmp((u8 *)entry + pool->key_offset,
+					   key, pool->key_size);
+
+			if (!order) {
+				kref_get(&entry->ref_cnt);
+				found = entry;
+				break;
+			}
+
+			cur = (order > 0) ? cur->rb_left : cur->rb_right;
+		}
+	}
+
+	spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+	return found;
+}
diff --git a/drivers/infiniband/hw/rxe/rxe_pool.h b/drivers/infiniband/hw/rxe/rxe_pool.h
new file mode 100644
index 0000000..43476b5
--- /dev/null
+++ b/drivers/infiniband/hw/rxe/rxe_pool.h
@@ -0,0 +1,164 @@ 
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *	   Redistribution and use in source and binary forms, with or
+ *	   without modification, are permitted provided that the following
+ *	   conditions are met:
+ *
+ *		- Redistributions of source code must retain the above
+ *		  copyright notice, this list of conditions and the following
+ *		  disclaimer.
+ *
+ *		- Redistributions in binary form must reproduce the above
+ *		  copyright notice, this list of conditions and the following
+ *		  disclaimer in the documentation and/or other materials
+ *		  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_POOL_H
+#define RXE_POOL_H
+
+#define RXE_POOL_ALIGN		(16)
+#define RXE_POOL_CACHE_FLAGS	(0)
+
+enum rxe_pool_flags {
+	RXE_POOL_ATOMIC		= BIT(0),	/* allocate with GFP_ATOMIC */
+	RXE_POOL_INDEX		= BIT(1),	/* pool has an index range */
+	RXE_POOL_KEY		= BIT(2),	/* pool elements carry a key */
+};
+
+enum rxe_elem_type {	/* values index the rxe_type_info[] table */
+	RXE_TYPE_UC,
+	RXE_TYPE_PD,
+	RXE_TYPE_AH,
+	RXE_TYPE_SRQ,
+	RXE_TYPE_QP,
+	RXE_TYPE_CQ,
+	RXE_TYPE_MR,
+	RXE_TYPE_MW,
+	RXE_TYPE_FMR,
+	RXE_TYPE_MC_GRP,
+	RXE_TYPE_MC_ELEM,
+	RXE_NUM_TYPES,		/* keep me last */
+};
+
+struct rxe_type_info {	/* static per-object-type parameters */
+	char			*name;	/* also used as slab cache name */
+	size_t			size;	/* object size before alignment */
+	void			(*cleanup)(void *obj);	/* optional hook */
+	enum rxe_pool_flags	flags;
+	u32			max_index;	/* used with RXE_POOL_INDEX */
+	u32			min_index;	/* used with RXE_POOL_INDEX */
+	size_t			key_offset;	/* used with RXE_POOL_KEY */
+	size_t			key_size;	/* used with RXE_POOL_KEY */
+	struct kmem_cache	*cache;	/* set by rxe_cache_init() */
+};
+
+extern struct rxe_type_info rxe_type_info[];
+
+enum rxe_pool_state {
+	rxe_pool_invalid,	/* value 0: state of a zeroed pool */
+	rxe_pool_valid,		/* set at the end of rxe_pool_init() */
+};
+
+struct rxe_pool_entry {
+	struct rxe_pool		*pool;	/* owning pool, set by rxe_alloc() */
+	struct kref		ref_cnt; /* last put: rxe_elem_release() */
+	struct list_head	list;	/* NOTE(review): unused in rxe_pool.c */
+
+	/* only used if indexed or keyed */
+	struct rb_node		node;	/* link in pool->tree */
+	u32			index;	/* assigned by rxe_add_index() */
+};
+
+struct rxe_pool {
+	struct rxe_dev		*rxe;	/* device passed to rxe_pool_init() */
+	spinlock_t              pool_lock; /* pool spinlock */
+	size_t			elem_size; /* size rounded to RXE_POOL_ALIGN */
+	struct kref		ref_cnt; /* rxe_alloc() takes one per element */
+	void			(*cleanup)(void *obj);	/* may be NULL */
+	enum rxe_pool_state	state;
+	enum rxe_pool_flags	flags;
+	enum rxe_elem_type	type;
+
+	unsigned int		max_elem; /* limit checked by rxe_alloc() */
+	atomic_t		num_elem; /* elements currently allocated */
+
+	/* only used if indexed or keyed */
+	struct rb_root		tree;	/* elements ordered by index or key */
+	unsigned long		*table;	/* bitmap of indices in use */
+	size_t			table_size;	/* bitmap size in bytes */
+	u32			max_index;
+	u32			min_index;
+	u32			last;	/* bit of the last index handed out */
+	size_t			key_offset;	/* key position in element */
+	size_t			key_size;	/* key length in bytes */
+};
+
+/* initialize slab caches for managed objects */
+int rxe_cache_init(void);
+
+/* cleanup slab caches for managed objects */
+void rxe_cache_exit(void);
+
+/* initialize a pool of objects with given limit on
+ * number of elements. gets parameters from rxe_type_info
+ * pool elements will be allocated out of a slab cache
+ */
+int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool,
+		  enum rxe_elem_type type, u32 max_elem);
+
+/* free resources from object pool */
+int rxe_pool_cleanup(struct rxe_pool *pool);
+
+/* allocate an object from pool */
+void *rxe_alloc(struct rxe_pool *pool);
+
+/* assign an index to an indexed object and insert object into
+ *  pool's rb tree
+ */
+void rxe_add_index(void *elem);
+
+/* drop an index and remove object from rb tree */
+void rxe_drop_index(void *elem);
+
+/* assign a key to a keyed object and insert object into
+ *  pool's rb tree
+ */
+void rxe_add_key(void *elem, void *key);
+
+/* remove elem from rb tree */
+void rxe_drop_key(void *elem);
+
+/* lookup an indexed object from index. takes a reference on object */
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index);
+
+/* lookup keyed object from key. takes a reference on the object */
+void *rxe_pool_get_key(struct rxe_pool *pool, void *key);
+
+/* cleanup an object when all references are dropped */
+void rxe_elem_release(struct kref *kref);
+
+/* take a reference on an object */
+#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt)
+
+/* drop a reference on an object */
+#define rxe_drop_ref(elem) kref_put(&(elem)->pelem.ref_cnt, rxe_elem_release)
+
+#endif /* RXE_POOL_H */