
[for-next,v10,06/11] RDMA/rxe: Add wait_for_completion to pool objects

Message ID: 20220225195750.37802-7-rpearsonhpe@gmail.com (mailing list archive)
State: Superseded
Delegated to: Jason Gunthorpe
Series: Fix race conditions in rxe_pool

Commit Message

Bob Pearson Feb. 25, 2022, 7:57 p.m. UTC
Reference counting for object deletion can require an object to wait
for something else to happen before it is finally deleted. The destroy
verbs can then return to rdma-core while the object still holds
references. Adding wait_for_completion in this path prevents this.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_mr.c    |  2 +-
 drivers/infiniband/sw/rxe/rxe_mw.c    |  2 +-
 drivers/infiniband/sw/rxe/rxe_pool.c  | 49 +++++++++++++++++++++------
 drivers/infiniband/sw/rxe/rxe_pool.h  |  6 ++++
 drivers/infiniband/sw/rxe/rxe_verbs.c | 23 +++++++------
 5 files changed, 59 insertions(+), 23 deletions(-)
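
The core of the change is a complete/wait handshake between the kref
release function and the destroy path. A minimal sketch of that pattern
(illustrative names only, not the actual rxe helpers) looks roughly like
this:

/* Sketch of the complete/wait teardown pattern this patch introduces;
 * the symbols here are illustrative, not the real rxe code.
 */
#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct obj {
	struct kref		ref;
	struct completion	released;
};

static struct obj *obj_create(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (!o)
		return NULL;

	kref_init(&o->ref);
	init_completion(&o->released);
	return o;
}

static void obj_release(struct kref *kref)
{
	struct obj *o = container_of(kref, struct obj, ref);

	/* Last reference dropped; wake the waiter in the destroy path. */
	complete(&o->released);
}

static void obj_destroy(struct obj *o)
{
	/* Drop the reference taken at creation time ... */
	kref_put(&o->ref, obj_release);

	/* ... then wait until all other holders have dropped theirs, so
	 * the destroy verb does not return while references remain.
	 */
	wait_for_completion(&o->released);
	kfree(o);
}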

Comments

Jason Gunthorpe Feb. 28, 2022, 5:05 p.m. UTC | #1
On Fri, Feb 25, 2022 at 01:57:46PM -0600, Bob Pearson wrote:
>  int __rxe_add_ref(struct rxe_pool_elem *elem)
> @@ -262,3 +258,36 @@ int __rxe_drop_ref(struct rxe_pool_elem *elem)
>  	return kref_put_lock_irqsave(&elem->ref_cnt, rxe_elem_release,
>  			&pool->xa.xa_lock);

Also, you can't touch the xa_lock to do stuff like this.

> +int __rxe_drop_wait(struct rxe_pool_elem *elem)

I think I would call this something else since it basically unconditionally
frees the memory.

> +{
> +	struct rxe_pool *pool = elem->pool;
> +	static int timeout = RXE_POOL_TIMEOUT;
> +	int ret;
> +
> +	__rxe_drop_ref(elem);

> +	if (timeout) {
> +		ret = wait_for_completion_timeout(&elem->complete, timeout);
> +		if (!ret) {
> +			pr_warn("Timed out waiting for %s#%d\n",
> +				pool->name + 4, elem->index);

This is a WARN_ON event, the kernel is broken, and you should leak the
memory rather than cause memory corruption.
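
In code, the handling being described could look roughly like this (a
sketch only, assuming the helper is reworked to return void; it is not
the fix that was eventually applied):

	if (timeout) {
		ret = wait_for_completion_timeout(&elem->complete, timeout);
		if (!ret) {
			/* Getting here means the kernel is broken: warn
			 * loudly and leak the element rather than free
			 * memory that may still be referenced.
			 */
			WARN(1, "Timed out waiting for %s#%d\n",
			     pool->name + 4, elem->index);
			return;
		}
	}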

> +			if (++pool->timeouts == RXE_MAX_POOL_TIMEOUTS) {
> +				timeout = 0;
> +				pr_warn("Reached max %s timeouts.\n",
> +					pool->name + 4);
> +			}

Why?

> +		}
> +	}
> +
> +	if (pool->cleanup)
> +		pool->cleanup(elem);
> +
> +	if (pool->flags & RXE_POOL_ALLOC)
> +		kfree(elem->obj);
> +
> +	atomic_dec(&pool->num_elem);
> +
> +	return ret;

And we return a failure code but freed the memory? This shouldn't fail

But the idea is right.

Jason
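
One way to address the last point, sketched here as a hypothetical
follow-up rather than the change that was actually merged, is to make
the helper return void so callers never see an error for an object that
has already been freed (timeout handling omitted for brevity):

void __rxe_drop_wait(struct rxe_pool_elem *elem)
{
	struct rxe_pool *pool = elem->pool;

	__rxe_drop_ref(elem);

	/* Block until rxe_elem_release() has signalled the completion. */
	wait_for_completion(&elem->complete);

	if (pool->cleanup)
		pool->cleanup(elem);

	if (pool->flags & RXE_POOL_ALLOC)
		kfree(elem->obj);

	atomic_dec(&pool->num_elem);
}

Callers such as rxe_destroy_qp() would then simply call
rxe_drop_wait(qp) and return 0 unconditionally, which matches how the
verbs in the patch below already use it.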

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 157cfb710a7e..245785def608 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -692,7 +692,7 @@  int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	rxe_disable(mr);
 	mr->state = RXE_MR_STATE_INVALID;
 	rxe_drop_ref(mr_pd(mr));
-	rxe_drop_ref(mr);
+	rxe_drop_wait(mr);
 
 	return 0;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c
index e4d207f24eba..c45d832d7427 100644
--- a/drivers/infiniband/sw/rxe/rxe_mw.c
+++ b/drivers/infiniband/sw/rxe/rxe_mw.c
@@ -63,8 +63,8 @@  int rxe_dealloc_mw(struct ib_mw *ibmw)
 	rxe_do_dealloc_mw(mw);
 	spin_unlock_bh(&mw->lock);
 
-	rxe_drop_ref(mw);
 	rxe_drop_ref(pd);
+	rxe_drop_wait(mw);
 
 	return 0;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 19d8fb77b166..1d1e10290991 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -6,6 +6,8 @@ 
 
 #include "rxe.h"
 
+#define RXE_POOL_TIMEOUT	(200)
+#define RXE_MAX_POOL_TIMEOUTS	(3)
 #define RXE_POOL_ALIGN		(16)
 
 static const struct rxe_type_info {
@@ -123,7 +125,7 @@  void rxe_pool_cleanup(struct rxe_pool *pool)
 	} while (elem);
 
 	if (elem_count || obj_count)
-		pr_warn("Freed %d indices & %d objects from pool %s\n",
+		pr_warn("Freed %d indices and %d objects from pool %s\n",
 			elem_count, obj_count, pool->name + 4);
 }
 
@@ -151,6 +153,7 @@  void *rxe_alloc(struct rxe_pool *pool)
 	elem->pool = pool;
 	elem->obj = obj;
 	kref_init(&elem->ref_cnt);
+	init_completion(&elem->complete);
 
 	err = xa_alloc_cyclic_bh(&pool->xa, &elem->index, elem, pool->limit,
 			&pool->next, GFP_KERNEL);
@@ -182,6 +185,7 @@  int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem)
 	elem->pool = pool;
 	elem->obj = (u8 *)elem - pool->elem_offset;
 	kref_init(&elem->ref_cnt);
+	init_completion(&elem->complete);
 
 	err = xa_alloc_cyclic_bh(&pool->xa, &elem->index, elem, pool->limit,
 			&pool->next, GFP_KERNEL);
@@ -218,20 +222,12 @@  static void rxe_elem_release(struct kref *kref, unsigned long flags)
 	struct rxe_pool_elem *elem =
 		container_of(kref, struct rxe_pool_elem, ref_cnt);
 	struct rxe_pool *pool = elem->pool;
-	void *obj;
 
 	__xa_erase(&pool->xa, elem->index);
 
 	spin_unlock_irqrestore(&pool->xa.xa_lock, flags);
-	if (pool->cleanup)
-		pool->cleanup(elem);
-
-	if (pool->flags & RXE_POOL_ALLOC) {
-		obj = elem->obj;
-		kfree(obj);
-	}
 
-	atomic_dec(&pool->num_elem);
+	complete(&elem->complete);
 }
 
 int __rxe_add_ref(struct rxe_pool_elem *elem)
@@ -262,3 +258,36 @@  int __rxe_drop_ref(struct rxe_pool_elem *elem)
 	return kref_put_lock_irqsave(&elem->ref_cnt, rxe_elem_release,
 			&pool->xa.xa_lock);
 }
+
+
+int __rxe_drop_wait(struct rxe_pool_elem *elem)
+{
+	struct rxe_pool *pool = elem->pool;
+	static int timeout = RXE_POOL_TIMEOUT;
+	int ret;
+
+	__rxe_drop_ref(elem);
+
+	if (timeout) {
+		ret = wait_for_completion_timeout(&elem->complete, timeout);
+		if (!ret) {
+			pr_warn("Timed out waiting for %s#%d\n",
+				pool->name + 4, elem->index);
+			if (++pool->timeouts == RXE_MAX_POOL_TIMEOUTS) {
+				timeout = 0;
+				pr_warn("Reached max %s timeouts.\n",
+					pool->name + 4);
+			}
+		}
+	}
+
+	if (pool->cleanup)
+		pool->cleanup(elem);
+
+	if (pool->flags & RXE_POOL_ALLOC)
+		kfree(elem->obj);
+
+	atomic_dec(&pool->num_elem);
+
+	return ret;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index b963c300d636..f98d2950bb9f 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -28,6 +28,7 @@  struct rxe_pool_elem {
 	void			*obj;
 	struct kref		ref_cnt;
 	struct list_head	list;
+	struct completion	complete;
 	u32			index;
 	bool			enabled;
 };
@@ -38,6 +39,7 @@  struct rxe_pool {
 	void			(*cleanup)(struct rxe_pool_elem *elem);
 	enum rxe_pool_flags	flags;
 	enum rxe_elem_type	type;
+	unsigned int		timeouts;
 
 	unsigned int		max_elem;
 	atomic_t		num_elem;
@@ -86,4 +88,8 @@  int __rxe_drop_ref(struct rxe_pool_elem *elem);
 
 #define rxe_disable(obj) ((obj)->elem.enabled = false)
 
+int __rxe_drop_wait(struct rxe_pool_elem *elem);
+
+#define rxe_drop_wait(obj) __rxe_drop_wait(&(obj)->elem)
+
 #endif /* RXE_POOL_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 077be3eb9763..ced6972396c4 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -115,7 +115,7 @@  static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
 {
 	struct rxe_ucontext *uc = to_ruc(ibuc);
 
-	rxe_drop_ref(uc);
+	rxe_drop_wait(uc);
 }
 
 static int rxe_port_immutable(struct ib_device *dev, u32 port_num,
@@ -149,7 +149,7 @@  static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct rxe_pd *pd = to_rpd(ibpd);
 
-	rxe_drop_ref(pd);
+	rxe_drop_wait(pd);
 
 	return 0;
 }
@@ -189,7 +189,7 @@  static int rxe_create_ah(struct ib_ah *ibah,
 		err = copy_to_user(&uresp->ah_num, &ah->ah_num,
 					 sizeof(uresp->ah_num));
 		if (err) {
-			rxe_drop_ref(ah);
+			rxe_drop_wait(ah);
 			return -EFAULT;
 		}
 	} else if (ah->is_user) {
@@ -233,8 +233,8 @@  static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags)
 	struct rxe_ah *ah = to_rah(ibah);
 
 	rxe_disable(ah);
+	rxe_drop_wait(ah);
 
-	rxe_drop_ref(ah);
 	return 0;
 }
 
@@ -322,7 +322,7 @@  static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init,
 
 err2:
 	rxe_drop_ref(pd);
-	rxe_drop_ref(srq);
+	rxe_drop_wait(srq);
 err1:
 	return err;
 }
@@ -382,7 +382,8 @@  static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 		rxe_queue_cleanup(srq->rq.queue);
 
 	rxe_drop_ref(srq->pd);
-	rxe_drop_ref(srq);
+	rxe_drop_wait(srq);
+
 	return 0;
 }
 
@@ -454,7 +455,7 @@  static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init,
 	return 0;
 
 qp_init:
-	rxe_drop_ref(qp);
+	rxe_drop_wait(qp);
 	return err;
 }
 
@@ -509,7 +510,7 @@  static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 
 	rxe_disable(qp);
 	rxe_qp_destroy(qp);
-	rxe_drop_ref(qp);
+	rxe_drop_wait(qp);
 	return 0;
 }
 
@@ -822,7 +823,7 @@  static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 
 
 	rxe_cq_disable(cq);
-	rxe_drop_ref(cq);
+	rxe_drop_wait(cq);
 
 	return 0;
 }
@@ -954,7 +955,7 @@  static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
 
 err3:
 	rxe_drop_ref(pd);
-	rxe_drop_ref(mr);
+	rxe_drop_wait(mr);
 err2:
 	return ERR_PTR(err);
 }
@@ -988,7 +989,7 @@  static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 
 err2:
 	rxe_drop_ref(pd);
-	rxe_drop_ref(mr);
+	rxe_drop_wait(mr);
 err1:
 	return ERR_PTR(err);
 }