
[2/6] RDMA/mlx5: Fix a race with mlx5_ib_update_xlt on an implicit MR

Message ID 20191001153821.23621-3-jgg@ziepe.ca (mailing list archive)
State Mainlined
Commit f28b1932eaae183b80bd8c7abecae167a0e5c61a
Delegated to: Jason Gunthorpe
Series: Bug fixes for odp

Commit Message

Jason Gunthorpe Oct. 1, 2019, 3:38 p.m. UTC
From: Jason Gunthorpe <jgg@mellanox.com>

mlx5_ib_update_xlt() must be protected against parallel free of the MR it
is accessing; it must also be called single threaded while updating the
HW. Otherwise we can have races of the form:

    CPU0                               CPU1
  mlx5_ib_update_xlt()
   mlx5_odp_populate_klm()
     odp_lookup() == NULL
     pklm = ZAP
                                      implicit_mr_get_data()
 				        implicit_mr_alloc()
 					  <update interval tree>
					mlx5_ib_update_xlt
					  mlx5_odp_populate_klm()
					    odp_lookup() != NULL
					    pklm = VALID
					   mlx5_ib_post_send_wait()

    mlx5_ib_post_send_wait() // Replaces VALID with ZAP

This can be solved by putting both the SRCU and the umem_mutex lock around
every call to mlx5_ib_update_xlt(). This ensures that the content of the
interval tree relevant to mlx5_odp_populate_klm() (ie mr->parent == mr)
will not change while it is running, and thus the posted WRs to update the
KLM will always reflect the correct information.

The race above resolves either by CPU1 waiting until CPU0 completes the
ZAP, or by CPU0 running after the add and storing VALID instead.

The pagefault path adding children already holds the umem_mutex and SRCU,
so the only missed lock is during MR destruction.

Fixes: 81713d3788d2 ("IB/mlx5: Add implicit MR support")
Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/odp.c | 34 ++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)
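
For reference, a condensed sketch of the destroy-side locking described in the
commit message, using the same calls that appear in the Patch section below.
The helper name zap_parent_entry_sketch() and its exact parameters are
illustrative only and are not part of the patch:

/*
 * Sketch only (assumes the drivers/infiniband/hw/mlx5 context): hold both
 * the SRCU read lock and the parent implicit MR's umem_mutex around the XLT
 * update, so mlx5_odp_populate_klm() sees a stable interval tree and a stale
 * ZAP cannot overwrite a VALID entry written by a racing pagefault.
 */
static void zap_parent_entry_sketch(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_mr *imr, int idx)
{
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	int srcu_key;

	srcu_key = srcu_read_lock(&dev->mr_srcu);
	mutex_lock(&odp_imr->umem_mutex);
	mlx5_ib_update_xlt(imr, idx, 1, 0,
			   MLX5_IB_UPD_XLT_INDIRECT |
			   MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&odp_imr->umem_mutex);
	srcu_read_unlock(&dev->mr_srcu, srcu_key);
}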

Comments

Leon Romanovsky Oct. 2, 2019, 8:18 a.m. UTC | #1
On Tue, Oct 01, 2019 at 12:38:17PM -0300, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
>
> mlx5_ib_update_xlt() must be protected against parallel free of the MR it
> is accessing; it must also be called single threaded while updating the
> HW. Otherwise we can have races of the form:
>
>     CPU0                               CPU1
>   mlx5_ib_update_xlt()
>    mlx5_odp_populate_klm()
>      odp_lookup() == NULL
>      pklm = ZAP
>                                       implicit_mr_get_data()
>  				        implicit_mr_alloc()
>  					  <update interval tree>
> 					mlx5_ib_update_xlt
> 					  mlx5_odp_populate_klm()
> 					    odp_lookup() != NULL
> 					    pklm = VALID
> 					   mlx5_ib_post_send_wait()
>
>     mlx5_ib_post_send_wait() // Replaces VALID with ZAP
>
> This can be solved by putting both the SRCU and the umem_mutex lock around
> every call to mlx5_ib_update_xlt(). This ensures that the content of the
> interval tree relevant to mlx5_odp_populate_klm() (ie mr->parent == mr)
> will not change while it is running, and thus the posted WRs to update the
> KLM will always reflect the correct information.
>
> The race above resolves either by CPU1 waiting until CPU0 completes the
> ZAP, or by CPU0 running after the add and storing VALID instead.
>
> The pagefault path adding children already holds the umem_mutex and SRCU,
> so the only missed lock is during MR destruction.
>
> Fixes: 81713d3788d2 ("IB/mlx5: Add implicit MR support")
> Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> ---
>  drivers/infiniband/hw/mlx5/odp.c | 34 ++++++++++++++++++++++++++++++--
>  1 file changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
> index 2e9b4306179745..3401c06b7e54f5 100644
> --- a/drivers/infiniband/hw/mlx5/odp.c
> +++ b/drivers/infiniband/hw/mlx5/odp.c
> @@ -178,6 +178,29 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
>  		return;
>  	}
>
> +	/*
> +	 * The locking here is pretty subtle. Ideally the implicit children
> +	 * list would be protected by the umem_mutex, however that is not
> +	 * possible. Instead this uses a weaker update-then-lock pattern:
> +	 *
> +	 *  srcu_read_lock()
> +	 *    <change children list>
> +	 *    mutex_lock(umem_mutex)
> +	 *     mlx5_ib_update_xlt()
> +	 *    mutex_unlock(umem_mutex)
> +	 *    destroy lkey
> +	 *
> +	 * ie any change to the children list must be followed by the locked
> +	 * update_xlt before destroying.
> +	 *
> +	 * The umem_mutex provides the acquire/release semantic needed to make
> +	 * the children list visible to a racing thread. While SRCU is not
> +	 * technically required, using it gives consistent use of the SRCU
> +	 * locking around the children list.
> +	 */
> +	lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex);
> +	lockdep_assert_held(&mr->dev->mr_srcu);
> +
>  	odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
>  			 nentries * MLX5_IMR_MTT_SIZE, mr);
>
> @@ -202,15 +225,22 @@ static void mr_leaf_free_action(struct work_struct *work)
>  	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
>  	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
>  	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
> +	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
> +	int srcu_key;
>
>  	mr->parent = NULL;
>  	synchronize_srcu(&mr->dev->mr_srcu);

Are you sure that this line is still needed?

>
> -	ib_umem_odp_release(odp);
> -	if (imr->live)
> +	if (imr->live) {
> +		srcu_key = srcu_read_lock(&mr->dev->mr_srcu);
> +		mutex_lock(&odp_imr->umem_mutex);
>  		mlx5_ib_update_xlt(imr, idx, 1, 0,
>  				   MLX5_IB_UPD_XLT_INDIRECT |
>  				   MLX5_IB_UPD_XLT_ATOMIC);
> +		mutex_unlock(&odp_imr->umem_mutex);
> +		srcu_read_unlock(&mr->dev->mr_srcu, srcu_key);
> +	}
> +	ib_umem_odp_release(odp);
>  	mlx5_mr_cache_free(mr->dev, mr);
>
>  	if (atomic_dec_and_test(&imr->num_leaf_free))
> --
> 2.23.0
>
Jason Gunthorpe Oct. 2, 2019, 2:39 p.m. UTC | #2
On Wed, Oct 02, 2019 at 11:18:26AM +0300, Leon Romanovsky wrote:
> > @@ -202,15 +225,22 @@ static void mr_leaf_free_action(struct work_struct *work)
> >  	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
> >  	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
> >  	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
> > +	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
> > +	int srcu_key;
> >
> >  	mr->parent = NULL;
> >  	synchronize_srcu(&mr->dev->mr_srcu);
> 
> Are you sure that this line is still needed?

Yes, in this case the mr->parent is the SRCU 'update' and it blocks
seeing this MR in the pagefault handler.

It is necessary before calling ib_umem_odp_release below that frees
the memory

Jason
Leon Romanovsky Oct. 2, 2019, 3:41 p.m. UTC | #3
On Wed, Oct 02, 2019 at 11:39:28AM -0300, Jason Gunthorpe wrote:
> On Wed, Oct 02, 2019 at 11:18:26AM +0300, Leon Romanovsky wrote:
> > > @@ -202,15 +225,22 @@ static void mr_leaf_free_action(struct work_struct *work)
> > >  	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
> > >  	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
> > >  	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
> > > +	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
> > > +	int srcu_key;
> > >
> > >  	mr->parent = NULL;
> > >  	synchronize_srcu(&mr->dev->mr_srcu);
> >
> > Are you sure that this line is still needed?
>
> Yes, in this case the mr->parent is the SRCU 'update' and it blocks
> seeing this MR in the pagefault handler.
>
> It is necessary before calling ib_umem_odp_release below that frees
> the memory

sorry for not being clear, I thought that synchronize_srcu() should be
moved after your read_lock/unlock additions to reuse grace period.

Thanks

>
> Jason
Jason Gunthorpe Oct. 3, 2019, 12:48 p.m. UTC | #4
On Wed, Oct 02, 2019 at 06:41:14PM +0300, Leon Romanovsky wrote:
> On Wed, Oct 02, 2019 at 11:39:28AM -0300, Jason Gunthorpe wrote:
> > On Wed, Oct 02, 2019 at 11:18:26AM +0300, Leon Romanovsky wrote:
> > > > @@ -202,15 +225,22 @@ static void mr_leaf_free_action(struct work_struct *work)
> > > >  	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
> > > >  	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
> > > >  	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
> > > > +	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
> > > > +	int srcu_key;
> > > >
> > > >  	mr->parent = NULL;
> > > >  	synchronize_srcu(&mr->dev->mr_srcu);
> > >
> > > Are you sure that this line is still needed?
> >
> > Yes, in this case the mr->parent is the SRCU 'update' and it blocks
> > seeing this MR in the pagefault handler.
> >
> > It is necessary before calling ib_umem_odp_release below that frees
> > the memory
> 
> sorry for not being clear, I thought that synchronize_srcu() should be
> moved after your read_lock/unlock additions to reuse grace period.

It has to come before, to ensure that the page fault handler does not
undo the invalidate below with new pages.

Jason
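
Putting the thread together, the resulting order in mr_leaf_free_action() is
shown below, condensed from the hunks in the Patch section; the explanatory
comments summarize the points made above and are not in the patch itself:

	mr->parent = NULL;                   /* unpublish the child MR */
	synchronize_srcu(&mr->dev->mr_srcu); /* wait out pagefault handlers that
	                                      * already saw the child, so none can
	                                      * re-fill pages after the ZAP below */

	if (imr->live) {
		srcu_key = srcu_read_lock(&mr->dev->mr_srcu);
		mutex_lock(&odp_imr->umem_mutex);
		mlx5_ib_update_xlt(imr, idx, 1, 0,     /* ZAP the parent's KLM entry */
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
		mutex_unlock(&odp_imr->umem_mutex);
		srcu_read_unlock(&mr->dev->mr_srcu, srcu_key);
	}
	ib_umem_odp_release(odp);            /* only now is the child's memory freed */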

Patch

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 2e9b4306179745..3401c06b7e54f5 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -178,6 +178,29 @@  void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
 		return;
 	}
 
+	/*
+	 * The locking here is pretty subtle. Ideally the implicit children
+	 * list would be protected by the umem_mutex, however that is not
+	 * possible. Instead this uses a weaker update-then-lock pattern:
+	 *
+	 *  srcu_read_lock()
+	 *    <change children list>
+	 *    mutex_lock(umem_mutex)
+	 *     mlx5_ib_update_xlt()
+	 *    mutex_unlock(umem_mutex)
+	 *    destroy lkey
+	 *
+	 * ie any change to the children list must be followed by the locked
+	 * update_xlt before destroying.
+	 *
+	 * The umem_mutex provides the acquire/release semantic needed to make
+	 * the children list visible to a racing thread. While SRCU is not
+	 * technically required, using it gives consistent use of the SRCU
+	 * locking around the children list.
+	 */
+	lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex);
+	lockdep_assert_held(&mr->dev->mr_srcu);
+
 	odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
 			 nentries * MLX5_IMR_MTT_SIZE, mr);
 
@@ -202,15 +225,22 @@  static void mr_leaf_free_action(struct work_struct *work)
 	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
 	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+	int srcu_key;
 
 	mr->parent = NULL;
 	synchronize_srcu(&mr->dev->mr_srcu);
 
-	ib_umem_odp_release(odp);
-	if (imr->live)
+	if (imr->live) {
+		srcu_key = srcu_read_lock(&mr->dev->mr_srcu);
+		mutex_lock(&odp_imr->umem_mutex);
 		mlx5_ib_update_xlt(imr, idx, 1, 0,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ATOMIC);
+		mutex_unlock(&odp_imr->umem_mutex);
+		srcu_read_unlock(&mr->dev->mr_srcu, srcu_key);
+	}
+	ib_umem_odp_release(odp);
 	mlx5_mr_cache_free(mr->dev, mr);
 
 	if (atomic_dec_and_test(&imr->num_leaf_free))