diff mbox series

[for-next] RDMA/rxe: Fix "Replace red-black trees by xarrays"

Message ID 20220410223939.3769-1-rpearsonhpe@gmail.com (mailing list archive)
State Superseded
Headers show
Series [for-next] RDMA/rxe: Fix "Replace red-black trees by xarrays" | expand

Commit Message

Bob Pearson April 10, 2022, 10:39 p.m. UTC
The referenced commit causes lockdep warnings by using the
default spin_lock in xa_alloc_cyclic and xa_erase which
include calls to xa_lock()/xa_unlock() while at the same time
explicitly calling xa_lock_irqsave() in rxe_pool_get_index().

The latter is required to handle some object lookups correctly. The
immediate fix is to explicitly use xa_lock_irqsave() everywhere.

This commit replaces xa_alloc_cyclic() by __xa_alloc_cyclic() and
xa_erase() by __xa_erase() and explicitly lock these calls with
xa_lock_irqsave().

This commit will be reverted later when the read side operations
in rxe_pool.c will be converted to rcu_read_locks which will not
require locking the write side operations with irqsave locks.

This commit fixes the "WARNING: Inconsistent lock state" bug in
blktests. The recent revert patch from Bart fixes the other
bug in blktests with very long delays.

Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by carrays")
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

Comments

Bart Van Assche April 11, 2022, 3:06 a.m. UTC | #1
On 4/10/22 15:39, Bob Pearson wrote:
> Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by carrays")
                                                              ^^^^^^^
                                                              xarrays?

> @@ -138,8 +140,10 @@ void *rxe_alloc(struct rxe_pool *pool)
>   	elem->obj = obj;
>   	kref_init(&elem->ref_cnt);
>   
> -	err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
> +	xa_lock_irqsave(xa, flags);
> +	err = __xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
>   			      &pool->next, GFP_KERNEL);
> +	xa_unlock_irqrestore(xa, flags);

Please take a look at the xas_unlock_type() and xas_lock_type() calls in 
__xas_nomem(). I think that the above change will trigger a GFP_KERNEL 
allocation with interrupts disabled. My understanding is that GFP_KERNEL 
allocations may sleep and hence that the above code may cause 
__xas_nomem() to sleep with interrupts disabled. I don't think that is 
allowed.

Thanks,

Bart.
Bob Pearson April 11, 2022, 3:13 a.m. UTC | #2
On 4/10/22 22:06, Bart Van Assche wrote:
> On 4/10/22 15:39, Bob Pearson wrote:
>> Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by carrays")
>                                                              ^^^^^^^
>                                                              xarrays?
> 
>> @@ -138,8 +140,10 @@ void *rxe_alloc(struct rxe_pool *pool)
>>       elem->obj = obj;
>>       kref_init(&elem->ref_cnt);
>>   -    err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
>> +    xa_lock_irqsave(xa, flags);
>> +    err = __xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
>>                     &pool->next, GFP_KERNEL);
>> +    xa_unlock_irqrestore(xa, flags);
> 
> Please take a look at the xas_unlock_type() and xas_lock_type() calls in __xas_nomem(). I think that the above change will trigger a GFP_KERNEL allocation with interrupts disabled. My understanding is that GFP_KERNEL allocations may sleep and hence that the above code may cause __xas_nomem() to sleep with interrupts disabled. I don't think that is allowed.
> 
> Thanks,
> 
> Bart.

You're right. I missed that. Zhu wants to write the patch so hopefully he's on top of that.
For now we could use GFP_ATOMIC.

Bob
Zhu Yanjun April 11, 2022, 3:15 a.m. UTC | #3
On Mon, Apr 11, 2022 at 11:06 AM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On 4/10/22 15:39, Bob Pearson wrote:
> > Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by carrays")
>                                                               ^^^^^^^
>                                                               xarrays?
>
> > @@ -138,8 +140,10 @@ void *rxe_alloc(struct rxe_pool *pool)
> >       elem->obj = obj;
> >       kref_init(&elem->ref_cnt);
> >
> > -     err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
> > +     xa_lock_irqsave(xa, flags);
> > +     err = __xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
> >                             &pool->next, GFP_KERNEL);
> > +     xa_unlock_irqrestore(xa, flags);
>
> Please take a look at the xas_unlock_type() and xas_lock_type() calls in
> __xas_nomem(). I think that the above change will trigger a GFP_KERNEL
> allocation with interrupts disabled. My understanding is that GFP_KERNEL
> allocations may sleep and hence that the above code may cause
> __xas_nomem() to sleep with interrupts disabled. I don't think that is
> allowed.

Thanks. I will send V2.

Zhu Yanjun

>
> Thanks,
>
> Bart.
Jason Gunthorpe April 11, 2022, 11:38 a.m. UTC | #4
On Sun, Apr 10, 2022 at 10:13:16PM -0500, Bob Pearson wrote:
> On 4/10/22 22:06, Bart Van Assche wrote:
> > On 4/10/22 15:39, Bob Pearson wrote:
> >> Fixes: 3225717f6dfa ("RDMA/rxe: Replace red-black trees by carrays")
> >                                                              ^^^^^^^
> >                                                              xarrays?
> > 
> >> @@ -138,8 +140,10 @@ void *rxe_alloc(struct rxe_pool *pool)
> >>       elem->obj = obj;
> >>       kref_init(&elem->ref_cnt);
> >>   -    err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
> >> +    xa_lock_irqsave(xa, flags);
> >> +    err = __xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
> >>                     &pool->next, GFP_KERNEL);
> >> +    xa_unlock_irqrestore(xa, flags);
> > 
> > Please take a look at the xas_unlock_type() and xas_lock_type() calls in __xas_nomem(). I think that the above change will trigger a GFP_KERNEL allocation with interrupts disabled. My understanding is that GFP_KERNEL allocations may sleep and hence that the above code may cause __xas_nomem() to sleep with interrupts disabled. I don't think that is allowed.
> > 
> > Thanks,
> > 
> > Bart.
> 
> You're right. I missed that. Zhu wants to write the patch so hopefully he's on top of that.
> For now we could use GFP_ATOMIC.

Yes, you cannot use irq_save varients here. You have to know your
calling context is non-atomic already and use the irq wrapper.

Jason
Pearson, Robert B April 11, 2022, 3:52 p.m. UTC | #5
> Yes, you cannot use irq_save variants here. You have to know your calling context is non-atomic already and use the irq wrapper.

Not sure why. If you call irqsave in a non-atomic context doesn't it behave the same as irq? I.e. flags = 0.
xarray provides xa_lock_xxx() for people to use. Are you saying I have to call xa_alloc_cyclic_xxx() instead which is the same thing?

The problem is I've gotten gun shy about people calling into the verbs APIs in strange contexts. The rules don't seem to be written
down. If I could be certain that no one ever is allowed to call a verbs API in an interrupt then this is correct but how do I know that?

Bob
Jason Gunthorpe April 11, 2022, 4:02 p.m. UTC | #6
On Mon, Apr 11, 2022 at 03:52:23PM +0000, Pearson, Robert B wrote:
> 
> > Yes, you cannot use irq_save variants here. You have to know your calling context is non-atomic already and use the irq wrapper.
> 
> Not sure why. If you call irqsave in a non-atomic context doesn't it
> behave the same as irq? I.e. flags = 0.  xarray provides
> xa_lock_xxx() for people to use. Are you saying I have to call
> xa_alloc_cyclic_xxx() instead which is the same thing?

The xa_lock is a magic thing, when you call a __xa_*(.., GFP_KERNEL)
type function then it will unlock and relock the xa_lock internally.

Thus you cannot wrapper an irqsave lock across the GFP_KERNEL call
sites because XA doesn't know the 'flags' to be able to properly
unlock it.

> The problem is I've gotten gun shy about people calling into the
> verbs APIs in strange contexts. The rules don't seem to be written
> down. If I could be certain that no one ever is allowed to call a
> verbs API in an interrupt then this is correct but how do I know
> that?

rxe used GFP_KERNEL so it already has assumed it is a process context.

Jason
Zhu Yanjun April 12, 2022, 1:32 p.m. UTC | #7
在 2022/4/12 0:02, Jason Gunthorpe 写道:
> On Mon, Apr 11, 2022 at 03:52:23PM +0000, Pearson, Robert B wrote:
>>
>>> Yes, you cannot use irq_save variants here. You have to know your calling context is non-atomic already and use the irq wrapper.
>>
>> Not sure why. If you call irqsave in a non-atomic context doesn't it
>> behave the same as irq? I.e. flags = 0.  xarray provides
>> xa_lock_xxx() for people to use. Are you saying I have to call
>> xa_alloc_cyclic_xxx() instead which is the same thing?
> 
> The xa_lock is a magic thing, when you call a __xa_*(.., GFP_KERNEL)
> type function then it will unlock and relock the xa_lock internally.
> 
> Thus you cannot wrapper an irqsave lock across the GFP_KERNEL call
> sites because XA doesn't know the 'flags' to be able to properly
> unlock it.
> 
>> The problem is I've gotten gun shy about people calling into the
>> verbs APIs in strange contexts. The rules don't seem to be written
>> down. If I could be certain that no one ever is allowed to call a
>> verbs API in an interrupt then this is correct but how do I know
>> that?
> 
> rxe used GFP_KERNEL so it already has assumed it is a process context.
Got it.

__xa_alloc_cyclic(..., GFP_NOWAIT) will unlock xa lock internally. But 
it does not handle flags. So the irq is still disabled. Because 
GFP_NOWAIT is used in __xa_alloc_cyclic, it will not sleep.

__xa_alloc_cyclic will lock xa lock and return to xa_unlock_irqrestore.

So the following code should be OK?

         xa_lock_irqsave(&pool->xa, flags);
         err = __xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
                                 &pool->next, GFP_NOWAIT);
         xa_unlock_irqrestore(&pool->xa, flags);


Zhu Yanjun

> 
> Jason
Jason Gunthorpe April 12, 2022, 1:39 p.m. UTC | #8
On Tue, Apr 12, 2022 at 09:32:25PM +0800, Yanjun Zhu wrote:
> 在 2022/4/12 0:02, Jason Gunthorpe 写道:
> > On Mon, Apr 11, 2022 at 03:52:23PM +0000, Pearson, Robert B wrote:
> > > 
> > > > Yes, you cannot use irq_save variants here. You have to know your calling context is non-atomic already and use the irq wrapper.
> > > 
> > > Not sure why. If you call irqsave in a non-atomic context doesn't it
> > > behave the same as irq? I.e. flags = 0.  xarray provides
> > > xa_lock_xxx() for people to use. Are you saying I have to call
> > > xa_alloc_cyclic_xxx() instead which is the same thing?
> > 
> > The xa_lock is a magic thing, when you call a __xa_*(.., GFP_KERNEL)
> > type function then it will unlock and relock the xa_lock internally.
> > 
> > Thus you cannot wrapper an irqsave lock across the GFP_KERNEL call
> > sites because XA doesn't know the 'flags' to be able to properly
> > unlock it.
> > 
> > > The problem is I've gotten gun shy about people calling into the
> > > verbs APIs in strange contexts. The rules don't seem to be written
> > > down. If I could be certain that no one ever is allowed to call a
> > > verbs API in an interrupt then this is correct but how do I know
> > > that?
> > 
> > rxe used GFP_KERNEL so it already has assumed it is a process context.
> Got it.
> 
> __xa_alloc_cyclic(..., GFP_NOWAIT) will unlock xa lock internally. But it
> does not handle flags. So the irq is still disabled. Because GFP_NOWAIT is
> used in __xa_alloc_cyclic, it will not sleep.
> 
> __xa_alloc_cyclic will lock xa lock and return to xa_unlock_irqrestore.
> 
> So the following code should be OK?
> 
>         xa_lock_irqsave(&pool->xa, flags);
>         err = __xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
>                                 &pool->next, GFP_NOWAIT);
>         xa_unlock_irqrestore(&pool->xa, flags);

No, use GFP_KERNEL an use the proper irq or bh version of
xa_alloc_cyclic.

save is only required if you don't know the contex you are in, and
here we know we are in a process context.

Jason
Zhu Yanjun April 12, 2022, 1:49 p.m. UTC | #9
在 2022/4/12 21:39, Jason Gunthorpe 写道:
> On Tue, Apr 12, 2022 at 09:32:25PM +0800, Yanjun Zhu wrote:
>> 在 2022/4/12 0:02, Jason Gunthorpe 写道:
>>> On Mon, Apr 11, 2022 at 03:52:23PM +0000, Pearson, Robert B wrote:
>>>>> Yes, you cannot use irq_save variants here. You have to know your calling context is non-atomic already and use the irq wrapper.
>>>> Not sure why. If you call irqsave in a non-atomic context doesn't it
>>>> behave the same as irq? I.e. flags = 0.  xarray provides
>>>> xa_lock_xxx() for people to use. Are you saying I have to call
>>>> xa_alloc_cyclic_xxx() instead which is the same thing?
>>> The xa_lock is a magic thing, when you call a __xa_*(.., GFP_KERNEL)
>>> type function then it will unlock and relock the xa_lock internally.
>>>
>>> Thus you cannot wrapper an irqsave lock across the GFP_KERNEL call
>>> sites because XA doesn't know the 'flags' to be able to properly
>>> unlock it.
>>>
>>>> The problem is I've gotten gun shy about people calling into the
>>>> verbs APIs in strange contexts. The rules don't seem to be written
>>>> down. If I could be certain that no one ever is allowed to call a
>>>> verbs API in an interrupt then this is correct but how do I know
>>>> that?
>>> rxe used GFP_KERNEL so it already has assumed it is a process context.
>> Got it.
>>
>> __xa_alloc_cyclic(..., GFP_NOWAIT) will unlock xa lock internally. But it
>> does not handle flags. So the irq is still disabled. Because GFP_NOWAIT is
>> used in __xa_alloc_cyclic, it will not sleep.
>>
>> __xa_alloc_cyclic will lock xa lock and return to xa_unlock_irqrestore.
>>
>> So the following code should be OK?
>>
>>          xa_lock_irqsave(&pool->xa, flags);
>>          err = __xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
>>                                  &pool->next, GFP_NOWAIT);
>>          xa_unlock_irqrestore(&pool->xa, flags);
> No, use GFP_KERNEL an use the proper irq or bh version of
> xa_alloc_cyclic.
>
> save is only required if you don't know the contex you are in, and
> here we know we are in a process context.

if we use xa_lock_irq/xa_unlock_irq or xa_lock_bh/xa_unlock_bh instead 
of xa_unlock_irqrestore,

the warning as below will appear. This means that __rxe_add_to_pool 
disables softirq, but fpu_clone enables softirq.

"
Apr 12 16:24:53 kernel: softirqs last  enabled at (13086): 
[<ffffffff91830d26>] fpu_clone+0xf6/0x570
Apr 12 16:24:53 kernel: softirqs last disabled at (13129): 
[<ffffffffc077f319>] __rxe_add_to_pool+0x49/0xa0 [rdma_rxe]
"

As such, it is better to use xa_unlock_irqrestore + 
__xa_alloc(...,GFP_ATOMIC/GFP_NOWAIT).

Zhu Yanjun

>
> Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 87066d04ed18..440f96af213b 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -118,7 +118,9 @@  void rxe_pool_cleanup(struct rxe_pool *pool)
 
 void *rxe_alloc(struct rxe_pool *pool)
 {
+	struct xarray *xa = &pool->xa;
 	struct rxe_pool_elem *elem;
+	unsigned long flags;
 	void *obj;
 	int err;
 
@@ -138,8 +140,10 @@  void *rxe_alloc(struct rxe_pool *pool)
 	elem->obj = obj;
 	kref_init(&elem->ref_cnt);
 
-	err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
+	xa_lock_irqsave(xa, flags);
+	err = __xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
 			      &pool->next, GFP_KERNEL);
+	xa_unlock_irqrestore(xa, flags);
 	if (err)
 		goto err_free;
 
@@ -154,6 +158,8 @@  void *rxe_alloc(struct rxe_pool *pool)
 
 int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem)
 {
+	struct xarray *xa = &pool->xa;
+	unsigned long flags;
 	int err;
 
 	if (WARN_ON(pool->flags & RXE_POOL_ALLOC))
@@ -166,8 +172,10 @@  int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem)
 	elem->obj = (u8 *)elem - pool->elem_offset;
 	kref_init(&elem->ref_cnt);
 
-	err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
+	xa_lock_irqsave(xa, flags);
+	err = __xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit,
 			      &pool->next, GFP_KERNEL);
+	xa_unlock_irqrestore(xa, flags);
 	if (err)
 		goto err_cnt;
 
@@ -200,8 +208,12 @@  static void rxe_elem_release(struct kref *kref)
 {
 	struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt);
 	struct rxe_pool *pool = elem->pool;
+	struct xarray *xa = &pool->xa;
+	unsigned long flags;
 
-	xa_erase(&pool->xa, elem->index);
+	xa_lock_irqsave(xa, flags);
+	__xa_erase(&pool->xa, elem->index);
+	xa_unlock_irqrestore(xa, flags);
 
 	if (pool->cleanup)
 		pool->cleanup(elem);