diff mbox series

[net-next,v2] Avoid traversing addrconf hash on ifdown

Message ID 20241108052559.2926114-1-gnaaman@drivenets.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next,v2] Avoid traversing addrconf hash on ifdown | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 57 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-11-11--21-00 (tests: 787)

Commit Message

Gilad Naaman Nov. 8, 2024, 5:25 a.m. UTC
struct inet6_dev already has a list of addresses owned by the device,
enabling us to traverse this much shorter list, instead of scanning
the entire hash-table.

Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
---
Changes in v2:
 - Remove double BH sections
 - Styling fixes (extra {}, extra newline)
---
 net/ipv6/addrconf.c | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

Comments

Vadim Fedorenko Nov. 9, 2024, 3 p.m. UTC | #1
On 08/11/2024 05:25, Gilad Naaman wrote:
> struct inet6_dev already has a list of addresses owned by the device,
> enabling us to traverse this much shorter list, instead of scanning
> the entire hash-table.
> 
> Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
> ---
> Changes in v2:
>   - Remove double BH sections
>   - Styling fixes (extra {}, extra newline)
> ---
>   net/ipv6/addrconf.c | 38 +++++++++++++++++---------------------
>   1 file changed, 17 insertions(+), 21 deletions(-)
> 
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index d0a99710d65d..c6fbd634912a 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
> @@ -3846,12 +3846,12 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister)
>   {
>   	unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
>   	struct net *net = dev_net(dev);
> -	struct inet6_dev *idev;
>   	struct inet6_ifaddr *ifa;
>   	LIST_HEAD(tmp_addr_list);
> +	struct inet6_dev *idev;
>   	bool keep_addr = false;
>   	bool was_ready;
> -	int state, i;
> +	int state;
>   
>   	ASSERT_RTNL();
>   
> @@ -3890,28 +3890,24 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister)
>   	}
>   
>   	/* Step 2: clear hash table */
> -	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
> -		struct hlist_head *h = &net->ipv6.inet6_addr_lst[i];
> +	read_lock_bh(&idev->lock);
 > +	spin_lock(&net->ipv6.addrconf_hash_lock);>
> -		spin_lock_bh(&net->ipv6.addrconf_hash_lock);
> -restart:
> -		hlist_for_each_entry_rcu(ifa, h, addr_lst) {
> -			if (ifa->idev == idev) {
> -				addrconf_del_dad_work(ifa);
> -				/* combined flag + permanent flag decide if
> -				 * address is retained on a down event
> -				 */
> -				if (!keep_addr ||
> -				    !(ifa->flags & IFA_F_PERMANENT) ||
> -				    addr_is_local(&ifa->addr)) {
> -					hlist_del_init_rcu(&ifa->addr_lst);
> -					goto restart;
> -				}
> -			}
> -		}
> -		spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
> +	list_for_each_entry(ifa, &idev->addr_list, if_list) {
> +		addrconf_del_dad_work(ifa);
> +
> +		/* combined flag + permanent flag decide if
> +		 * address is retained on a down event
> +		 */
> +		if (!keep_addr ||
> +		    !(ifa->flags & IFA_F_PERMANENT) ||
> +		    addr_is_local(&ifa->addr))
> +			hlist_del_init_rcu(&ifa->addr_lst);
>   	}
>   
> +	spin_unlock(&net->ipv6.addrconf_hash_lock);
> +	read_unlock_bh(&idev->lock);

Why is this read lock needed here? spinlock addrconf_hash_lock will
block any RCU grace period to happen, so we can safely traverse
idev->addr_list with list_for_each_entry_rcu()...

> +
>   	write_lock_bh(&idev->lock);

if we are trying to protect idev->addr_list against addition, then we
have to extend write_lock scope. Otherwise it may happen that another
thread will grab write lock between read_unlock and write_lock.

Am I missing something?
Gilad Naaman Nov. 10, 2024, 6:53 a.m. UTC | #2
> > -		spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
> > +	list_for_each_entry(ifa, &idev->addr_list, if_list) {
> > +		addrconf_del_dad_work(ifa);
> > +
> > +		/* combined flag + permanent flag decide if
> > +		 * address is retained on a down event
> > +		 */
> > +		if (!keep_addr ||
> > +		    !(ifa->flags & IFA_F_PERMANENT) ||
> > +		    addr_is_local(&ifa->addr))
> > +			hlist_del_init_rcu(&ifa->addr_lst);
> >   	}
> >   
> > +	spin_unlock(&net->ipv6.addrconf_hash_lock);
> > +	read_unlock_bh(&idev->lock);
> 
> Why is this read lock needed here? spinlock addrconf_hash_lock will
> block any RCU grace period to happen, so we can safely traverse
> idev->addr_list with list_for_each_entry_rcu()...

Oh, sorry, I didn't realize the hash lock encompasses this one;
although it seems obvious in retrospect.

> > +
> >   	write_lock_bh(&idev->lock);
> 
> if we are trying to protect idev->addr_list against addition, then we
> have to extend write_lock scope. Otherwise it may happen that another
> thread will grab write lock between read_unlock and write_lock.
> 
> Am I missing something?

I wanted to ensure that access to `idev->addr_list` is performed under lock,
the same way it is done immediately afterwards;
No particular reason not to extend the existing lock, I just didn't think
about it.

For what it's worth, the original code didn't have this protection either,
since the another thread could have grabbed the lock between
`spin_unlock_bh(&net->ipv6.addrconf_hash_lock);` of the last loop iteration,
and the `write_lock`.

Should I extend the write_lock upwards, or just leave it off?

Thank you for your time,
Gilad
Vadim Fedorenko Nov. 10, 2024, 10:31 p.m. UTC | #3
On 10/11/2024 06:53, Gilad Naaman wrote:
>>> -		spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
>>> +	list_for_each_entry(ifa, &idev->addr_list, if_list) {
>>> +		addrconf_del_dad_work(ifa);
>>> +
>>> +		/* combined flag + permanent flag decide if
>>> +		 * address is retained on a down event
>>> +		 */
>>> +		if (!keep_addr ||
>>> +		    !(ifa->flags & IFA_F_PERMANENT) ||
>>> +		    addr_is_local(&ifa->addr))
>>> +			hlist_del_init_rcu(&ifa->addr_lst);
>>>    	}
>>>    
>>> +	spin_unlock(&net->ipv6.addrconf_hash_lock);
>>> +	read_unlock_bh(&idev->lock);
>>
>> Why is this read lock needed here? spinlock addrconf_hash_lock will
>> block any RCU grace period to happen, so we can safely traverse
>> idev->addr_list with list_for_each_entry_rcu()...
> 
> Oh, sorry, I didn't realize the hash lock encompasses this one;
> although it seems obvious in retrospect.
> 
>>> +
>>>    	write_lock_bh(&idev->lock);
>>
>> if we are trying to protect idev->addr_list against addition, then we
>> have to extend write_lock scope. Otherwise it may happen that another
>> thread will grab write lock between read_unlock and write_lock.
>>
>> Am I missing something?
> 
> I wanted to ensure that access to `idev->addr_list` is performed under lock,
> the same way it is done immediately afterwards;
> No particular reason not to extend the existing lock, I just didn't think
> about it.
> 
> For what it's worth, the original code didn't have this protection either,
> since the another thread could have grabbed the lock between
> `spin_unlock_bh(&net->ipv6.addrconf_hash_lock);` of the last loop iteration,
> and the `write_lock`.
> 
> Should I extend the write_lock upwards, or just leave it off?

Well, you are doing write manipulation with the list, which is protected
by read-write lock. I would expect this lock to be held in write mode. 
And you have to protect hash map at the same time. So yes, write_lock
and spin_lock altogether, I believe.

> 
> Thank you for your time,
> Gilad
Gilad Naaman Nov. 11, 2024, 5:21 a.m. UTC | #4
> On 10/11/2024 06:53, Gilad Naaman wrote:
> >>> -           spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
> >>> +   list_for_each_entry(ifa, &idev->addr_list, if_list) {
> >>> +           addrconf_del_dad_work(ifa);
> >>> +
> >>> +           /* combined flag + permanent flag decide if
> >>> +            * address is retained on a down event
> >>> +            */
> >>> +           if (!keep_addr ||
> >>> +               !(ifa->flags & IFA_F_PERMANENT) ||
> >>> +               addr_is_local(&ifa->addr))
> >>> +                   hlist_del_init_rcu(&ifa->addr_lst);
> >>>     }
> >>>
> >>> +   spin_unlock(&net->ipv6.addrconf_hash_lock);
> >>> +   read_unlock_bh(&idev->lock);
> >>
> >> Why is this read lock needed here? spinlock addrconf_hash_lock will
> >> block any RCU grace period to happen, so we can safely traverse
> >> idev->addr_list with list_for_each_entry_rcu()...
> >
> > Oh, sorry, I didn't realize the hash lock encompasses this one;
> > although it seems obvious in retrospect.
> >
> >>> +
> >>>     write_lock_bh(&idev->lock);
> >>
> >> if we are trying to protect idev->addr_list against addition, then we
> >> have to extend write_lock scope. Otherwise it may happen that another
> >> thread will grab write lock between read_unlock and write_lock.
> >>
> >> Am I missing something?
> >
> > I wanted to ensure that access to `idev->addr_list` is performed under lock,
> > the same way it is done immediately afterwards;
> > No particular reason not to extend the existing lock, I just didn't think
> > about it.
> >
> > For what it's worth, the original code didn't have this protection either,
> > since the another thread could have grabbed the lock between
> > `spin_unlock_bh(&net->ipv6.addrconf_hash_lock);` of the last loop iteration,
> > and the `write_lock`.
> >
> > Should I extend the write_lock upwards, or just leave it off?
> 
> Well, you are doing write manipulation with the list, which is protected
> by read-write lock. I would expect this lock to be held in write mode.
> And you have to protect hash map at the same time. So yes, write_lock
> and spin_lock altogether, I believe.
> 

Note that within the changed lines, the list itself is only iterated-on,
not manipulated.
The changes are to the `addr_lst` list, which is the hashtable, not the
list this lock protects.

I'll send v3 with the write-lock extended.
Thank you!
Vadim Fedorenko Nov. 11, 2024, 12:07 p.m. UTC | #5
On 11/11/2024 05:21, Gilad Naaman wrote:
>> On 10/11/2024 06:53, Gilad Naaman wrote:
>>>>> -           spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
>>>>> +   list_for_each_entry(ifa, &idev->addr_list, if_list) {
>>>>> +           addrconf_del_dad_work(ifa);
>>>>> +
>>>>> +           /* combined flag + permanent flag decide if
>>>>> +            * address is retained on a down event
>>>>> +            */
>>>>> +           if (!keep_addr ||
>>>>> +               !(ifa->flags & IFA_F_PERMANENT) ||
>>>>> +               addr_is_local(&ifa->addr))
>>>>> +                   hlist_del_init_rcu(&ifa->addr_lst);
>>>>>      }
>>>>>
>>>>> +   spin_unlock(&net->ipv6.addrconf_hash_lock);
>>>>> +   read_unlock_bh(&idev->lock);
>>>>
>>>> Why is this read lock needed here? spinlock addrconf_hash_lock will
>>>> block any RCU grace period to happen, so we can safely traverse
>>>> idev->addr_list with list_for_each_entry_rcu()...
>>>
>>> Oh, sorry, I didn't realize the hash lock encompasses this one;
>>> although it seems obvious in retrospect.
>>>
>>>>> +
>>>>>      write_lock_bh(&idev->lock);
>>>>
>>>> if we are trying to protect idev->addr_list against addition, then we
>>>> have to extend write_lock scope. Otherwise it may happen that another
>>>> thread will grab write lock between read_unlock and write_lock.
>>>>
>>>> Am I missing something?
>>>
>>> I wanted to ensure that access to `idev->addr_list` is performed under lock,
>>> the same way it is done immediately afterwards;
>>> No particular reason not to extend the existing lock, I just didn't think
>>> about it.
>>>
>>> For what it's worth, the original code didn't have this protection either,
>>> since the another thread could have grabbed the lock between
>>> `spin_unlock_bh(&net->ipv6.addrconf_hash_lock);` of the last loop iteration,
>>> and the `write_lock`.
>>>
>>> Should I extend the write_lock upwards, or just leave it off?
>>
>> Well, you are doing write manipulation with the list, which is protected
>> by read-write lock. I would expect this lock to be held in write mode.
>> And you have to protect hash map at the same time. So yes, write_lock
>> and spin_lock altogether, I believe.
>>
> 
> Note that within the changed lines, the list itself is only iterated-on,
> not manipulated.
> The changes are to the `addr_lst` list, which is the hashtable, not the
> list this lock protects.
> 
> I'll send v3 with the write-lock extended.
> Thank you!

Reading it one more time, I'm not quite sure that locking hashmap
spinlock under idev->lock in write mode is a good idea... We have to
think more about it, maybe ask for another opinion. Looks like RTNL
should protect idev->addr_list from modification while idev->lock is
more about changes to idev, not only about addr_list.

@Eric could you please shed some light on the locking schema here?
Paolo Abeni Nov. 12, 2024, 2:41 p.m. UTC | #6
On 11/11/24 13:07, Vadim Fedorenko wrote:
> On 11/11/2024 05:21, Gilad Naaman wrote:
>>> On 10/11/2024 06:53, Gilad Naaman wrote:
>>>>>> -           spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
>>>>>> +   list_for_each_entry(ifa, &idev->addr_list, if_list) {
>>>>>> +           addrconf_del_dad_work(ifa);
>>>>>> +
>>>>>> +           /* combined flag + permanent flag decide if
>>>>>> +            * address is retained on a down event
>>>>>> +            */
>>>>>> +           if (!keep_addr ||
>>>>>> +               !(ifa->flags & IFA_F_PERMANENT) ||
>>>>>> +               addr_is_local(&ifa->addr))
>>>>>> +                   hlist_del_init_rcu(&ifa->addr_lst);
>>>>>>      }
>>>>>>
>>>>>> +   spin_unlock(&net->ipv6.addrconf_hash_lock);
>>>>>> +   read_unlock_bh(&idev->lock);
>>>>>
>>>>> Why is this read lock needed here? spinlock addrconf_hash_lock will
>>>>> block any RCU grace period to happen, so we can safely traverse
>>>>> idev->addr_list with list_for_each_entry_rcu()...
>>>>
>>>> Oh, sorry, I didn't realize the hash lock encompasses this one;
>>>> although it seems obvious in retrospect.
>>>>
>>>>>> +
>>>>>>      write_lock_bh(&idev->lock);
>>>>>
>>>>> if we are trying to protect idev->addr_list against addition, then we
>>>>> have to extend write_lock scope. Otherwise it may happen that another
>>>>> thread will grab write lock between read_unlock and write_lock.
>>>>>
>>>>> Am I missing something?
>>>>
>>>> I wanted to ensure that access to `idev->addr_list` is performed under lock,
>>>> the same way it is done immediately afterwards;
>>>> No particular reason not to extend the existing lock, I just didn't think
>>>> about it.
>>>>
>>>> For what it's worth, the original code didn't have this protection either,
>>>> since the another thread could have grabbed the lock between
>>>> `spin_unlock_bh(&net->ipv6.addrconf_hash_lock);` of the last loop iteration,
>>>> and the `write_lock`.
>>>>
>>>> Should I extend the write_lock upwards, or just leave it off?
>>>
>>> Well, you are doing write manipulation with the list, which is protected
>>> by read-write lock. I would expect this lock to be held in write mode.
>>> And you have to protect hash map at the same time. So yes, write_lock
>>> and spin_lock altogether, I believe.
>>>
>>
>> Note that within the changed lines, the list itself is only iterated-on,
>> not manipulated.
>> The changes are to the `addr_lst` list, which is the hashtable, not the
>> list this lock protects.
>>
>> I'll send v3 with the write-lock extended.
>> Thank you!
> 
> Reading it one more time, I'm not quite sure that locking hashmap
> spinlock under idev->lock in write mode is a good idea... We have to
> think more about it, maybe ask for another opinion. Looks like RTNL
> should protect idev->addr_list from modification while idev->lock is
> more about changes to idev, not only about addr_list.
> 
> @Eric could you please shed some light on the locking schema here?

AFAICS idev->addr_list is (write) protected by write_lock(idev->lock),
while net->ipv6.inet6_addr_lst is protected by
spin_lock_bh(&net->ipv6.addrconf_hash_lock).

Extending the write_lock() scope will create a lock dependency between
the hashtable lock and the list lock, which in turn could cause more
problem in the future.

Note that idev->addr_list locking looks a bit fuzzy, as is traversed in
several places under the RCU lock only. I suggest finish the conversion
of idev->addr_list to RCU and do this additional traversal under RCU, too.

Cheers,

Paolo
Vadim Fedorenko Nov. 12, 2024, 4:08 p.m. UTC | #7
On 12/11/2024 14:41, Paolo Abeni wrote:
> On 11/11/24 13:07, Vadim Fedorenko wrote:
>> On 11/11/2024 05:21, Gilad Naaman wrote:
>>>> On 10/11/2024 06:53, Gilad Naaman wrote:
>>>>>>> -           spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
>>>>>>> +   list_for_each_entry(ifa, &idev->addr_list, if_list) {
>>>>>>> +           addrconf_del_dad_work(ifa);
>>>>>>> +
>>>>>>> +           /* combined flag + permanent flag decide if
>>>>>>> +            * address is retained on a down event
>>>>>>> +            */
>>>>>>> +           if (!keep_addr ||
>>>>>>> +               !(ifa->flags & IFA_F_PERMANENT) ||
>>>>>>> +               addr_is_local(&ifa->addr))
>>>>>>> +                   hlist_del_init_rcu(&ifa->addr_lst);
>>>>>>>       }
>>>>>>>
>>>>>>> +   spin_unlock(&net->ipv6.addrconf_hash_lock);
>>>>>>> +   read_unlock_bh(&idev->lock);
>>>>>>
>>>>>> Why is this read lock needed here? spinlock addrconf_hash_lock will
>>>>>> block any RCU grace period to happen, so we can safely traverse
>>>>>> idev->addr_list with list_for_each_entry_rcu()...
>>>>>
>>>>> Oh, sorry, I didn't realize the hash lock encompasses this one;
>>>>> although it seems obvious in retrospect.
>>>>>
>>>>>>> +
>>>>>>>       write_lock_bh(&idev->lock);
>>>>>>
>>>>>> if we are trying to protect idev->addr_list against addition, then we
>>>>>> have to extend write_lock scope. Otherwise it may happen that another
>>>>>> thread will grab write lock between read_unlock and write_lock.
>>>>>>
>>>>>> Am I missing something?
>>>>>
>>>>> I wanted to ensure that access to `idev->addr_list` is performed under lock,
>>>>> the same way it is done immediately afterwards;
>>>>> No particular reason not to extend the existing lock, I just didn't think
>>>>> about it.
>>>>>
>>>>> For what it's worth, the original code didn't have this protection either,
>>>>> since the another thread could have grabbed the lock between
>>>>> `spin_unlock_bh(&net->ipv6.addrconf_hash_lock);` of the last loop iteration,
>>>>> and the `write_lock`.
>>>>>
>>>>> Should I extend the write_lock upwards, or just leave it off?
>>>>
>>>> Well, you are doing write manipulation with the list, which is protected
>>>> by read-write lock. I would expect this lock to be held in write mode.
>>>> And you have to protect hash map at the same time. So yes, write_lock
>>>> and spin_lock altogether, I believe.
>>>>
>>>
>>> Note that within the changed lines, the list itself is only iterated-on,
>>> not manipulated.
>>> The changes are to the `addr_lst` list, which is the hashtable, not the
>>> list this lock protects.
>>>
>>> I'll send v3 with the write-lock extended.
>>> Thank you!
>>
>> Reading it one more time, I'm not quite sure that locking hashmap
>> spinlock under idev->lock in write mode is a good idea... We have to
>> think more about it, maybe ask for another opinion. Looks like RTNL
>> should protect idev->addr_list from modification while idev->lock is
>> more about changes to idev, not only about addr_list.
>>
>> @Eric could you please shed some light on the locking schema here?
> 
> AFAICS idev->addr_list is (write) protected by write_lock(idev->lock),
> while net->ipv6.inet6_addr_lst is protected by
> spin_lock_bh(&net->ipv6.addrconf_hash_lock).
> 
> Extending the write_lock() scope will create a lock dependency between
> the hashtable lock and the list lock, which in turn could cause more
> problem in the future.
> 
> Note that idev->addr_list locking looks a bit fuzzy, as is traversed in
> several places under the RCU lock only.

Yeah, I was confused exactly because of some places using RCU while
others still using read_lock.

> I suggest finish the conversion
> of idev->addr_list to RCU and do this additional traversal under RCU, too.

That sounds reasonable,

Thanks!
Gilad Naaman Nov. 13, 2024, 6:21 a.m. UTC | #8
>On 11/11/24 13:07, Vadim Fedorenko wrote:
>> On 11/11/2024 05:21, Gilad Naaman wrote:
>>>> On 10/11/2024 06:53, Gilad Naaman wrote:
>>>>>>> -           spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
>>>>>>> +   list_for_each_entry(ifa, &idev->addr_list, if_list) {
>>>>>>> +           addrconf_del_dad_work(ifa);
>>>>>>> +
>>>>>>> +           /* combined flag + permanent flag decide if
>>>>>>> +            * address is retained on a down event
>>>>>>> +            */
>>>>>>> +           if (!keep_addr ||
>>>>>>> +               !(ifa->flags & IFA_F_PERMANENT) ||
>>>>>>> +               addr_is_local(&ifa->addr))
>>>>>>> +                   hlist_del_init_rcu(&ifa->addr_lst);
>>>>>>>      }
>>>>>>>
>>>>>>> +   spin_unlock(&net->ipv6.addrconf_hash_lock);
>>>>>>> +   read_unlock_bh(&idev->lock);
>>>>>>
>>>>>> Why is this read lock needed here? spinlock addrconf_hash_lock will
>>>>>> block any RCU grace period to happen, so we can safely traverse
>>>>>> idev->addr_list with list_for_each_entry_rcu()...
>>>>>
>>>>> Oh, sorry, I didn't realize the hash lock encompasses this one;
>>>>> although it seems obvious in retrospect.
>>>>>
>>>>>>> +
>>>>>>>      write_lock_bh(&idev->lock);
>>>>>>
>>>>>> if we are trying to protect idev->addr_list against addition, then we
>>>>>> have to extend write_lock scope. Otherwise it may happen that another
>>>>>> thread will grab write lock between read_unlock and write_lock.
>>>>>>
>>>>>> Am I missing something?
>>>>>
>>>>> I wanted to ensure that access to `idev->addr_list` is performed under lock,
>>>>> the same way it is done immediately afterwards;
>>>>> No particular reason not to extend the existing lock, I just didn't think
>>>>> about it.
>>>>>
>>>>> For what it's worth, the original code didn't have this protection either,
>>>>> since the another thread could have grabbed the lock between
>>>>> `spin_unlock_bh(&net->ipv6.addrconf_hash_lock);` of the last loop iteration,
>>>>> and the `write_lock`.
>>>>>
>>>>> Should I extend the write_lock upwards, or just leave it off?
>>>>
>>>> Well, you are doing write manipulation with the list, which is protected
>>>> by read-write lock. I would expect this lock to be held in write mode.
>>>> And you have to protect hash map at the same time. So yes, write_lock
>>>> and spin_lock altogether, I believe.
>>>>
>>>
>>> Note that within the changed lines, the list itself is only iterated-on,
>>> not manipulated.
>>> The changes are to the `addr_lst` list, which is the hashtable, not the
>>> list this lock protects.
>>>
>>> I'll send v3 with the write-lock extended.
>>> Thank you!
>> 
>> Reading it one more time, I'm not quite sure that locking hashmap
>> spinlock under idev->lock in write mode is a good idea... We have to
>> think more about it, maybe ask for another opinion. Looks like RTNL
>> should protect idev->addr_list from modification while idev->lock is
>> more about changes to idev, not only about addr_list.
>> 
>> @Eric could you please shed some light on the locking schema here?
>
>AFAICS idev->addr_list is (write) protected by write_lock(idev->lock),
>while net->ipv6.inet6_addr_lst is protected by
>spin_lock_bh(&net->ipv6.addrconf_hash_lock).
>
>Extending the write_lock() scope will create a lock dependency between
>the hashtable lock and the list lock, which in turn could cause more
>problem in the future.
>
>Note that idev->addr_list locking looks a bit fuzzy, as is traversed in
>several places under the RCU lock only. I suggest finish the conversion
>of idev->addr_list to RCU and do this additional traversal under RCU, too.

Sure, no problem.

I've looked over the usage of ->addr_list in this file and there are about four
places where I'm certain I can replace idev->lock with RCU:

 - dev_forward_change
 - inet6_addr_del
 - addrconf_dad_run
 - addrconf_disable_policy_idev

As for the rest, if it's okay to run it by you before submitting a patch:

 - ipv6_link_dev_addr:
   Modifies list directly under write-lock.

 -  __ipv6_get_lladdr & ipv6_inherit_eui64 & ipv6_lonely_lladdr: Traverse in
    reverse. According my (admittedly limited) understanding, this is not
    possible in RCU.

 - addrconf_permanent_addr: Not sure if this can be RCU'd, as there's no
   variant that is both _rcu and _safe.
   If it was safe to keep iterating with just `_rcu`, I'm not sure why
   `_safe` was needed in the first place.

 - addrconf_ifdown & inet6_set_iftoken:
   Seems like write-lock is taken anyway and regardless of the iteration,
   so I'm not sure it would benefit from introducing RCU.

 - check_cleanup_prefix_route:
   I'm conflicted about this one.
   When called from ipv6_del_addr(), the write lock is taken anyway.
   When called from inet6_addr_modify(), the write-lock is taken;
   where a read-lock could have done the job.

   Should this be RCU'd as well?

>Cheers,
>
>Paolo

Cheers
diff mbox series

Patch

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d0a99710d65d..c6fbd634912a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3846,12 +3846,12 @@  static int addrconf_ifdown(struct net_device *dev, bool unregister)
 {
 	unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
 	struct net *net = dev_net(dev);
-	struct inet6_dev *idev;
 	struct inet6_ifaddr *ifa;
 	LIST_HEAD(tmp_addr_list);
+	struct inet6_dev *idev;
 	bool keep_addr = false;
 	bool was_ready;
-	int state, i;
+	int state;
 
 	ASSERT_RTNL();
 
@@ -3890,28 +3890,24 @@  static int addrconf_ifdown(struct net_device *dev, bool unregister)
 	}
 
 	/* Step 2: clear hash table */
-	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
-		struct hlist_head *h = &net->ipv6.inet6_addr_lst[i];
+	read_lock_bh(&idev->lock);
+	spin_lock(&net->ipv6.addrconf_hash_lock);
 
-		spin_lock_bh(&net->ipv6.addrconf_hash_lock);
-restart:
-		hlist_for_each_entry_rcu(ifa, h, addr_lst) {
-			if (ifa->idev == idev) {
-				addrconf_del_dad_work(ifa);
-				/* combined flag + permanent flag decide if
-				 * address is retained on a down event
-				 */
-				if (!keep_addr ||
-				    !(ifa->flags & IFA_F_PERMANENT) ||
-				    addr_is_local(&ifa->addr)) {
-					hlist_del_init_rcu(&ifa->addr_lst);
-					goto restart;
-				}
-			}
-		}
-		spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
+	list_for_each_entry(ifa, &idev->addr_list, if_list) {
+		addrconf_del_dad_work(ifa);
+
+		/* combined flag + permanent flag decide if
+		 * address is retained on a down event
+		 */
+		if (!keep_addr ||
+		    !(ifa->flags & IFA_F_PERMANENT) ||
+		    addr_is_local(&ifa->addr))
+			hlist_del_init_rcu(&ifa->addr_lst);
 	}
 
+	spin_unlock(&net->ipv6.addrconf_hash_lock);
+	read_unlock_bh(&idev->lock);
+
 	write_lock_bh(&idev->lock);
 
 	addrconf_del_rs_timer(idev);