diff mbox series

[v1,net-next,5/9] phonet: Don't hold RTNL for getaddr_dumpit().

Message ID 20241017183140.43028-6-kuniyu@amazon.com (mailing list archive)
State Accepted
Commit b7d2fc9ad7fe75b536f94409b7f1e90e12e4f44d
Delegated to: Netdev Maintainers
Headers show
Series phonet: Convert all doit() and dumpit() to RCU. | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5 this patch: 5
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 5 of 5 maintainers
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 59 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-10-19--00-00 (tests: 777)

Commit Message

Kuniyuki Iwashima Oct. 17, 2024, 6:31 p.m. UTC
getaddr_dumpit() already relies on RCU and does not need RTNL.

Let's use READ_ONCE() for ifindex and register getaddr_dumpit()
with RTNL_FLAG_DUMP_UNLOCKED.

While at it, the retval of getaddr_dumpit() is changed to combine
NLMSG_DONE and save recvmsg() as done in 58a4ff5d77b1 ("phonet: no
longer hold RTNL in route_dumpit()").

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
 net/phonet/pn_netlink.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

Comments

Rémi Denis-Courmont Oct. 17, 2024, 6:49 p.m. UTC | #1
Le torstaina 17. lokakuuta 2024, 21.31.36 EEST Kuniyuki Iwashima a écrit :
> getaddr_dumpit() already relies on RCU and does not need RTNL.
> 
> Let's use READ_ONCE() for ifindex and register getaddr_dumpit()
> with RTNL_FLAG_DUMP_UNLOCKED.
> 
> While at it, the retval of getaddr_dumpit() is changed to combine
> NLMSG_DONE and save recvmsg() as done in 58a4ff5d77b1 ("phonet: no
> longer hold RTNL in route_dumpit()").
> 
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> ---
>  net/phonet/pn_netlink.c | 24 +++++++++++++++---------
>  1 file changed, 15 insertions(+), 9 deletions(-)
> 
> diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
> index 5996141e258f..14928fa04675 100644
> --- a/net/phonet/pn_netlink.c
> +++ b/net/phonet/pn_netlink.c
> @@ -127,14 +127,17 @@ static int fill_addr(struct sk_buff *skb, u32 ifindex,
> u8 addr,
> 
>  static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
> {
> +	int addr_idx = 0, addr_start_idx = cb->args[1];
> +	int dev_idx = 0, dev_start_idx = cb->args[0];
>  	struct phonet_device_list *pndevs;
>  	struct phonet_device *pnd;
> -	int dev_idx = 0, dev_start_idx = cb->args[0];
> -	int addr_idx = 0, addr_start_idx = cb->args[1];
> +	int err = 0;
> 
>  	pndevs = phonet_device_list(sock_net(skb->sk));
> +
>  	rcu_read_lock();
>  	list_for_each_entry_rcu(pnd, &pndevs->list, list) {
> +		DECLARE_BITMAP(addrs, 64);
>  		u8 addr;
> 
>  		if (dev_idx > dev_start_idx)
> @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb, struct
> netlink_callback *cb) continue;
> 
>  		addr_idx = 0;
> -		for_each_set_bit(addr, pnd->addrs, 64) {
> +		memcpy(addrs, pnd->addrs, sizeof(pnd->addrs));

Is that really safe? Are we sure that the bit-field writers are atomic w.r.t. 
memcpy() on all platforms? If READ_ONCE is needed for an integer, using 
memcpy() seems sketchy, TBH.

> +
> +		for_each_set_bit(addr, addrs, 64) {
>  			if (addr_idx++ < addr_start_idx)
>  				continue;
> 
> -			if (fill_addr(skb, pnd->netdev->ifindex, addr 
<< 2,
> -					 NETLINK_CB(cb-
>skb).portid,
> -					cb->nlh->nlmsg_seq, 
RTM_NEWADDR) < 0)
> +			err = fill_addr(skb, READ_ONCE(pnd->netdev-
>ifindex),
> +					addr << 2, 
NETLINK_CB(cb->skb).portid,
> +					cb->nlh->nlmsg_seq, 
RTM_NEWADDR);
> +			if (err < 0)
>  				goto out;
>  		}
>  	}
> -
>  out:
>  	rcu_read_unlock();
> +
>  	cb->args[0] = dev_idx;
>  	cb->args[1] = addr_idx;
> 
> -	return skb->len;
> +	return err;
>  }
> 
>  /* Routes handling */
> @@ -298,7 +304,7 @@ static const struct rtnl_msg_handler
> phonet_rtnl_msg_handlers[] __initdata_or_mo {.owner = THIS_MODULE,
> .protocol = PF_PHONET, .msgtype = RTM_DELADDR, .doit = addr_doit, .flags =
> RTNL_FLAG_DOIT_UNLOCKED},
>  	{.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = 
RTM_GETADDR,
> -	 .dumpit = getaddr_dumpit},
> +	 .dumpit = getaddr_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED},
>  	{.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = 
RTM_NEWROUTE,
>  	 .doit = route_doit},
>  	{.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = 
RTM_DELROUTE,
Kuniyuki Iwashima Oct. 18, 2024, 5:16 p.m. UTC | #2
From: "Rémi Denis-Courmont" <remi@remlab.net>
Date: Thu, 17 Oct 2024 21:49:18 +0300
> > diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
> > index 5996141e258f..14928fa04675 100644
> > --- a/net/phonet/pn_netlink.c
> > +++ b/net/phonet/pn_netlink.c
> > @@ -127,14 +127,17 @@ static int fill_addr(struct sk_buff *skb, u32 ifindex,
> > u8 addr,
> > 
> >  static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
> > {
> > +	int addr_idx = 0, addr_start_idx = cb->args[1];
> > +	int dev_idx = 0, dev_start_idx = cb->args[0];
> >  	struct phonet_device_list *pndevs;
> >  	struct phonet_device *pnd;
> > -	int dev_idx = 0, dev_start_idx = cb->args[0];
> > -	int addr_idx = 0, addr_start_idx = cb->args[1];
> > +	int err = 0;
> > 
> >  	pndevs = phonet_device_list(sock_net(skb->sk));
> > +
> >  	rcu_read_lock();
> >  	list_for_each_entry_rcu(pnd, &pndevs->list, list) {
> > +		DECLARE_BITMAP(addrs, 64);
> >  		u8 addr;
> > 
> >  		if (dev_idx > dev_start_idx)
> > @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb, struct
> > netlink_callback *cb) continue;
> > 
> >  		addr_idx = 0;
> > -		for_each_set_bit(addr, pnd->addrs, 64) {
> > +		memcpy(addrs, pnd->addrs, sizeof(pnd->addrs));
> 
> Is that really safe? Are we sure that the bit-field writers are atomic w.r.t. 
> memcpy() on all platforms? If READ_ONCE is needed for an integer, using 
> memcpy() seems sketchy, TBH.

I think bit-field read/write need not be atomic here because even
if a data-race happens, for_each_set_bit() iterates each bit, which
is the real data, regardless of whether data-race happened or not.
Rémi Denis-Courmont Oct. 19, 2024, 7:48 a.m. UTC | #3
Le perjantaina 18. lokakuuta 2024, 20.16.29 EEST Kuniyuki Iwashima a écrit :
> From: "Rémi Denis-Courmont" <remi@remlab.net>
> Date: Thu, 17 Oct 2024 21:49:18 +0300
> 
> > > diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
> > > index 5996141e258f..14928fa04675 100644
> > > --- a/net/phonet/pn_netlink.c
> > > +++ b/net/phonet/pn_netlink.c
> > > @@ -127,14 +127,17 @@ static int fill_addr(struct sk_buff *skb, u32
> > > ifindex, u8 addr,
> > > 
> > >  static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback
> > >  *cb)
> > > 
> > > {
> > > +	int addr_idx = 0, addr_start_idx = cb->args[1];
> > > +	int dev_idx = 0, dev_start_idx = cb->args[0];
> > > 
> > >  	struct phonet_device_list *pndevs;
> > >  	struct phonet_device *pnd;
> > > 
> > > -	int dev_idx = 0, dev_start_idx = cb->args[0];
> > > -	int addr_idx = 0, addr_start_idx = cb->args[1];
> > > +	int err = 0;
> > > 
> > >  	pndevs = phonet_device_list(sock_net(skb->sk));
> > > 
> > > +
> > > 
> > >  	rcu_read_lock();
> > >  	list_for_each_entry_rcu(pnd, &pndevs->list, list) {
> > > 
> > > +		DECLARE_BITMAP(addrs, 64);
> > > 
> > >  		u8 addr;
> > >  		
> > >  		if (dev_idx > dev_start_idx)
> > > 
> > > @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb,
> > > struct
> > > netlink_callback *cb) continue;
> > > 
> > >  		addr_idx = 0;
> > > 
> > > -		for_each_set_bit(addr, pnd->addrs, 64) {
> > > +		memcpy(addrs, pnd->addrs, sizeof(pnd->addrs));
> > 
> > Is that really safe? Are we sure that the bit-field writers are atomic
> > w.r.t. memcpy() on all platforms? If READ_ONCE is needed for an integer,
> > using memcpy() seems sketchy, TBH.
> 
> I think bit-field read/write need not be atomic here because even
> if a data-race happens, for_each_set_bit() iterates each bit, which
> is the real data, regardless of whether data-race happened or not.

Err, it looks to me that a corrupt bit would lead to the index getting corrupt 
and addresses getting skipped or repeated. AFAICT, the RTNL lock is still 
needed here.
Kuniyuki Iwashima Oct. 20, 2024, 1:30 a.m. UTC | #4
From: "Rémi Denis-Courmont" <remi@remlab.net>
Date: Sat, 19 Oct 2024 10:48:57 +0300
> > > >  	list_for_each_entry_rcu(pnd, &pndevs->list, list) {
> > > > 
> > > > +		DECLARE_BITMAP(addrs, 64);
> > > > 
> > > >  		u8 addr;
> > > >  		
> > > >  		if (dev_idx > dev_start_idx)
> > > > 
> > > > @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb,
> > > > struct
> > > > netlink_callback *cb) continue;
> > > > 
> > > >  		addr_idx = 0;
> > > > 
> > > > -		for_each_set_bit(addr, pnd->addrs, 64) {
> > > > +		memcpy(addrs, pnd->addrs, sizeof(pnd->addrs));
> > > 
> > > Is that really safe? Are we sure that the bit-field writers are atomic
> > > w.r.t. memcpy() on all platforms? If READ_ONCE is needed for an integer,
> > > using memcpy() seems sketchy, TBH.
> > 
> > I think bit-field read/write need not be atomic here because even
> > if a data-race happens, for_each_set_bit() iterates each bit, which
> > is the real data, regardless of whether data-race happened or not.
> 
> Err, it looks to me that a corrupt bit would lead to the index getting corrupt 
> and addresses getting skipped or repeated. AFAICT, the RTNL lock is still 
> needed here.

Let's say pnd->addrs has 0b00010001 as the lowest in 8 bits
and addr_dumpit() and addr_doit() are executed concurrently,
and addr_doit() tries to change it to 0b00010011.

If we have a lock, there could be two results of dumpit().

  1.
  lock()
  dumpit() -> 0b00010001
  unlock()
                   lock()
                   doit()
                   unlock()

  2.
                   lock()
                   doit()
                   unlock()
  lock()
  dumpit() -> 0b00010011
  unlock()

If we don't have a lock and dumpit()'s read and doit()'s write
are split to upper 4 bits (0b0001) and lower 4 bits (0b0011),

there are 6 patterns of occurences, but the results are only
2 patterns and the same with the locked version.

If you get 0b00010001, you can think dumpit() completes earlier,
and the opposite if you get 0b00010011.

This is how we think with lockless algo.  In this case, we do
not require lock to iterate bitmaps.

  1.
  upper half read of dumpit()
  lower half read of dumpit()
                   upper half write of doit() -> 0b0001
                   lower half write of doit() -> 0b0011
  -> 0b00010001

  2.
  upper half read of dumpit()
                   upper half write of doit() -> 0b0001
  lower half read of dumpit()
                   lower half write of doit() -> 0b0011
  -> 0b00010001

  3.
                   upper half write of doit() -> 0b0001
  upper half read of dumpit()
  lower half read of dumpit()
                   lower half write of doit() -> 0b0011
  -> 0b00010001

  4.
                   upper half write of doit() -> 0b0001
                   lower half write of doit() -> 0b0011
  upper half read of dumpit()
  lower half read of dumpit()
  -> 0b00010011

  5.
                   upper half write of doit() -> 0b0001
  upper half read of dumpit()
                   lower half write of doit() -> 0b0011
  lower half read of dumpit()
  -> 0b00010011

  6.
  upper half read of dumpit()
                   upper half write of doit() -> 0b0001
                   lower half write of doit() -> 0b0011
  lower half read of dumpit()
  -> 0b00010011
Paolo Abeni Oct. 23, 2024, 11:04 a.m. UTC | #5
On 10/19/24 09:48, Rémi Denis-Courmont wrote:
> Le perjantaina 18. lokakuuta 2024, 20.16.29 EEST Kuniyuki Iwashima a écrit :
>> From: "Rémi Denis-Courmont" <remi@remlab.net>
>> Date: Thu, 17 Oct 2024 21:49:18 +0300
>>
>>>> diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
>>>> index 5996141e258f..14928fa04675 100644
>>>> --- a/net/phonet/pn_netlink.c
>>>> +++ b/net/phonet/pn_netlink.c
>>>> @@ -127,14 +127,17 @@ static int fill_addr(struct sk_buff *skb, u32
>>>> ifindex, u8 addr,
>>>>
>>>>  static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback
>>>>  *cb)
>>>>
>>>> {
>>>> +	int addr_idx = 0, addr_start_idx = cb->args[1];
>>>> +	int dev_idx = 0, dev_start_idx = cb->args[0];
>>>>
>>>>  	struct phonet_device_list *pndevs;
>>>>  	struct phonet_device *pnd;
>>>>
>>>> -	int dev_idx = 0, dev_start_idx = cb->args[0];
>>>> -	int addr_idx = 0, addr_start_idx = cb->args[1];
>>>> +	int err = 0;
>>>>
>>>>  	pndevs = phonet_device_list(sock_net(skb->sk));
>>>>
>>>> +
>>>>
>>>>  	rcu_read_lock();
>>>>  	list_for_each_entry_rcu(pnd, &pndevs->list, list) {
>>>>
>>>> +		DECLARE_BITMAP(addrs, 64);
>>>>
>>>>  		u8 addr;
>>>>  		
>>>>  		if (dev_idx > dev_start_idx)
>>>>
>>>> @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb,
>>>> struct
>>>> netlink_callback *cb) continue;
>>>>
>>>>  		addr_idx = 0;
>>>>
>>>> -		for_each_set_bit(addr, pnd->addrs, 64) {
>>>> +		memcpy(addrs, pnd->addrs, sizeof(pnd->addrs));
>>>
>>> Is that really safe? Are we sure that the bit-field writers are atomic
>>> w.r.t. memcpy() on all platforms? If READ_ONCE is needed for an integer,
>>> using memcpy() seems sketchy, TBH.
>>
>> I think bit-field read/write need not be atomic here because even
>> if a data-race happens, for_each_set_bit() iterates each bit, which
>> is the real data, regardless of whether data-race happened or not.
> 
> Err, it looks to me that a corrupt bit would lead to the index getting corrupt 
> and addresses getting skipped or repeated. AFAICT, the RTNL lock is still 
> needed here.

To wrap-up Kuniyuki's reply: addresses can't be repeated in dump. They
can be 'skipped' meaning the dump can race with writer reading an 'old'
address bitmask, still not containing the 'new' address. Exactly as
could happen with racing dump/writer both protected by the lock.

The bottom line is that this code looks safe to me.

Thanks,

Paolo
Paolo Abeni Oct. 23, 2024, 11:16 a.m. UTC | #6
On 10/23/24 13:04, Paolo Abeni wrote:
> On 10/19/24 09:48, Rémi Denis-Courmont wrote:
>>> I think bit-field read/write need not be atomic here because even
>>> if a data-race happens, for_each_set_bit() iterates each bit, which
>>> is the real data, regardless of whether data-race happened or not.
>>
>> Err, it looks to me that a corrupt bit would lead to the index getting corrupt 
>> and addresses getting skipped or repeated. AFAICT, the RTNL lock is still 
>> needed here.
> 
> To wrap-up Kuniyuki's reply: addresses can't be repeated in dump. They
> can be 'skipped' meaning the dump can race with writer reading an 'old'
> address bitmask, still not containing the 'new' address. Exactly as
> could happen with racing dump/writer both protected by the lock.
> 
> The bottom line is that this code looks safe to me.

I'm sorry, I forgot to ask the obvious question: @Rémi, are you ok with
this explanation and patch as-is?

Thanks!

Paolo
Eric Dumazet Oct. 23, 2024, 3:30 p.m. UTC | #7
On Thu, Oct 17, 2024 at 8:33 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
>
> getaddr_dumpit() already relies on RCU and does not need RTNL.
>
> Let's use READ_ONCE() for ifindex and register getaddr_dumpit()
> with RTNL_FLAG_DUMP_UNLOCKED.
>
> While at it, the retval of getaddr_dumpit() is changed to combine
> NLMSG_DONE and save recvmsg() as done in 58a4ff5d77b1 ("phonet: no
> longer hold RTNL in route_dumpit()").
>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> ---

Reviewed-by: Eric Dumazet <edumazet@google.com>
diff mbox series

Patch

diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index 5996141e258f..14928fa04675 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -127,14 +127,17 @@  static int fill_addr(struct sk_buff *skb, u32 ifindex, u8 addr,
 
 static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	int addr_idx = 0, addr_start_idx = cb->args[1];
+	int dev_idx = 0, dev_start_idx = cb->args[0];
 	struct phonet_device_list *pndevs;
 	struct phonet_device *pnd;
-	int dev_idx = 0, dev_start_idx = cb->args[0];
-	int addr_idx = 0, addr_start_idx = cb->args[1];
+	int err = 0;
 
 	pndevs = phonet_device_list(sock_net(skb->sk));
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(pnd, &pndevs->list, list) {
+		DECLARE_BITMAP(addrs, 64);
 		u8 addr;
 
 		if (dev_idx > dev_start_idx)
@@ -143,23 +146,26 @@  static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 			continue;
 
 		addr_idx = 0;
-		for_each_set_bit(addr, pnd->addrs, 64) {
+		memcpy(addrs, pnd->addrs, sizeof(pnd->addrs));
+
+		for_each_set_bit(addr, addrs, 64) {
 			if (addr_idx++ < addr_start_idx)
 				continue;
 
-			if (fill_addr(skb, pnd->netdev->ifindex, addr << 2,
-					 NETLINK_CB(cb->skb).portid,
-					cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0)
+			err = fill_addr(skb, READ_ONCE(pnd->netdev->ifindex),
+					addr << 2, NETLINK_CB(cb->skb).portid,
+					cb->nlh->nlmsg_seq, RTM_NEWADDR);
+			if (err < 0)
 				goto out;
 		}
 	}
-
 out:
 	rcu_read_unlock();
+
 	cb->args[0] = dev_idx;
 	cb->args[1] = addr_idx;
 
-	return skb->len;
+	return err;
 }
 
 /* Routes handling */
@@ -298,7 +304,7 @@  static const struct rtnl_msg_handler phonet_rtnl_msg_handlers[] __initdata_or_mo
 	{.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELADDR,
 	 .doit = addr_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED},
 	{.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETADDR,
-	 .dumpit = getaddr_dumpit},
+	 .dumpit = getaddr_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED},
 	{.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_NEWROUTE,
 	 .doit = route_doit},
 	{.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELROUTE,