Message ID | 20241017183140.43028-6-kuniyu@amazon.com (mailing list archive) |
---|---|
State | Accepted |
Commit | b7d2fc9ad7fe75b536f94409b7f1e90e12e4f44d |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | phonet: Convert all doit() and dumpit() to RCU. | expand |
Le torstaina 17. lokakuuta 2024, 21.31.36 EEST Kuniyuki Iwashima a écrit : > getaddr_dumpit() already relies on RCU and does not need RTNL. > > Let's use READ_ONCE() for ifindex and register getaddr_dumpit() > with RTNL_FLAG_DUMP_UNLOCKED. > > While at it, the retval of getaddr_dumpit() is changed to combine > NLMSG_DONE and save recvmsg() as done in 58a4ff5d77b1 ("phonet: no > longer hold RTNL in route_dumpit()"). > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> > --- > net/phonet/pn_netlink.c | 24 +++++++++++++++--------- > 1 file changed, 15 insertions(+), 9 deletions(-) > > diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c > index 5996141e258f..14928fa04675 100644 > --- a/net/phonet/pn_netlink.c > +++ b/net/phonet/pn_netlink.c > @@ -127,14 +127,17 @@ static int fill_addr(struct sk_buff *skb, u32 ifindex, > u8 addr, > > static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) > { > + int addr_idx = 0, addr_start_idx = cb->args[1]; > + int dev_idx = 0, dev_start_idx = cb->args[0]; > struct phonet_device_list *pndevs; > struct phonet_device *pnd; > - int dev_idx = 0, dev_start_idx = cb->args[0]; > - int addr_idx = 0, addr_start_idx = cb->args[1]; > + int err = 0; > > pndevs = phonet_device_list(sock_net(skb->sk)); > + > rcu_read_lock(); > list_for_each_entry_rcu(pnd, &pndevs->list, list) { > + DECLARE_BITMAP(addrs, 64); > u8 addr; > > if (dev_idx > dev_start_idx) > @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb, struct > netlink_callback *cb) continue; > > addr_idx = 0; > - for_each_set_bit(addr, pnd->addrs, 64) { > + memcpy(addrs, pnd->addrs, sizeof(pnd->addrs)); Is that really safe? Are we sure that the bit-field writers are atomic w.r.t. memcpy() on all platforms? If READ_ONCE is needed for an integer, using memcpy() seems sketchy, TBH. > + > + for_each_set_bit(addr, addrs, 64) { > if (addr_idx++ < addr_start_idx) > continue; > > - if (fill_addr(skb, pnd->netdev->ifindex, addr << 2, > - NETLINK_CB(cb- >skb).portid, > - cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0) > + err = fill_addr(skb, READ_ONCE(pnd->netdev- >ifindex), > + addr << 2, NETLINK_CB(cb->skb).portid, > + cb->nlh->nlmsg_seq, RTM_NEWADDR); > + if (err < 0) > goto out; > } > } > - > out: > rcu_read_unlock(); > + > cb->args[0] = dev_idx; > cb->args[1] = addr_idx; > > - return skb->len; > + return err; > } > > /* Routes handling */ > @@ -298,7 +304,7 @@ static const struct rtnl_msg_handler > phonet_rtnl_msg_handlers[] __initdata_or_mo {.owner = THIS_MODULE, > .protocol = PF_PHONET, .msgtype = RTM_DELADDR, .doit = addr_doit, .flags = > RTNL_FLAG_DOIT_UNLOCKED}, > {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETADDR, > - .dumpit = getaddr_dumpit}, > + .dumpit = getaddr_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED}, > {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_NEWROUTE, > .doit = route_doit}, > {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELROUTE,
From: "Rémi Denis-Courmont" <remi@remlab.net> Date: Thu, 17 Oct 2024 21:49:18 +0300 > > diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c > > index 5996141e258f..14928fa04675 100644 > > --- a/net/phonet/pn_netlink.c > > +++ b/net/phonet/pn_netlink.c > > @@ -127,14 +127,17 @@ static int fill_addr(struct sk_buff *skb, u32 ifindex, > > u8 addr, > > > > static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) > > { > > + int addr_idx = 0, addr_start_idx = cb->args[1]; > > + int dev_idx = 0, dev_start_idx = cb->args[0]; > > struct phonet_device_list *pndevs; > > struct phonet_device *pnd; > > - int dev_idx = 0, dev_start_idx = cb->args[0]; > > - int addr_idx = 0, addr_start_idx = cb->args[1]; > > + int err = 0; > > > > pndevs = phonet_device_list(sock_net(skb->sk)); > > + > > rcu_read_lock(); > > list_for_each_entry_rcu(pnd, &pndevs->list, list) { > > + DECLARE_BITMAP(addrs, 64); > > u8 addr; > > > > if (dev_idx > dev_start_idx) > > @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb, struct > > netlink_callback *cb) continue; > > > > addr_idx = 0; > > - for_each_set_bit(addr, pnd->addrs, 64) { > > + memcpy(addrs, pnd->addrs, sizeof(pnd->addrs)); > > Is that really safe? Are we sure that the bit-field writers are atomic w.r.t. > memcpy() on all platforms? If READ_ONCE is needed for an integer, using > memcpy() seems sketchy, TBH. I think bit-field read/write need not be atomic here because even if a data-race happens, for_each_set_bit() iterates each bit, which is the real data, regardless of whether data-race happened or not.
Le perjantaina 18. lokakuuta 2024, 20.16.29 EEST Kuniyuki Iwashima a écrit : > From: "Rémi Denis-Courmont" <remi@remlab.net> > Date: Thu, 17 Oct 2024 21:49:18 +0300 > > > > diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c > > > index 5996141e258f..14928fa04675 100644 > > > --- a/net/phonet/pn_netlink.c > > > +++ b/net/phonet/pn_netlink.c > > > @@ -127,14 +127,17 @@ static int fill_addr(struct sk_buff *skb, u32 > > > ifindex, u8 addr, > > > > > > static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback > > > *cb) > > > > > > { > > > + int addr_idx = 0, addr_start_idx = cb->args[1]; > > > + int dev_idx = 0, dev_start_idx = cb->args[0]; > > > > > > struct phonet_device_list *pndevs; > > > struct phonet_device *pnd; > > > > > > - int dev_idx = 0, dev_start_idx = cb->args[0]; > > > - int addr_idx = 0, addr_start_idx = cb->args[1]; > > > + int err = 0; > > > > > > pndevs = phonet_device_list(sock_net(skb->sk)); > > > > > > + > > > > > > rcu_read_lock(); > > > list_for_each_entry_rcu(pnd, &pndevs->list, list) { > > > > > > + DECLARE_BITMAP(addrs, 64); > > > > > > u8 addr; > > > > > > if (dev_idx > dev_start_idx) > > > > > > @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb, > > > struct > > > netlink_callback *cb) continue; > > > > > > addr_idx = 0; > > > > > > - for_each_set_bit(addr, pnd->addrs, 64) { > > > + memcpy(addrs, pnd->addrs, sizeof(pnd->addrs)); > > > > Is that really safe? Are we sure that the bit-field writers are atomic > > w.r.t. memcpy() on all platforms? If READ_ONCE is needed for an integer, > > using memcpy() seems sketchy, TBH. > > I think bit-field read/write need not be atomic here because even > if a data-race happens, for_each_set_bit() iterates each bit, which > is the real data, regardless of whether data-race happened or not. Err, it looks to me that a corrupt bit would lead to the index getting corrupt and addresses getting skipped or repeated. AFAICT, the RTNL lock is still needed here.
From: "Rémi Denis-Courmont" <remi@remlab.net> Date: Sat, 19 Oct 2024 10:48:57 +0300 > > > > list_for_each_entry_rcu(pnd, &pndevs->list, list) { > > > > > > > > + DECLARE_BITMAP(addrs, 64); > > > > > > > > u8 addr; > > > > > > > > if (dev_idx > dev_start_idx) > > > > > > > > @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb, > > > > struct > > > > netlink_callback *cb) continue; > > > > > > > > addr_idx = 0; > > > > > > > > - for_each_set_bit(addr, pnd->addrs, 64) { > > > > + memcpy(addrs, pnd->addrs, sizeof(pnd->addrs)); > > > > > > Is that really safe? Are we sure that the bit-field writers are atomic > > > w.r.t. memcpy() on all platforms? If READ_ONCE is needed for an integer, > > > using memcpy() seems sketchy, TBH. > > > > I think bit-field read/write need not be atomic here because even > > if a data-race happens, for_each_set_bit() iterates each bit, which > > is the real data, regardless of whether data-race happened or not. > > Err, it looks to me that a corrupt bit would lead to the index getting corrupt > and addresses getting skipped or repeated. AFAICT, the RTNL lock is still > needed here. Let's say pnd->addrs has 0b00010001 as the lowest in 8 bits and addr_dumpit() and addr_doit() are executed concurrently, and addr_doit() tries to change it to 0b00010011. If we have a lock, there could be two results of dumpit(). 1. lock() dumpit() -> 0b00010001 unlock() lock() doit() unlock() 2. lock() doit() unlock() lock() dumpit() -> 0b00010011 unlock() If we don't have a lock and dumpit()'s read and doit()'s write are split to upper 4 bits (0b0001) and lower 4 bits (0b0011), there are 6 patterns of occurences, but the results are only 2 patterns and the same with the locked version. If you get 0b00010001, you can think dumpit() completes earlier, and the opposite if you get 0b00010011. This is how we think with lockless algo. In this case, we do not require lock to iterate bitmaps. 1. upper half read of dumpit() lower half read of dumpit() upper half write of doit() -> 0b0001 lower half write of doit() -> 0b0011 -> 0b00010001 2. upper half read of dumpit() upper half write of doit() -> 0b0001 lower half read of dumpit() lower half write of doit() -> 0b0011 -> 0b00010001 3. upper half write of doit() -> 0b0001 upper half read of dumpit() lower half read of dumpit() lower half write of doit() -> 0b0011 -> 0b00010001 4. upper half write of doit() -> 0b0001 lower half write of doit() -> 0b0011 upper half read of dumpit() lower half read of dumpit() -> 0b00010011 5. upper half write of doit() -> 0b0001 upper half read of dumpit() lower half write of doit() -> 0b0011 lower half read of dumpit() -> 0b00010011 6. upper half read of dumpit() upper half write of doit() -> 0b0001 lower half write of doit() -> 0b0011 lower half read of dumpit() -> 0b00010011
On 10/19/24 09:48, Rémi Denis-Courmont wrote: > Le perjantaina 18. lokakuuta 2024, 20.16.29 EEST Kuniyuki Iwashima a écrit : >> From: "Rémi Denis-Courmont" <remi@remlab.net> >> Date: Thu, 17 Oct 2024 21:49:18 +0300 >> >>>> diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c >>>> index 5996141e258f..14928fa04675 100644 >>>> --- a/net/phonet/pn_netlink.c >>>> +++ b/net/phonet/pn_netlink.c >>>> @@ -127,14 +127,17 @@ static int fill_addr(struct sk_buff *skb, u32 >>>> ifindex, u8 addr, >>>> >>>> static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback >>>> *cb) >>>> >>>> { >>>> + int addr_idx = 0, addr_start_idx = cb->args[1]; >>>> + int dev_idx = 0, dev_start_idx = cb->args[0]; >>>> >>>> struct phonet_device_list *pndevs; >>>> struct phonet_device *pnd; >>>> >>>> - int dev_idx = 0, dev_start_idx = cb->args[0]; >>>> - int addr_idx = 0, addr_start_idx = cb->args[1]; >>>> + int err = 0; >>>> >>>> pndevs = phonet_device_list(sock_net(skb->sk)); >>>> >>>> + >>>> >>>> rcu_read_lock(); >>>> list_for_each_entry_rcu(pnd, &pndevs->list, list) { >>>> >>>> + DECLARE_BITMAP(addrs, 64); >>>> >>>> u8 addr; >>>> >>>> if (dev_idx > dev_start_idx) >>>> >>>> @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb, >>>> struct >>>> netlink_callback *cb) continue; >>>> >>>> addr_idx = 0; >>>> >>>> - for_each_set_bit(addr, pnd->addrs, 64) { >>>> + memcpy(addrs, pnd->addrs, sizeof(pnd->addrs)); >>> >>> Is that really safe? Are we sure that the bit-field writers are atomic >>> w.r.t. memcpy() on all platforms? If READ_ONCE is needed for an integer, >>> using memcpy() seems sketchy, TBH. >> >> I think bit-field read/write need not be atomic here because even >> if a data-race happens, for_each_set_bit() iterates each bit, which >> is the real data, regardless of whether data-race happened or not. > > Err, it looks to me that a corrupt bit would lead to the index getting corrupt > and addresses getting skipped or repeated. AFAICT, the RTNL lock is still > needed here. To wrap-up Kuniyuki's reply: addresses can't be repeated in dump. They can be 'skipped' meaning the dump can race with writer reading an 'old' address bitmask, still not containing the 'new' address. Exactly as could happen with racing dump/writer both protected by the lock. The bottom line is that this code looks safe to me. Thanks, Paolo
On 10/23/24 13:04, Paolo Abeni wrote: > On 10/19/24 09:48, Rémi Denis-Courmont wrote: >>> I think bit-field read/write need not be atomic here because even >>> if a data-race happens, for_each_set_bit() iterates each bit, which >>> is the real data, regardless of whether data-race happened or not. >> >> Err, it looks to me that a corrupt bit would lead to the index getting corrupt >> and addresses getting skipped or repeated. AFAICT, the RTNL lock is still >> needed here. > > To wrap-up Kuniyuki's reply: addresses can't be repeated in dump. They > can be 'skipped' meaning the dump can race with writer reading an 'old' > address bitmask, still not containing the 'new' address. Exactly as > could happen with racing dump/writer both protected by the lock. > > The bottom line is that this code looks safe to me. I'm sorry, I forgot to ask the obvious question: @Rémi, are you ok with this explanation and patch as-is? Thanks! Paolo
On Thu, Oct 17, 2024 at 8:33 PM Kuniyuki Iwashima <kuniyu@amazon.com> wrote: > > getaddr_dumpit() already relies on RCU and does not need RTNL. > > Let's use READ_ONCE() for ifindex and register getaddr_dumpit() > with RTNL_FLAG_DUMP_UNLOCKED. > > While at it, the retval of getaddr_dumpit() is changed to combine > NLMSG_DONE and save recvmsg() as done in 58a4ff5d77b1 ("phonet: no > longer hold RTNL in route_dumpit()"). > > Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> > --- Reviewed-by: Eric Dumazet <edumazet@google.com>
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 5996141e258f..14928fa04675 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -127,14 +127,17 @@ static int fill_addr(struct sk_buff *skb, u32 ifindex, u8 addr, static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + int addr_idx = 0, addr_start_idx = cb->args[1]; + int dev_idx = 0, dev_start_idx = cb->args[0]; struct phonet_device_list *pndevs; struct phonet_device *pnd; - int dev_idx = 0, dev_start_idx = cb->args[0]; - int addr_idx = 0, addr_start_idx = cb->args[1]; + int err = 0; pndevs = phonet_device_list(sock_net(skb->sk)); + rcu_read_lock(); list_for_each_entry_rcu(pnd, &pndevs->list, list) { + DECLARE_BITMAP(addrs, 64); u8 addr; if (dev_idx > dev_start_idx) @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) continue; addr_idx = 0; - for_each_set_bit(addr, pnd->addrs, 64) { + memcpy(addrs, pnd->addrs, sizeof(pnd->addrs)); + + for_each_set_bit(addr, addrs, 64) { if (addr_idx++ < addr_start_idx) continue; - if (fill_addr(skb, pnd->netdev->ifindex, addr << 2, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0) + err = fill_addr(skb, READ_ONCE(pnd->netdev->ifindex), + addr << 2, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWADDR); + if (err < 0) goto out; } } - out: rcu_read_unlock(); + cb->args[0] = dev_idx; cb->args[1] = addr_idx; - return skb->len; + return err; } /* Routes handling */ @@ -298,7 +304,7 @@ static const struct rtnl_msg_handler phonet_rtnl_msg_handlers[] __initdata_or_mo {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELADDR, .doit = addr_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETADDR, - .dumpit = getaddr_dumpit}, + .dumpit = getaddr_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_NEWROUTE, .doit = route_doit}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELROUTE,
getaddr_dumpit() already relies on RCU and does not need RTNL. Let's use READ_ONCE() for ifindex and register getaddr_dumpit() with RTNL_FLAG_DUMP_UNLOCKED. While at it, the retval of getaddr_dumpit() is changed to combine NLMSG_DONE and save recvmsg() as done in 58a4ff5d77b1 ("phonet: no longer hold RTNL in route_dumpit()"). Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> --- net/phonet/pn_netlink.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-)