Message ID | 20240403123913.4173904-1-edumazet@google.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net-next] ipv6: remove RTNL protection from ip6addrlbl_dump() | expand |
On 4/3/24 6:39 AM, Eric Dumazet wrote: > diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c > index 17ac45aa7194ce9c148ed95e14dd575d17feeb98..961e853543b64cd2060ef693ae3ad32a44780aa1 100644 > --- a/net/ipv6/addrlabel.c > +++ b/net/ipv6/addrlabel.c > @@ -234,7 +234,8 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, > hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); > out: > if (!ret) > - net->ipv6.ip6addrlbl_table.seq++; > + WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq, > + net->ipv6.ip6addrlbl_table.seq + 1); > return ret; > } > > @@ -445,7 +446,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, > }; > > static int ip6addrlbl_fill(struct sk_buff *skb, > - struct ip6addrlbl_entry *p, > + const struct ip6addrlbl_entry *p, > u32 lseq, > u32 portid, u32 seq, int event, > unsigned int flags) > @@ -498,7 +499,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) > struct net *net = sock_net(skb->sk); > struct ip6addrlbl_entry *p; > int idx = 0, s_idx = cb->args[0]; > - int err; > + int err = 0; > > if (cb->strict_check) { > err = ip6addrlbl_valid_dump_req(nlh, cb->extack); > @@ -510,7 +511,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) > hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { > if (idx >= s_idx) { > err = ip6addrlbl_fill(skb, p, > - net->ipv6.ip6addrlbl_table.seq, > + READ_ONCE(net->ipv6.ip6addrlbl_table.seq), seems like this should be read once on entry, and the same value used for all iterations. > NETLINK_CB(cb->skb).portid, > nlh->nlmsg_seq, > RTM_NEWADDRLABEL, > @@ -522,7 +523,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) > } > rcu_read_unlock(); > cb->args[0] = idx; > - return skb->len; > + return err; > } > > static inline int ip6addrlbl_msgsize(void) > @@ -614,7 +615,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, > > rcu_read_lock(); > p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); > - lseq = net->ipv6.ip6addrlbl_table.seq; > + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); > if (p) > err = ip6addrlbl_fill(skb, p, lseq, > NETLINK_CB(in_skb).portid, > @@ -647,6 +648,7 @@ int __init ipv6_addr_label_rtnl_register(void) > return ret; > ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, > ip6addrlbl_get, > - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); > + ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED | > + RTNL_FLAG_DUMP_UNLOCKED); > return ret; > }
On Wed, Apr 3, 2024 at 6:13 PM David Ahern <dsahern@kernel.org> wrote: > > On 4/3/24 6:39 AM, Eric Dumazet wrote: > > diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c > > index 17ac45aa7194ce9c148ed95e14dd575d17feeb98..961e853543b64cd2060ef693ae3ad32a44780aa1 100644 > > --- a/net/ipv6/addrlabel.c > > +++ b/net/ipv6/addrlabel.c > > @@ -234,7 +234,8 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, > > hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); > > out: > > if (!ret) > > - net->ipv6.ip6addrlbl_table.seq++; > > + WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq, > > + net->ipv6.ip6addrlbl_table.seq + 1); > > return ret; > > } > > > > @@ -445,7 +446,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, > > }; > > > > static int ip6addrlbl_fill(struct sk_buff *skb, > > - struct ip6addrlbl_entry *p, > > + const struct ip6addrlbl_entry *p, > > u32 lseq, > > u32 portid, u32 seq, int event, > > unsigned int flags) > > @@ -498,7 +499,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) > > struct net *net = sock_net(skb->sk); > > struct ip6addrlbl_entry *p; > > int idx = 0, s_idx = cb->args[0]; > > - int err; > > + int err = 0; > > > > if (cb->strict_check) { > > err = ip6addrlbl_valid_dump_req(nlh, cb->extack); > > @@ -510,7 +511,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) > > hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { > > if (idx >= s_idx) { > > err = ip6addrlbl_fill(skb, p, > > - net->ipv6.ip6addrlbl_table.seq, > > + READ_ONCE(net->ipv6.ip6addrlbl_table.seq), > > seems like this should be read once on entry, and the same value used > for all iterations. I thought of that, but this will miss any update done concurrently (if all entries fit in a single skb) It is unclear to me what user space can do with ifal_seq There is no clear signal like : Initial seq _before_ the dump, and seq seen _after_ the dump.
On 4/3/24 10:25 AM, Eric Dumazet wrote: > On Wed, Apr 3, 2024 at 6:13 PM David Ahern <dsahern@kernel.org> wrote: >> >> On 4/3/24 6:39 AM, Eric Dumazet wrote: >>> diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c >>> index 17ac45aa7194ce9c148ed95e14dd575d17feeb98..961e853543b64cd2060ef693ae3ad32a44780aa1 100644 >>> --- a/net/ipv6/addrlabel.c >>> +++ b/net/ipv6/addrlabel.c >>> @@ -234,7 +234,8 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, >>> hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); >>> out: >>> if (!ret) >>> - net->ipv6.ip6addrlbl_table.seq++; >>> + WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq, >>> + net->ipv6.ip6addrlbl_table.seq + 1); >>> return ret; >>> } >>> >>> @@ -445,7 +446,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, >>> }; >>> >>> static int ip6addrlbl_fill(struct sk_buff *skb, >>> - struct ip6addrlbl_entry *p, >>> + const struct ip6addrlbl_entry *p, >>> u32 lseq, >>> u32 portid, u32 seq, int event, >>> unsigned int flags) >>> @@ -498,7 +499,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) >>> struct net *net = sock_net(skb->sk); >>> struct ip6addrlbl_entry *p; >>> int idx = 0, s_idx = cb->args[0]; >>> - int err; >>> + int err = 0; >>> >>> if (cb->strict_check) { >>> err = ip6addrlbl_valid_dump_req(nlh, cb->extack); >>> @@ -510,7 +511,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) >>> hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { >>> if (idx >= s_idx) { >>> err = ip6addrlbl_fill(skb, p, >>> - net->ipv6.ip6addrlbl_table.seq, >>> + READ_ONCE(net->ipv6.ip6addrlbl_table.seq), >> >> seems like this should be read once on entry, and the same value used >> for all iterations. > > I thought of that, but this will miss any update done concurrently (if > all entries fit in a single skb) > > It is unclear to me what user space can do with ifal_seq > > There is no clear signal like : > > Initial seq _before_ the dump, and seq seen _after_ the dump. Since this a known change in behavior (however slight), it would be best to add something in the commit message or a code comment so if someone hits a problem and does a git bisect the problem and solution are clear.
On Wed, Apr 3, 2024 at 7:49 PM David Ahern <dsahern@kernel.org> wrote: > Since this a known change in behavior (however slight), it would be best > to add something in the commit message or a code comment so if someone > hits a problem and does a git bisect the problem and solution are clear. Ok. The cb->seq / nl_dump_check_consistent() protocol could be implemented later, if necessary.
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 17ac45aa7194ce9c148ed95e14dd575d17feeb98..961e853543b64cd2060ef693ae3ad32a44780aa1 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -234,7 +234,8 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); out: if (!ret) - net->ipv6.ip6addrlbl_table.seq++; + WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq, + net->ipv6.ip6addrlbl_table.seq + 1); return ret; } @@ -445,7 +446,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh, }; static int ip6addrlbl_fill(struct sk_buff *skb, - struct ip6addrlbl_entry *p, + const struct ip6addrlbl_entry *p, u32 lseq, u32 portid, u32 seq, int event, unsigned int flags) @@ -498,7 +499,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; int idx = 0, s_idx = cb->args[0]; - int err; + int err = 0; if (cb->strict_check) { err = ip6addrlbl_valid_dump_req(nlh, cb->extack); @@ -510,7 +511,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, - net->ipv6.ip6addrlbl_table.seq, + READ_ONCE(net->ipv6.ip6addrlbl_table.seq), NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWADDRLABEL, @@ -522,7 +523,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) } rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline int ip6addrlbl_msgsize(void) @@ -614,7 +615,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - lseq = net->ipv6.ip6addrlbl_table.seq; + lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq); if (p) err = ip6addrlbl_fill(skb, p, lseq, NETLINK_CB(in_skb).portid, @@ -647,6 +648,7 @@ int __init ipv6_addr_label_rtnl_register(void) return ret; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED); + ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED | + RTNL_FLAG_DUMP_UNLOCKED); return ret; }
No longer hold RTNL while calling ip6addrlbl_dump() ("ip addrlabel show") ip6addrlbl_dump() was already mostly relying on RCU anyway. Add READ_ONCE()/WRITE_ONCE() annotations around net->ipv6.ip6addrlbl_table.seq Also change return value for a completed dump, so that NLMSG_DONE can be appended to current skb, saving one recvmsg() system call. Signed-off-by: Eric Dumazet <edumazet@google.com> --- net/ipv6/addrlabel.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-)