Message ID | 20231214212443.3638210-3-dw@davidwei.uk (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | netdevsim: link and forward skbs between ports | expand |
Thu, Dec 14, 2023 at 10:24:41PM CET, dw@davidwei.uk wrote: >Forward skbs sent from one netdevsim port to its connected netdevsim >port using dev_forward_skb, in a spirit similar to veth. Perhaps better to write "dev_forward_skb()" to make obvious you talk about function. > >Signed-off-by: David Wei <dw@davidwei.uk> >--- > drivers/net/netdevsim/netdev.c | 23 ++++++++++++++++++----- > 1 file changed, 18 insertions(+), 5 deletions(-) > >diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c >index e290c54b0e70..c5f53b1dbdcc 100644 >--- a/drivers/net/netdevsim/netdev.c >+++ b/drivers/net/netdevsim/netdev.c >@@ -29,19 +29,33 @@ > static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) > { > struct netdevsim *ns = netdev_priv(dev); >+ struct netdevsim *peer_ns; >+ int ret = NETDEV_TX_OK; > >+ rcu_read_lock(); Why do you need to be in rcu read locked section here? > if (!nsim_ipsec_tx(ns, skb)) >- goto out; >+ goto err; Not sure why you need to rename the label. Why "out" is not okay? > > u64_stats_update_begin(&ns->syncp); > ns->tx_packets++; > ns->tx_bytes += skb->len; > u64_stats_update_end(&ns->syncp); > >-out: >- dev_kfree_skb(skb); >+ peer_ns = rcu_dereference(ns->peer); >+ if (!peer_ns) >+ goto err; This is definitelly not an error path, "err" label name is misleading. >+ >+ skb_tx_timestamp(skb); >+ if (unlikely(dev_forward_skb(peer_ns->netdev, skb) == NET_RX_DROP)) >+ ret = NET_XMIT_DROP; Hmm, can't you track dropped packets in ns->tx_dropped and expose in nsim_get_stats64() ? > >- return NETDEV_TX_OK; >+ rcu_read_unlock(); >+ return ret; >+ >+err: >+ rcu_read_unlock(); >+ dev_kfree_skb(skb); >+ return ret; > } > > static void nsim_set_rx_mode(struct net_device *dev) >@@ -302,7 +316,6 @@ static void nsim_setup(struct net_device *dev) > eth_hw_addr_random(dev); > > dev->tx_queue_len = 0; >- dev->flags |= IFF_NOARP; > dev->flags &= ~IFF_MULTICAST; > dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | > IFF_NO_QUEUE; >-- >2.39.3 >
On 2023-12-15 02:45, Jiri Pirko wrote: > Thu, Dec 14, 2023 at 10:24:41PM CET, dw@davidwei.uk wrote: >> Forward skbs sent from one netdevsim port to its connected netdevsim >> port using dev_forward_skb, in a spirit similar to veth. > > Perhaps better to write "dev_forward_skb()" to make obvious you talk > about function. Sorry, it's a bad habit at this point :) > > >> >> Signed-off-by: David Wei <dw@davidwei.uk> >> --- >> drivers/net/netdevsim/netdev.c | 23 ++++++++++++++++++----- >> 1 file changed, 18 insertions(+), 5 deletions(-) >> >> diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c >> index e290c54b0e70..c5f53b1dbdcc 100644 >> --- a/drivers/net/netdevsim/netdev.c >> +++ b/drivers/net/netdevsim/netdev.c >> @@ -29,19 +29,33 @@ >> static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) >> { >> struct netdevsim *ns = netdev_priv(dev); >> + struct netdevsim *peer_ns; >> + int ret = NETDEV_TX_OK; >> >> + rcu_read_lock(); > > Why do you need to be in rcu read locked section here? So the RCU protected pointer `peer` does not change during the critical section. Veth does something similar in its xmit() for its peer. > > >> if (!nsim_ipsec_tx(ns, skb)) >> - goto out; >> + goto err; > > Not sure why you need to rename the label. Why "out" is not okay? > >> >> u64_stats_update_begin(&ns->syncp); >> ns->tx_packets++; >> ns->tx_bytes += skb->len; >> u64_stats_update_end(&ns->syncp); >> >> -out: >> - dev_kfree_skb(skb); >> + peer_ns = rcu_dereference(ns->peer); >> + if (!peer_ns) >> + goto err; > > This is definitelly not an error path, "err" label name is misleading. That's fair, I can change it back. Lots has changed since my original intentions. > > >> + >> + skb_tx_timestamp(skb); >> + if (unlikely(dev_forward_skb(peer_ns->netdev, skb) == NET_RX_DROP)) >> + ret = NET_XMIT_DROP; > > Hmm, can't you track dropped packets in ns->tx_dropped and expose in > nsim_get_stats64() ? I can add this. > > >> >> - return NETDEV_TX_OK; >> + rcu_read_unlock(); >> + return ret; >> + >> +err: >> + rcu_read_unlock(); >> + dev_kfree_skb(skb); >> + return ret; >> } >> >> static void nsim_set_rx_mode(struct net_device *dev) >> @@ -302,7 +316,6 @@ static void nsim_setup(struct net_device *dev) >> eth_hw_addr_random(dev); >> >> dev->tx_queue_len = 0; >> - dev->flags |= IFF_NOARP; >> dev->flags &= ~IFF_MULTICAST; >> dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | >> IFF_NO_QUEUE; >> -- >> 2.39.3 >>
Fri, Dec 15, 2023 at 07:31:42PM CET, dw@davidwei.uk wrote: >On 2023-12-15 02:45, Jiri Pirko wrote: >> Thu, Dec 14, 2023 at 10:24:41PM CET, dw@davidwei.uk wrote: >>> Forward skbs sent from one netdevsim port to its connected netdevsim >>> port using dev_forward_skb, in a spirit similar to veth. >> >> Perhaps better to write "dev_forward_skb()" to make obvious you talk >> about function. > >Sorry, it's a bad habit at this point :) > >> >> >>> >>> Signed-off-by: David Wei <dw@davidwei.uk> >>> --- >>> drivers/net/netdevsim/netdev.c | 23 ++++++++++++++++++----- >>> 1 file changed, 18 insertions(+), 5 deletions(-) >>> >>> diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c >>> index e290c54b0e70..c5f53b1dbdcc 100644 >>> --- a/drivers/net/netdevsim/netdev.c >>> +++ b/drivers/net/netdevsim/netdev.c >>> @@ -29,19 +29,33 @@ >>> static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) >>> { >>> struct netdevsim *ns = netdev_priv(dev); >>> + struct netdevsim *peer_ns; >>> + int ret = NETDEV_TX_OK; >>> >>> + rcu_read_lock(); >> >> Why do you need to be in rcu read locked section here? > >So the RCU protected pointer `peer` does not change during the critical >section. Veth does something similar in its xmit() for its peer. RCU does not work like this. Please check out the documentation. > >> >> >>> if (!nsim_ipsec_tx(ns, skb)) >>> - goto out; >>> + goto err; >> >> Not sure why you need to rename the label. Why "out" is not okay? >> >>> >>> u64_stats_update_begin(&ns->syncp); >>> ns->tx_packets++; >>> ns->tx_bytes += skb->len; >>> u64_stats_update_end(&ns->syncp); >>> >>> -out: >>> - dev_kfree_skb(skb); >>> + peer_ns = rcu_dereference(ns->peer); >>> + if (!peer_ns) >>> + goto err; >> >> This is definitelly not an error path, "err" label name is misleading. > >That's fair, I can change it back. Lots has changed since my original >intentions. > >> >> >>> + >>> + skb_tx_timestamp(skb); >>> + if (unlikely(dev_forward_skb(peer_ns->netdev, skb) == NET_RX_DROP)) >>> + ret = NET_XMIT_DROP; >> >> Hmm, can't you track dropped packets in ns->tx_dropped and expose in >> nsim_get_stats64() ? > >I can add this. > >> >> >>> >>> - return NETDEV_TX_OK; >>> + rcu_read_unlock(); >>> + return ret; >>> + >>> +err: >>> + rcu_read_unlock(); >>> + dev_kfree_skb(skb); >>> + return ret; >>> } >>> >>> static void nsim_set_rx_mode(struct net_device *dev) >>> @@ -302,7 +316,6 @@ static void nsim_setup(struct net_device *dev) >>> eth_hw_addr_random(dev); >>> >>> dev->tx_queue_len = 0; >>> - dev->flags |= IFF_NOARP; >>> dev->flags &= ~IFF_MULTICAST; >>> dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | >>> IFF_NO_QUEUE; >>> -- >>> 2.39.3 >>>
On 2023-12-16 01:22, Jiri Pirko wrote: > Fri, Dec 15, 2023 at 07:31:42PM CET, dw@davidwei.uk wrote: >> On 2023-12-15 02:45, Jiri Pirko wrote: >>> Thu, Dec 14, 2023 at 10:24:41PM CET, dw@davidwei.uk wrote: >>>> Forward skbs sent from one netdevsim port to its connected netdevsim >>>> port using dev_forward_skb, in a spirit similar to veth. >>> >>> Perhaps better to write "dev_forward_skb()" to make obvious you talk >>> about function. >> >> Sorry, it's a bad habit at this point :) >> >>> >>> >>>> >>>> Signed-off-by: David Wei <dw@davidwei.uk> >>>> --- >>>> drivers/net/netdevsim/netdev.c | 23 ++++++++++++++++++----- >>>> 1 file changed, 18 insertions(+), 5 deletions(-) >>>> >>>> diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c >>>> index e290c54b0e70..c5f53b1dbdcc 100644 >>>> --- a/drivers/net/netdevsim/netdev.c >>>> +++ b/drivers/net/netdevsim/netdev.c >>>> @@ -29,19 +29,33 @@ >>>> static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) >>>> { >>>> struct netdevsim *ns = netdev_priv(dev); >>>> + struct netdevsim *peer_ns; >>>> + int ret = NETDEV_TX_OK; >>>> >>>> + rcu_read_lock(); >>> >>> Why do you need to be in rcu read locked section here? >> >> So the RCU protected pointer `peer` does not change during the critical >> section. Veth does something similar in its xmit() for its peer. > > RCU does not work like this. Please check out the documentation. When destroying a netdevsim in nsim_destroy(), rtnl_lock is held which prevents concurrent destruction of netdevsims. Then, unregister_netdevice() will eventually call synchronize_rcu_expedited(). Let's say we have two netdevsims, A linked with B, where A->peer is B and B->peer is A. If we're destroying B in nsim_destroy(), then we first do rcu_assign_pointer(A->peer, NULL). Of course, any read-side critical sections that dereferenced a non-NULL A->peer won't be affected by this update. Then B's nsim_destroy() calls unregister_netdevice(), followed eventually by synchronize_rcu_expedited(). As I understand RCU, this will wait for one RCU grace period, or any nsim_start_xmit() that started _before_ B's rcu_assign_pointer(A->peer, NULL) to complete. RCU docs say that the caller of synchronize_rcu() upon return may be again concurrent w/ another nsim_start_xmit() reader. But they should see NULL for A->peer ptr due to the rcu_assign_pointer(A->peer, NULL) update during B's nsim_destroy(). So after synchronize_rcu() no skb from A should be forwarded to B anymore. In fact, it looks like since v5.0 being in a softirq handler serves as an RCU read-side critical section. So the rcu_read_lock() here in nsim_start_xmit() is actually redundant. I believe this is veth's intention too. There is a comment in veth_dellink() that says the pair of peer devices are guaranteed to be not freed before one RCU grace period. As long as adding/removing links is also under rtnl_lock, I think with RCU guarantees discussed above we will be SMP safe. Does this seem right to you? > > >> >>> >>> >>>> if (!nsim_ipsec_tx(ns, skb)) >>>> - goto out; >>>> + goto err; >>> >>> Not sure why you need to rename the label. Why "out" is not okay? >>> >>>> >>>> u64_stats_update_begin(&ns->syncp); >>>> ns->tx_packets++; >>>> ns->tx_bytes += skb->len; >>>> u64_stats_update_end(&ns->syncp); >>>> >>>> -out: >>>> - dev_kfree_skb(skb); >>>> + peer_ns = rcu_dereference(ns->peer); >>>> + if (!peer_ns) >>>> + goto err; >>> >>> This is definitelly not an error path, "err" label name is misleading. >> >> That's fair, I can change it back. Lots has changed since my original >> intentions. >> >>> >>> >>>> + >>>> + skb_tx_timestamp(skb); >>>> + if (unlikely(dev_forward_skb(peer_ns->netdev, skb) == NET_RX_DROP)) >>>> + ret = NET_XMIT_DROP; >>> >>> Hmm, can't you track dropped packets in ns->tx_dropped and expose in >>> nsim_get_stats64() ? >> >> I can add this. >> >>> >>> >>>> >>>> - return NETDEV_TX_OK; >>>> + rcu_read_unlock(); >>>> + return ret; >>>> + >>>> +err: >>>> + rcu_read_unlock(); >>>> + dev_kfree_skb(skb); >>>> + return ret; >>>> } >>>> >>>> static void nsim_set_rx_mode(struct net_device *dev) >>>> @@ -302,7 +316,6 @@ static void nsim_setup(struct net_device *dev) >>>> eth_hw_addr_random(dev); >>>> >>>> dev->tx_queue_len = 0; >>>> - dev->flags |= IFF_NOARP; >>>> dev->flags &= ~IFF_MULTICAST; >>>> dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | >>>> IFF_NO_QUEUE; >>>> -- >>>> 2.39.3 >>>>
Sun, Dec 17, 2023 at 03:59:53AM CET, dw@davidwei.uk wrote: >On 2023-12-16 01:22, Jiri Pirko wrote: >> Fri, Dec 15, 2023 at 07:31:42PM CET, dw@davidwei.uk wrote: >>> On 2023-12-15 02:45, Jiri Pirko wrote: >>>> Thu, Dec 14, 2023 at 10:24:41PM CET, dw@davidwei.uk wrote: >>>>> Forward skbs sent from one netdevsim port to its connected netdevsim >>>>> port using dev_forward_skb, in a spirit similar to veth. >>>> >>>> Perhaps better to write "dev_forward_skb()" to make obvious you talk >>>> about function. >>> >>> Sorry, it's a bad habit at this point :) >>> >>>> >>>> >>>>> >>>>> Signed-off-by: David Wei <dw@davidwei.uk> >>>>> --- >>>>> drivers/net/netdevsim/netdev.c | 23 ++++++++++++++++++----- >>>>> 1 file changed, 18 insertions(+), 5 deletions(-) >>>>> >>>>> diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c >>>>> index e290c54b0e70..c5f53b1dbdcc 100644 >>>>> --- a/drivers/net/netdevsim/netdev.c >>>>> +++ b/drivers/net/netdevsim/netdev.c >>>>> @@ -29,19 +29,33 @@ >>>>> static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) >>>>> { >>>>> struct netdevsim *ns = netdev_priv(dev); >>>>> + struct netdevsim *peer_ns; >>>>> + int ret = NETDEV_TX_OK; >>>>> >>>>> + rcu_read_lock(); >>>> >>>> Why do you need to be in rcu read locked section here? >>> >>> So the RCU protected pointer `peer` does not change during the critical >>> section. Veth does something similar in its xmit() for its peer. >> >> RCU does not work like this. Please check out the documentation. > >When destroying a netdevsim in nsim_destroy(), rtnl_lock is held which prevents >concurrent destruction of netdevsims. Then, unregister_netdevice() will >eventually call synchronize_rcu_expedited(). > >Let's say we have two netdevsims, A linked with B, where A->peer is B and >B->peer is A. > >If we're destroying B in nsim_destroy(), then we first do >rcu_assign_pointer(A->peer, NULL). Of course, any read-side critical sections >that dereferenced a non-NULL A->peer won't be affected by this update. > >Then B's nsim_destroy() calls unregister_netdevice(), followed eventually by >synchronize_rcu_expedited(). As I understand RCU, this will wait for one RCU >grace period, or any nsim_start_xmit() that started _before_ B's >rcu_assign_pointer(A->peer, NULL) to complete. > >RCU docs say that the caller of synchronize_rcu() upon return may be again >concurrent w/ another nsim_start_xmit() reader. But they should see NULL for >A->peer ptr due to the rcu_assign_pointer(A->peer, NULL) update during B's >nsim_destroy(). So after synchronize_rcu() no skb from A should be forwarded to >B anymore. > >In fact, it looks like since v5.0 being in a softirq handler serves as an RCU >read-side critical section. So the rcu_read_lock() here in nsim_start_xmit() is >actually redundant. > >I believe this is veth's intention too. There is a comment in veth_dellink() >that says the pair of peer devices are guaranteed to be not freed before one >RCU grace period. > >As long as adding/removing links is also under rtnl_lock, I think with RCU >guarantees discussed above we will be SMP safe. Does this seem right to you? > >> >> >>> >>>> >>>> >>>>> if (!nsim_ipsec_tx(ns, skb)) >>>>> - goto out; >>>>> + goto err; >>>> >>>> Not sure why you need to rename the label. Why "out" is not okay? >>>> >>>>> >>>>> u64_stats_update_begin(&ns->syncp); >>>>> ns->tx_packets++; >>>>> ns->tx_bytes += skb->len; >>>>> u64_stats_update_end(&ns->syncp); >>>>> >>>>> -out: >>>>> - dev_kfree_skb(skb); My point is, why don't take rcu_read_lock() here. >>>>> + peer_ns = rcu_dereference(ns->peer); >>>>> + if (!peer_ns) >>>>> + goto err; >>>> >>>> This is definitelly not an error path, "err" label name is misleading. >>> >>> That's fair, I can change it back. Lots has changed since my original >>> intentions. >>> >>>> >>>> >>>>> + >>>>> + skb_tx_timestamp(skb); >>>>> + if (unlikely(dev_forward_skb(peer_ns->netdev, skb) == NET_RX_DROP)) >>>>> + ret = NET_XMIT_DROP; >>>> >>>> Hmm, can't you track dropped packets in ns->tx_dropped and expose in >>>> nsim_get_stats64() ? >>> >>> I can add this. >>> >>>> >>>> >>>>> >>>>> - return NETDEV_TX_OK; >>>>> + rcu_read_unlock(); >>>>> + return ret; >>>>> + >>>>> +err: >>>>> + rcu_read_unlock(); >>>>> + dev_kfree_skb(skb); >>>>> + return ret; >>>>> } >>>>> >>>>> static void nsim_set_rx_mode(struct net_device *dev) >>>>> @@ -302,7 +316,6 @@ static void nsim_setup(struct net_device *dev) >>>>> eth_hw_addr_random(dev); >>>>> >>>>> dev->tx_queue_len = 0; >>>>> - dev->flags |= IFF_NOARP; >>>>> dev->flags &= ~IFF_MULTICAST; >>>>> dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | >>>>> IFF_NO_QUEUE; >>>>> -- >>>>> 2.39.3 >>>>>
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index e290c54b0e70..c5f53b1dbdcc 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -29,19 +29,33 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); + struct netdevsim *peer_ns; + int ret = NETDEV_TX_OK; + rcu_read_lock(); if (!nsim_ipsec_tx(ns, skb)) - goto out; + goto err; u64_stats_update_begin(&ns->syncp); ns->tx_packets++; ns->tx_bytes += skb->len; u64_stats_update_end(&ns->syncp); -out: - dev_kfree_skb(skb); + peer_ns = rcu_dereference(ns->peer); + if (!peer_ns) + goto err; + + skb_tx_timestamp(skb); + if (unlikely(dev_forward_skb(peer_ns->netdev, skb) == NET_RX_DROP)) + ret = NET_XMIT_DROP; - return NETDEV_TX_OK; + rcu_read_unlock(); + return ret; + +err: + rcu_read_unlock(); + dev_kfree_skb(skb); + return ret; } static void nsim_set_rx_mode(struct net_device *dev) @@ -302,7 +316,6 @@ static void nsim_setup(struct net_device *dev) eth_hw_addr_random(dev); dev->tx_queue_len = 0; - dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
Forward skbs sent from one netdevsim port to its connected netdevsim port using dev_forward_skb, in a spirit similar to veth. Signed-off-by: David Wei <dw@davidwei.uk> --- drivers/net/netdevsim/netdev.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-)