diff mbox series

[net-next,07/14] net_sched: sch_ets: implement lockless ets_dump()

Message ID 20240415132054.3822230-8-edumazet@google.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series net_sched: first series for RTNL-less qdisc dumps | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 926 this patch: 926
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 937 this patch: 937
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 937 this patch: 937
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 73 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-04-18--00-00 (tests: 961)

Commit Message

Eric Dumazet April 15, 2024, 1:20 p.m. UTC
Instead of relying on RTNL, ets_dump() can use READ_ONCE()
annotations, paired with WRITE_ONCE() ones in ets_change().

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/sched/sch_ets.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

Comments

Simon Horman April 17, 2024, 4:54 p.m. UTC | #1
On Mon, Apr 15, 2024 at 01:20:47PM +0000, Eric Dumazet wrote:
> Instead of relying on RTNL, ets_dump() can use READ_ONCE()
> annotations, paired with WRITE_ONCE() ones in ets_change().
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  net/sched/sch_ets.c | 25 ++++++++++++++-----------
>  1 file changed, 14 insertions(+), 11 deletions(-)
> 
> diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
> index 835b4460b44854a803d3054e744702022b7551f4..f80bc05d4c5a5050226e6cfd30fa951c0b61029f 100644
> --- a/net/sched/sch_ets.c
> +++ b/net/sched/sch_ets.c

...

> @@ -658,11 +658,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
>  			list_del(&q->classes[i].alist);
>  		qdisc_tree_flush_backlog(q->classes[i].qdisc);
>  	}
> -	q->nstrict = nstrict;
> +	WRITE_ONCE(q->nstrict, nstrict);
>  	memcpy(q->prio2band, priomap, sizeof(priomap));

Hi Eric,

I think that writing elements of q->prio2band needs WRITE_ONCE() treatment too.

>  	for (i = 0; i < q->nbands; i++)
> -		q->classes[i].quantum = quanta[i];
> +		WRITE_ONCE(q->classes[i].quantum, quanta[i]);
>  
>  	for (i = oldbands; i < q->nbands; i++) {
>  		q->classes[i].qdisc = queues[i];

...

> @@ -733,6 +733,7 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
>  	struct ets_sched *q = qdisc_priv(sch);
>  	struct nlattr *opts;
>  	struct nlattr *nest;
> +	u8 nbands, nstrict;
>  	int band;
>  	int prio;
>  	int err;

The next few lines of this function are:

	err = ets_offload_dump(sch);
	if (err)
		return err;

Where ets_offload_dump may indirectly call ndo_setup_tc().
And I am concerned that ndo_setup_tc() expects RTNL to be held,
although perhaps that assumption is out of date.

...

> @@ -771,7 +773,8 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
>  		goto nla_err;
>  
>  	for (prio = 0; prio <= TC_PRIO_MAX; prio++) {
> -		if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio]))
> +		if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND,
> +			       READ_ONCE(q->prio2band[prio])))
>  			goto nla_err;
>  	}

...
Eric Dumazet April 17, 2024, 5:08 p.m. UTC | #2
On Wed, Apr 17, 2024 at 6:54 PM Simon Horman <horms@kernel.org> wrote:
>
> On Mon, Apr 15, 2024 at 01:20:47PM +0000, Eric Dumazet wrote:
> > Instead of relying on RTNL, ets_dump() can use READ_ONCE()
> > annotations, paired with WRITE_ONCE() ones in ets_change().
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > ---
> >  net/sched/sch_ets.c | 25 ++++++++++++++-----------
> >  1 file changed, 14 insertions(+), 11 deletions(-)
> >
> > diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
> > index 835b4460b44854a803d3054e744702022b7551f4..f80bc05d4c5a5050226e6cfd30fa951c0b61029f 100644
> > --- a/net/sched/sch_ets.c
> > +++ b/net/sched/sch_ets.c
>
> ...
>
> > @@ -658,11 +658,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
> >                       list_del(&q->classes[i].alist);
> >               qdisc_tree_flush_backlog(q->classes[i].qdisc);
> >       }
> > -     q->nstrict = nstrict;
> > +     WRITE_ONCE(q->nstrict, nstrict);
> >       memcpy(q->prio2band, priomap, sizeof(priomap));
>
> Hi Eric,
>
> I think that writing elements of q->prio2band needs WRITE_ONCE() treatment too.

Not really; these are bytes, and a CPU will not write a byte one bit at a time.

I could add WRITE_ONCE(), but this is overkill IMO.

>
> >       for (i = 0; i < q->nbands; i++)
> > -             q->classes[i].quantum = quanta[i];
> > +             WRITE_ONCE(q->classes[i].quantum, quanta[i]);
> >
> >       for (i = oldbands; i < q->nbands; i++) {
> >               q->classes[i].qdisc = queues[i];
>
> ...
>
> > @@ -733,6 +733,7 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
> >       struct ets_sched *q = qdisc_priv(sch);
> >       struct nlattr *opts;
> >       struct nlattr *nest;
> > +     u8 nbands, nstrict;
> >       int band;
> >       int prio;
> >       int err;
>
> The next few lines of this function are:
>
>         err = ets_offload_dump(sch);
>         if (err)
>                 return err;
>
> Where ets_offload_dump may indirectly call ndo_setup_tc().
> And I am concerned that ndo_setup_tc() expects RTNL to be held,
> although perhaps that assumption is out of date.

Thanks, we will add rtnl locking later only in the helper,
or make sure it can run under RCU.

Note the patch series does not yet remove RTNL locking.

Clearly, masking and setting TCQ_F_OFFLOADED in sch->flags in a dump
operation is not very nice IMO.


>
> ...
>
> > @@ -771,7 +773,8 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
> >               goto nla_err;
> >
> >       for (prio = 0; prio <= TC_PRIO_MAX; prio++) {
> > -             if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio]))
> > +             if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND,
> > +                            READ_ONCE(q->prio2band[prio])))
> >                       goto nla_err;
> >       }
>
> ...
Simon Horman April 17, 2024, 5:17 p.m. UTC | #3
On Wed, Apr 17, 2024 at 07:08:17PM +0200, Eric Dumazet wrote:
> On Wed, Apr 17, 2024 at 6:54 PM Simon Horman <horms@kernel.org> wrote:
> >
> > On Mon, Apr 15, 2024 at 01:20:47PM +0000, Eric Dumazet wrote:
> > > Instead of relying on RTNL, ets_dump() can use READ_ONCE()
> > > annotations, paired with WRITE_ONCE() ones in ets_change().
> > >
> > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > > ---
> > >  net/sched/sch_ets.c | 25 ++++++++++++++-----------
> > >  1 file changed, 14 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
> > > index 835b4460b44854a803d3054e744702022b7551f4..f80bc05d4c5a5050226e6cfd30fa951c0b61029f 100644
> > > --- a/net/sched/sch_ets.c
> > > +++ b/net/sched/sch_ets.c
> >
> > ...
> >
> > > @@ -658,11 +658,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
> > >                       list_del(&q->classes[i].alist);
> > >               qdisc_tree_flush_backlog(q->classes[i].qdisc);
> > >       }
> > > -     q->nstrict = nstrict;
> > > +     WRITE_ONCE(q->nstrict, nstrict);
> > >       memcpy(q->prio2band, priomap, sizeof(priomap));
> >
> > Hi Eric,
> >
> > I think that writing elements of q->prio2band needs WRITE_ONCE() treatment too.
> 
> Not really; these are bytes, and a CPU will not write a byte one bit at a time.
> 
> I could add WRITE_ONCE(), but this is overkill IMO.

Thanks, armed with that understanding I'm now happy with this patch.

Reviewed-by: Simon Horman <horms@kernel.org>

> > >       for (i = 0; i < q->nbands; i++)
> > > -             q->classes[i].quantum = quanta[i];
> > > +             WRITE_ONCE(q->classes[i].quantum, quanta[i]);
> > >
> > >       for (i = oldbands; i < q->nbands; i++) {
> > >               q->classes[i].qdisc = queues[i];
> >
> > ...
> >
> > > @@ -733,6 +733,7 @@ static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
> > >       struct ets_sched *q = qdisc_priv(sch);
> > >       struct nlattr *opts;
> > >       struct nlattr *nest;
> > > +     u8 nbands, nstrict;
> > >       int band;
> > >       int prio;
> > >       int err;
> >
> > The next few lines of this function are:
> >
> >         err = ets_offload_dump(sch);
> >         if (err)
> >                 return err;
> >
> > Where ets_offload_dump may indirectly call ndo_setup_tc().
> > And I am concerned that ndo_setup_tc() expects RTNL to be held,
> > although perhaps that assumption is out of date.
> 
> Thanks, we will add rtnl locking later only in the helper,
> or make sure it can run under RCU.
> 
> Note the patch series does not yet remove RTNL locking.

Yes, I understand that.
I was more flagging this as something that needs to be addressed.
Sorry for not being clearer.

> Clearly, masking and setting TCQ_F_OFFLOADED in sch->flags in a dump
> operation is not very nice IMO.

Yes, I noticed that too.

...
diff mbox series

Patch

diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
index 835b4460b44854a803d3054e744702022b7551f4..f80bc05d4c5a5050226e6cfd30fa951c0b61029f 100644
--- a/net/sched/sch_ets.c
+++ b/net/sched/sch_ets.c
@@ -646,7 +646,7 @@  static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
 
 	sch_tree_lock(sch);
 
-	q->nbands = nbands;
+	WRITE_ONCE(q->nbands, nbands);
 	for (i = nstrict; i < q->nstrict; i++) {
 		if (q->classes[i].qdisc->q.qlen) {
 			list_add_tail(&q->classes[i].alist, &q->active);
@@ -658,11 +658,11 @@  static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
 			list_del(&q->classes[i].alist);
 		qdisc_tree_flush_backlog(q->classes[i].qdisc);
 	}
-	q->nstrict = nstrict;
+	WRITE_ONCE(q->nstrict, nstrict);
 	memcpy(q->prio2band, priomap, sizeof(priomap));
 
 	for (i = 0; i < q->nbands; i++)
-		q->classes[i].quantum = quanta[i];
+		WRITE_ONCE(q->classes[i].quantum, quanta[i]);
 
 	for (i = oldbands; i < q->nbands; i++) {
 		q->classes[i].qdisc = queues[i];
@@ -676,7 +676,7 @@  static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
 	for (i = q->nbands; i < oldbands; i++) {
 		qdisc_put(q->classes[i].qdisc);
 		q->classes[i].qdisc = NULL;
-		q->classes[i].quantum = 0;
+		WRITE_ONCE(q->classes[i].quantum, 0);
 		q->classes[i].deficit = 0;
 		gnet_stats_basic_sync_init(&q->classes[i].bstats);
 		memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats));
@@ -733,6 +733,7 @@  static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct ets_sched *q = qdisc_priv(sch);
 	struct nlattr *opts;
 	struct nlattr *nest;
+	u8 nbands, nstrict;
 	int band;
 	int prio;
 	int err;
@@ -745,21 +746,22 @@  static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (!opts)
 		goto nla_err;
 
-	if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands))
+	nbands = READ_ONCE(q->nbands);
+	if (nla_put_u8(skb, TCA_ETS_NBANDS, nbands))
 		goto nla_err;
 
-	if (q->nstrict &&
-	    nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict))
+	nstrict = READ_ONCE(q->nstrict);
+	if (nstrict && nla_put_u8(skb, TCA_ETS_NSTRICT, nstrict))
 		goto nla_err;
 
-	if (q->nbands > q->nstrict) {
+	if (nbands > nstrict) {
 		nest = nla_nest_start(skb, TCA_ETS_QUANTA);
 		if (!nest)
 			goto nla_err;
 
-		for (band = q->nstrict; band < q->nbands; band++) {
+		for (band = nstrict; band < nbands; band++) {
 			if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND,
-					q->classes[band].quantum))
+					READ_ONCE(q->classes[band].quantum)))
 				goto nla_err;
 		}
 
@@ -771,7 +773,8 @@  static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
 		goto nla_err;
 
 	for (prio = 0; prio <= TC_PRIO_MAX; prio++) {
-		if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio]))
+		if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND,
+			       READ_ONCE(q->prio2band[prio])))
 			goto nla_err;
 	}