diff mbox series

[RFC,v9,07/11] bpf: net_sched: Allow more optional operators in Qdisc_ops

Message ID 20240714175130.4051012-8-amery.hung@bytedance.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series bpf qdisc | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-15 success Logs for x86_64-gcc / test
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-llvm-17 / test
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-llvm-18 / test
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-18 / veristat
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 832 this patch: 832
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 3 maintainers not CCed: kuba@kernel.org edumazet@google.com pabeni@redhat.com
netdev/build_clang success Errors and warnings before: 845 this patch: 845
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2305 this patch: 2305
netdev/checkpatch warning WARNING: From:/Signed-off-by: email address mismatch: 'From: Amery Hung <ameryhung@gmail.com>' != 'Signed-off-by: Amery Hung <amery.hung@bytedance.com>' WARNING: line length of 97 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Amery Hung July 14, 2024, 5:51 p.m. UTC
So far, init, reset, and destroy are implemented by bpf qdisc infra as
fixed operators that manipulate the watchdog according to the occasion.
This patch allows users to implement these three operators to perform
desired work alongside the predefined ones.

Signed-off-by: Amery Hung <amery.hung@bytedance.com>
---
 include/net/sch_generic.h |  6 ++++++
 net/sched/bpf_qdisc.c     | 20 ++++----------------
 net/sched/sch_api.c       | 11 +++++++++++
 net/sched/sch_generic.c   |  8 ++++++++
 4 files changed, 29 insertions(+), 16 deletions(-)

Comments

Martin KaFai Lau July 26, 2024, 1:15 a.m. UTC | #1
On 7/14/24 10:51 AM, Amery Hung wrote:
> So far, init, reset, and destroy are implemented by bpf qdisc infra as
> fixed operators that manipulate the watchdog according to the occasion.
> This patch allows users to implement these three operators to perform
> desired work alongside the predefined ones.
> 
> Signed-off-by: Amery Hung <amery.hung@bytedance.com>
> ---
>   include/net/sch_generic.h |  6 ++++++
>   net/sched/bpf_qdisc.c     | 20 ++++----------------
>   net/sched/sch_api.c       | 11 +++++++++++
>   net/sched/sch_generic.c   |  8 ++++++++
>   4 files changed, 29 insertions(+), 16 deletions(-)
> 
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 214ed2e34faa..3041782b7527 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -1359,4 +1359,10 @@ static inline void qdisc_synchronize(const struct Qdisc *q)
>   		msleep(1);
>   }
>   
> +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> +int bpf_qdisc_init_pre_op(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack);
> +void bpf_qdisc_destroy_post_op(struct Qdisc *sch);
> +void bpf_qdisc_reset_post_op(struct Qdisc *sch);
> +#endif
> +
>   #endif
> diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
> index eff7559aa346..903b4eb54510 100644
> --- a/net/sched/bpf_qdisc.c
> +++ b/net/sched/bpf_qdisc.c
> @@ -9,9 +9,6 @@
>   static struct bpf_struct_ops bpf_Qdisc_ops;
>   
>   static u32 unsupported_ops[] = {
> -	offsetof(struct Qdisc_ops, init),
> -	offsetof(struct Qdisc_ops, reset),
> -	offsetof(struct Qdisc_ops, destroy),
>   	offsetof(struct Qdisc_ops, change),
>   	offsetof(struct Qdisc_ops, attach),
>   	offsetof(struct Qdisc_ops, change_real_num_tx),
> @@ -36,8 +33,8 @@ static int bpf_qdisc_init(struct btf *btf)
>   	return 0;
>   }
>   
> -static int bpf_qdisc_init_op(struct Qdisc *sch, struct nlattr *opt,
> -			     struct netlink_ext_ack *extack)
> +int bpf_qdisc_init_pre_op(struct Qdisc *sch, struct nlattr *opt,
> +			  struct netlink_ext_ack *extack)
>   {
>   	struct bpf_sched_data *q = qdisc_priv(sch);
>   
> @@ -45,14 +42,14 @@ static int bpf_qdisc_init_op(struct Qdisc *sch, struct nlattr *opt,
>   	return 0;
>   }
>   
> -static void bpf_qdisc_reset_op(struct Qdisc *sch)
> +void bpf_qdisc_reset_post_op(struct Qdisc *sch)
>   {
>   	struct bpf_sched_data *q = qdisc_priv(sch);
>   
>   	qdisc_watchdog_cancel(&q->watchdog);
>   }
>   
> -static void bpf_qdisc_destroy_op(struct Qdisc *sch)
> +void bpf_qdisc_destroy_post_op(struct Qdisc *sch)

The reset_post_ops and destroy_post_op are identical. They only do 
qdisc_watchdog_cancel().

>   {
>   	struct bpf_sched_data *q = qdisc_priv(sch);
>   
> @@ -235,15 +232,6 @@ static int bpf_qdisc_init_member(const struct btf_type *t,
>   			return -EINVAL;
>   		qdisc_ops->static_flags = TCQ_F_BPF;
>   		return 1;
> -	case offsetof(struct Qdisc_ops, init):
> -		qdisc_ops->init = bpf_qdisc_init_op;
> -		return 1;
> -	case offsetof(struct Qdisc_ops, reset):
> -		qdisc_ops->reset = bpf_qdisc_reset_op;
> -		return 1;
> -	case offsetof(struct Qdisc_ops, destroy):
> -		qdisc_ops->destroy = bpf_qdisc_destroy_op;
> -		return 1;
>   	case offsetof(struct Qdisc_ops, peek):
>   		if (!uqdisc_ops->peek)
>   			qdisc_ops->peek = qdisc_peek_dequeued;
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index 5064b6d2d1ec..9fb9375e2793 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -1352,6 +1352,13 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
>   		rcu_assign_pointer(sch->stab, stab);
>   	}
>   
> +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> +	if (sch->flags & TCQ_F_BPF) {

I can see the reason why this patch is needed. It is a few line changes and they 
are not in the fast path... still weakly not excited about them but I know it 
could be a personal preference.

I think at the very least, instead of adding a new TCQ_F_BPF, let see if the 
"owner == BPF_MODULE_OWNER" test can be reused like how it is done in the 
bpf_try_module_get().


A rough direction I am spinning...

The pre/post is mainly to initialize and cleanup the "struct bpf_sched_data" 
before/after calling the bpf prog.

For the pre (init), there is a ".gen_prologue(...., const struct bpf_prog 
*prog)" in the "bpf_verifier_ops". Take a look at the tc_cls_act_prologue().
It calls a BPF_FUNC_skb_pull_data helper. It potentially can call a kfunc 
bpf_qdisc_watchdog_cancel. However, the gen_prologue is invoked too late in the 
verifier for kfunc calling now. This will need some thoughts and works.

For the post (destroy,reset), there is no "gen_epilogue" now. If 
bpf_qdisc_watchdog_schedule() is not allowed to be called in the ".reset" and 
".destroy" bpf prog. I think it can be changed to pre also? There is a ".filter" 
function in the "struct btf_kfunc_id_set" during the kfunc register.

> +		err = bpf_qdisc_init_pre_op(sch, tca[TCA_OPTIONS], extack);
> +		if (err != 0)
> +			goto err_out4;
> +	}
> +#endif
>   	if (ops->init) {
>   		err = ops->init(sch, tca[TCA_OPTIONS], extack);
>   		if (err != 0)
> @@ -1388,6 +1395,10 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
>   	 */
>   	if (ops->destroy)
>   		ops->destroy(sch);
> +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> +	if (sch->flags & TCQ_F_BPF)
> +		bpf_qdisc_destroy_post_op(sch);
> +#endif
>   	qdisc_put_stab(rtnl_dereference(sch->stab));
>   err_out3:
>   	lockdep_unregister_key(&sch->root_lock_key);
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 76e4a6efd17c..0ac05665c69f 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -1033,6 +1033,10 @@ void qdisc_reset(struct Qdisc *qdisc)
>   
>   	if (ops->reset)
>   		ops->reset(qdisc);
> +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> +	if (qdisc->flags & TCQ_F_BPF)
> +		bpf_qdisc_reset_post_op(qdisc);
> +#endif
>   
>   	__skb_queue_purge(&qdisc->gso_skb);
>   	__skb_queue_purge(&qdisc->skb_bad_txq);
> @@ -1076,6 +1080,10 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
>   
>   	if (ops->destroy)
>   		ops->destroy(qdisc);
> +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> +	if (qdisc->flags & TCQ_F_BPF)
> +		bpf_qdisc_destroy_post_op(qdisc);
> +#endif
>   
>   	lockdep_unregister_key(&qdisc->root_lock_key);
>   	bpf_module_put(ops, ops->owner);
Martin KaFai Lau July 26, 2024, 6:30 p.m. UTC | #2
On 7/25/24 6:15 PM, Martin KaFai Lau wrote:
>> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
>> index 5064b6d2d1ec..9fb9375e2793 100644
>> --- a/net/sched/sch_api.c
>> +++ b/net/sched/sch_api.c
>> @@ -1352,6 +1352,13 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
>>           rcu_assign_pointer(sch->stab, stab);
>>       }
>> +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
>> +    if (sch->flags & TCQ_F_BPF) {
> 
> I can see the reason why this patch is needed. It is a few line changes and they 
> are not in the fast path... still weakly not excited about them but I know it 
> could be a personal preference.
> 
> I think at the very least, instead of adding a new TCQ_F_BPF, let see if the 
> "owner == BPF_MODULE_OWNER" test can be reused like how it is done in the 
> bpf_try_module_get().
> 
> 
> A rough direction I am spinning...
> 
> The pre/post is mainly to initialize and cleanup the "struct bpf_sched_data" 
> before/after calling the bpf prog.
> 
> For the pre (init), there is a ".gen_prologue(...., const struct bpf_prog 
> *prog)" in the "bpf_verifier_ops". Take a look at the tc_cls_act_prologue().
> It calls a BPF_FUNC_skb_pull_data helper. It potentially can call a kfunc 
> bpf_qdisc_watchdog_cancel. However, the gen_prologue is invoked too late in the 

typo. The kfunc should be s/qdisc_watchdog_cancel/qdisc_watchdog_init/ for the pre.

> verifier for kfunc calling now. This will need some thoughts and works.
> 
> For the post (destroy,reset), there is no "gen_epilogue" now. If 
> bpf_qdisc_watchdog_schedule() is not allowed to be called in the ".reset" and 
> ".destroy" bpf prog. I think it can be changed to pre also? There is a ".filter" 
> function in the "struct btf_kfunc_id_set" during the kfunc register.
> 
>> +        err = bpf_qdisc_init_pre_op(sch, tca[TCA_OPTIONS], extack);
>> +        if (err != 0)
>> +            goto err_out4;
>> +    }
Amery Hung July 26, 2024, 10:30 p.m. UTC | #3
On Thu, Jul 25, 2024 at 6:15 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 7/14/24 10:51 AM, Amery Hung wrote:
> > So far, init, reset, and destroy are implemented by bpf qdisc infra as
> > fixed operators that manipulate the watchdog according to the occasion.
> > This patch allows users to implement these three operators to perform
> > desired work alongside the predefined ones.
> >
> > Signed-off-by: Amery Hung <amery.hung@bytedance.com>
> > ---
> >   include/net/sch_generic.h |  6 ++++++
> >   net/sched/bpf_qdisc.c     | 20 ++++----------------
> >   net/sched/sch_api.c       | 11 +++++++++++
> >   net/sched/sch_generic.c   |  8 ++++++++
> >   4 files changed, 29 insertions(+), 16 deletions(-)
> >
> > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > index 214ed2e34faa..3041782b7527 100644
> > --- a/include/net/sch_generic.h
> > +++ b/include/net/sch_generic.h
> > @@ -1359,4 +1359,10 @@ static inline void qdisc_synchronize(const struct Qdisc *q)
> >               msleep(1);
> >   }
> >
> > +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> > +int bpf_qdisc_init_pre_op(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack);
> > +void bpf_qdisc_destroy_post_op(struct Qdisc *sch);
> > +void bpf_qdisc_reset_post_op(struct Qdisc *sch);
> > +#endif
> > +
> >   #endif
> > diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
> > index eff7559aa346..903b4eb54510 100644
> > --- a/net/sched/bpf_qdisc.c
> > +++ b/net/sched/bpf_qdisc.c
> > @@ -9,9 +9,6 @@
> >   static struct bpf_struct_ops bpf_Qdisc_ops;
> >
> >   static u32 unsupported_ops[] = {
> > -     offsetof(struct Qdisc_ops, init),
> > -     offsetof(struct Qdisc_ops, reset),
> > -     offsetof(struct Qdisc_ops, destroy),
> >       offsetof(struct Qdisc_ops, change),
> >       offsetof(struct Qdisc_ops, attach),
> >       offsetof(struct Qdisc_ops, change_real_num_tx),
> > @@ -36,8 +33,8 @@ static int bpf_qdisc_init(struct btf *btf)
> >       return 0;
> >   }
> >
> > -static int bpf_qdisc_init_op(struct Qdisc *sch, struct nlattr *opt,
> > -                          struct netlink_ext_ack *extack)
> > +int bpf_qdisc_init_pre_op(struct Qdisc *sch, struct nlattr *opt,
> > +                       struct netlink_ext_ack *extack)
> >   {
> >       struct bpf_sched_data *q = qdisc_priv(sch);
> >
> > @@ -45,14 +42,14 @@ static int bpf_qdisc_init_op(struct Qdisc *sch, struct nlattr *opt,
> >       return 0;
> >   }
> >
> > -static void bpf_qdisc_reset_op(struct Qdisc *sch)
> > +void bpf_qdisc_reset_post_op(struct Qdisc *sch)
> >   {
> >       struct bpf_sched_data *q = qdisc_priv(sch);
> >
> >       qdisc_watchdog_cancel(&q->watchdog);
> >   }
> >
> > -static void bpf_qdisc_destroy_op(struct Qdisc *sch)
> > +void bpf_qdisc_destroy_post_op(struct Qdisc *sch)
>
> The reset_post_ops and destroy_post_op are identical. They only do
> qdisc_watchdog_cancel().
>
> >   {
> >       struct bpf_sched_data *q = qdisc_priv(sch);
> >
> > @@ -235,15 +232,6 @@ static int bpf_qdisc_init_member(const struct btf_type *t,
> >                       return -EINVAL;
> >               qdisc_ops->static_flags = TCQ_F_BPF;
> >               return 1;
> > -     case offsetof(struct Qdisc_ops, init):
> > -             qdisc_ops->init = bpf_qdisc_init_op;
> > -             return 1;
> > -     case offsetof(struct Qdisc_ops, reset):
> > -             qdisc_ops->reset = bpf_qdisc_reset_op;
> > -             return 1;
> > -     case offsetof(struct Qdisc_ops, destroy):
> > -             qdisc_ops->destroy = bpf_qdisc_destroy_op;
> > -             return 1;
> >       case offsetof(struct Qdisc_ops, peek):
> >               if (!uqdisc_ops->peek)
> >                       qdisc_ops->peek = qdisc_peek_dequeued;
> > diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> > index 5064b6d2d1ec..9fb9375e2793 100644
> > --- a/net/sched/sch_api.c
> > +++ b/net/sched/sch_api.c
> > @@ -1352,6 +1352,13 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
> >               rcu_assign_pointer(sch->stab, stab);
> >       }
> >
> > +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> > +     if (sch->flags & TCQ_F_BPF) {
>
> I can see the reason why this patch is needed. It is a few line changes and they
> are not in the fast path... still weakly not excited about them but I know it
> could be a personal preference.
>
> I think at the very least, instead of adding a new TCQ_F_BPF, let see if the
> "owner == BPF_MODULE_OWNER" test can be reused like how it is done in the
> bpf_try_module_get().
>

Thanks for the suggestion. Will do.

>
> A rough direction I am spinning...
>
> The pre/post is mainly to initialize and cleanup the "struct bpf_sched_data"
> before/after calling the bpf prog.
>
> For the pre (init), there is a ".gen_prologue(...., const struct bpf_prog
> *prog)" in the "bpf_verifier_ops". Take a look at the tc_cls_act_prologue().
> It calls a BPF_FUNC_skb_pull_data helper. It potentially can call a kfunc
> bpf_qdisc_watchdog_cancel. However, the gen_prologue is invoked too late in the
> verifier for kfunc calling now. This will need some thoughts and works.
>
> For the post (destroy,reset), there is no "gen_epilogue" now. If
> bpf_qdisc_watchdog_schedule() is not allowed to be called in the ".reset" and
> ".destroy" bpf prog. I think it can be changed to pre also? There is a ".filter"
> function in the "struct btf_kfunc_id_set" during the kfunc register.
>

I can see how that would work. The ability to add prologue, epilogue
to struct_ops operators is one thing on my wish list.

Meanwhile, I am not sure whether that should be written in the kernel
or rewritten by the verifier. An argument for keeping it in the kernel
is that the prologue or epilogue can get quite complex and involves
many kernel structures not exposed to the bpf program (pre-defined ops
in Qdisc_ops in v8).

Maybe we can keep the current approach in the initial version as they
are not in the fast path, and then move to (gen_prologue,
gen_epilogue) once the plumbing is done?

> > +             err = bpf_qdisc_init_pre_op(sch, tca[TCA_OPTIONS], extack);
> > +             if (err != 0)
> > +                     goto err_out4;
> > +     }
> > +#endif
> >       if (ops->init) {
> >               err = ops->init(sch, tca[TCA_OPTIONS], extack);
> >               if (err != 0)
> > @@ -1388,6 +1395,10 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
> >        */
> >       if (ops->destroy)
> >               ops->destroy(sch);
> > +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> > +     if (sch->flags & TCQ_F_BPF)
> > +             bpf_qdisc_destroy_post_op(sch);
> > +#endif
> >       qdisc_put_stab(rtnl_dereference(sch->stab));
> >   err_out3:
> >       lockdep_unregister_key(&sch->root_lock_key);
> > diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> > index 76e4a6efd17c..0ac05665c69f 100644
> > --- a/net/sched/sch_generic.c
> > +++ b/net/sched/sch_generic.c
> > @@ -1033,6 +1033,10 @@ void qdisc_reset(struct Qdisc *qdisc)
> >
> >       if (ops->reset)
> >               ops->reset(qdisc);
> > +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> > +     if (qdisc->flags & TCQ_F_BPF)
> > +             bpf_qdisc_reset_post_op(qdisc);
> > +#endif
> >
> >       __skb_queue_purge(&qdisc->gso_skb);
> >       __skb_queue_purge(&qdisc->skb_bad_txq);
> > @@ -1076,6 +1080,10 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
> >
> >       if (ops->destroy)
> >               ops->destroy(qdisc);
> > +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
> > +     if (qdisc->flags & TCQ_F_BPF)
> > +             bpf_qdisc_destroy_post_op(qdisc);
> > +#endif
> >
> >       lockdep_unregister_key(&qdisc->root_lock_key);
> >       bpf_module_put(ops, ops->owner);
>
Martin KaFai Lau July 30, 2024, 12:20 a.m. UTC | #4
On 7/26/24 3:30 PM, Amery Hung wrote:
>> The pre/post is mainly to initialize and cleanup the "struct bpf_sched_data"
>> before/after calling the bpf prog.
>>
>> For the pre (init), there is a ".gen_prologue(...., const struct bpf_prog
>> *prog)" in the "bpf_verifier_ops". Take a look at the tc_cls_act_prologue().
>> It calls a BPF_FUNC_skb_pull_data helper. It potentially can call a kfunc
>> bpf_qdisc_watchdog_cancel. However, the gen_prologue is invoked too late in the
>> verifier for kfunc calling now. This will need some thoughts and works.
>>
>> For the post (destroy,reset), there is no "gen_epilogue" now. If
>> bpf_qdisc_watchdog_schedule() is not allowed to be called in the ".reset" and
>> ".destroy" bpf prog. I think it can be changed to pre also? There is a ".filter"
>> function in the "struct btf_kfunc_id_set" during the kfunc register.
>>
> I can see how that would work. The ability to add prologue, epilogue
> to struct_ops operators is one thing on my wish list.
> 
> Meanwhile, I am not sure whether that should be written in the kernel
> or rewritten by the verifier. An argument for keeping it in the kernel
> is that the prologue or epilogue can get quite complex and involves
> many kernel structures not exposed to the bpf program (pre-defined ops
> in Qdisc_ops in v8).

Can the v8 pre-defined ops be called as a kfunc? The qdisc_watchdog_cancel/init 
in v9 could be a kfunc and called by pro/epilogue.

For checking and/or resetting skb->dev, it should be simple enough without 
kfunc. e.g. when reusing the skb->rbnode in the future followup effort.

[ Unrelated to qdisc. bpf_tcp_ca can also use help to ensure the cwnd is sane. ]

> 
> Maybe we can keep the current approach in the initial version as they
> are not in the fast path, and then move to (gen_prologue,
> gen_epilogue) once the plumbing is done?

Sure. It can be improved when things are ready.

I am trying some ideas on how to do gen_epilogue and will share when I get some 
tests working.
diff mbox series

Patch

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 214ed2e34faa..3041782b7527 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1359,4 +1359,10 @@  static inline void qdisc_synchronize(const struct Qdisc *q)
 		msleep(1);
 }
 
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
+int bpf_qdisc_init_pre_op(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack);
+void bpf_qdisc_destroy_post_op(struct Qdisc *sch);
+void bpf_qdisc_reset_post_op(struct Qdisc *sch);
+#endif
+
 #endif
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
index eff7559aa346..903b4eb54510 100644
--- a/net/sched/bpf_qdisc.c
+++ b/net/sched/bpf_qdisc.c
@@ -9,9 +9,6 @@ 
 static struct bpf_struct_ops bpf_Qdisc_ops;
 
 static u32 unsupported_ops[] = {
-	offsetof(struct Qdisc_ops, init),
-	offsetof(struct Qdisc_ops, reset),
-	offsetof(struct Qdisc_ops, destroy),
 	offsetof(struct Qdisc_ops, change),
 	offsetof(struct Qdisc_ops, attach),
 	offsetof(struct Qdisc_ops, change_real_num_tx),
@@ -36,8 +33,8 @@  static int bpf_qdisc_init(struct btf *btf)
 	return 0;
 }
 
-static int bpf_qdisc_init_op(struct Qdisc *sch, struct nlattr *opt,
-			     struct netlink_ext_ack *extack)
+int bpf_qdisc_init_pre_op(struct Qdisc *sch, struct nlattr *opt,
+			  struct netlink_ext_ack *extack)
 {
 	struct bpf_sched_data *q = qdisc_priv(sch);
 
@@ -45,14 +42,14 @@  static int bpf_qdisc_init_op(struct Qdisc *sch, struct nlattr *opt,
 	return 0;
 }
 
-static void bpf_qdisc_reset_op(struct Qdisc *sch)
+void bpf_qdisc_reset_post_op(struct Qdisc *sch)
 {
 	struct bpf_sched_data *q = qdisc_priv(sch);
 
 	qdisc_watchdog_cancel(&q->watchdog);
 }
 
-static void bpf_qdisc_destroy_op(struct Qdisc *sch)
+void bpf_qdisc_destroy_post_op(struct Qdisc *sch)
 {
 	struct bpf_sched_data *q = qdisc_priv(sch);
 
@@ -235,15 +232,6 @@  static int bpf_qdisc_init_member(const struct btf_type *t,
 			return -EINVAL;
 		qdisc_ops->static_flags = TCQ_F_BPF;
 		return 1;
-	case offsetof(struct Qdisc_ops, init):
-		qdisc_ops->init = bpf_qdisc_init_op;
-		return 1;
-	case offsetof(struct Qdisc_ops, reset):
-		qdisc_ops->reset = bpf_qdisc_reset_op;
-		return 1;
-	case offsetof(struct Qdisc_ops, destroy):
-		qdisc_ops->destroy = bpf_qdisc_destroy_op;
-		return 1;
 	case offsetof(struct Qdisc_ops, peek):
 		if (!uqdisc_ops->peek)
 			qdisc_ops->peek = qdisc_peek_dequeued;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 5064b6d2d1ec..9fb9375e2793 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1352,6 +1352,13 @@  static struct Qdisc *qdisc_create(struct net_device *dev,
 		rcu_assign_pointer(sch->stab, stab);
 	}
 
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
+	if (sch->flags & TCQ_F_BPF) {
+		err = bpf_qdisc_init_pre_op(sch, tca[TCA_OPTIONS], extack);
+		if (err != 0)
+			goto err_out4;
+	}
+#endif
 	if (ops->init) {
 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
 		if (err != 0)
@@ -1388,6 +1395,10 @@  static struct Qdisc *qdisc_create(struct net_device *dev,
 	 */
 	if (ops->destroy)
 		ops->destroy(sch);
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
+	if (sch->flags & TCQ_F_BPF)
+		bpf_qdisc_destroy_post_op(sch);
+#endif
 	qdisc_put_stab(rtnl_dereference(sch->stab));
 err_out3:
 	lockdep_unregister_key(&sch->root_lock_key);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 76e4a6efd17c..0ac05665c69f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1033,6 +1033,10 @@  void qdisc_reset(struct Qdisc *qdisc)
 
 	if (ops->reset)
 		ops->reset(qdisc);
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
+	if (qdisc->flags & TCQ_F_BPF)
+		bpf_qdisc_reset_post_op(qdisc);
+#endif
 
 	__skb_queue_purge(&qdisc->gso_skb);
 	__skb_queue_purge(&qdisc->skb_bad_txq);
@@ -1076,6 +1080,10 @@  static void __qdisc_destroy(struct Qdisc *qdisc)
 
 	if (ops->destroy)
 		ops->destroy(qdisc);
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_BPF_JIT)
+	if (qdisc->flags & TCQ_F_BPF)
+		bpf_qdisc_destroy_post_op(qdisc);
+#endif
 
 	lockdep_unregister_key(&qdisc->root_lock_key);
 	bpf_module_put(ops, ops->owner);