
[RFC,v7,1/8] net_sched: Introduce eBPF based Qdisc

Message ID 232881645a5c4c05a35df4ff1f08a19ef9a02662.1705432850.git.amery.hung@bytedance.com (mailing list archive)
State RFC
Delegated to: BPF
Series net_sched: Introduce eBPF based Qdisc

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply, async
bpf/vmtest-bpf-PR fail merge-conflict
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-13 pending Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-42 success Logs for x86_64-llvm-18 / veristat

Commit Message

Amery Hung Jan. 17, 2024, 9:56 p.m. UTC
From: Cong Wang <xiyou.wangcong@gmail.com>

Introduce a new Qdisc which is completely managed by an eBPF program
of type BPF_PROG_TYPE_QDISC. It accepts two eBPF programs of the same
type: one for enqueue and the other for dequeue.

It interacts with the Qdisc layer in two ways:
1) It relies on the Qdisc watchdog to handle throttling;
2) It can pass skb enqueue/dequeue down to child classes.

The context is used differently for enqueue and dequeue, as shown below:

┌──────────┬───────────────┬──────────────────────────────────┐
│ prog     │     input     │              output              │
├──────────┼───────────────┼──────────────────────────────────┤
│          │ ctx->skb      │ SCH_BPF_THROTTLE: ctx->expire    │
│          │               │                   ctx->delta_ns  │
│          │ ctx->classid  │                                  │
│          │               │ SCH_BPF_QUEUED: None             │
│          │               │                                  │
│          │               │ SCH_BPF_BYPASS: None             │
│ enqueue  │               │                                  │
│          │               │ SCH_BPF_STOLEN: None             │
│          │               │                                  │
│          │               │ SCH_BPF_DROP: None               │
│          │               │                                  │
│          │               │ SCH_BPF_CN: None                 │
│          │               │                                  │
│          │               │ SCH_BPF_PASS: ctx->classid       │
├──────────┼───────────────┼──────────────────────────────────┤
│          │ ctx->classid  │ SCH_BPF_THROTTLE: ctx->expire    │
│          │               │                   ctx->delta_ns  │
│          │               │                                  │
│ dequeue  │               │ SCH_BPF_DEQUEUED: None           │
│          │               │                                  │
│          │               │ SCH_BPF_DROP: None               │
│          │               │                                  │
│          │               │ SCH_BPF_PASS: ctx->classid       │
└──────────┴───────────────┴──────────────────────────────────┘
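
For illustration, a minimal enqueue program written against this context
could look like the sketch below. The SEC() name, the includes, and the
classid values are assumptions for the example rather than definitions
made by this patch; the selftests in this series define the real
conventions.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("qdisc/enqueue")
int steer_enqueue(struct bpf_qdisc_ctx *ctx)
{
	struct sk_buff *skb = ctx->skb;

	if (!skb)
		return SCH_BPF_DROP;

	/* Pick a child class based on skb->priority and let the kernel
	 * enqueue the packet into it (the SCH_BPF_PASS path).
	 */
	ctx->classid = skb->priority ? 0x10001 : 0x10002;
	return SCH_BPF_PASS;
}

char _license[] SEC("license") = "GPL";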

Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Co-developed-by: Amery Hung <amery.hung@bytedance.com>
Signed-off-by: Amery Hung <amery.hung@bytedance.com>
---
 include/linux/bpf_types.h      |   4 +
 include/uapi/linux/bpf.h       |  21 ++
 include/uapi/linux/pkt_sched.h |  16 +
 kernel/bpf/btf.c               |   5 +
 kernel/bpf/helpers.c           |   1 +
 kernel/bpf/syscall.c           |   8 +
 net/core/filter.c              |  96 ++++++
 net/sched/Kconfig              |  15 +
 net/sched/Makefile             |   1 +
 net/sched/sch_bpf.c            | 537 +++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  21 ++
 11 files changed, 725 insertions(+)
 create mode 100644 net/sched/sch_bpf.c

Comments

Martin KaFai Lau Jan. 23, 2024, 11:51 p.m. UTC | #1
On 1/17/24 1:56 PM, Amery Hung wrote:
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 0bb92414c036..df280bbb7c0d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -997,6 +997,7 @@ enum bpf_prog_type {
>   	BPF_PROG_TYPE_SK_LOOKUP,
>   	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
>   	BPF_PROG_TYPE_NETFILTER,
> +	BPF_PROG_TYPE_QDISC,
>   };
>   
>   enum bpf_attach_type {
> @@ -1056,6 +1057,8 @@ enum bpf_attach_type {
>   	BPF_CGROUP_UNIX_GETSOCKNAME,
>   	BPF_NETKIT_PRIMARY,
>   	BPF_NETKIT_PEER,
> +	BPF_QDISC_ENQUEUE,
> +	BPF_QDISC_DEQUEUE,
>   	__MAX_BPF_ATTACH_TYPE
>   };
>   
> @@ -7357,4 +7360,22 @@ struct bpf_iter_num {
>   	__u64 __opaque[1];
>   } __attribute__((aligned(8)));
>   
> +struct bpf_qdisc_ctx {
> +	__bpf_md_ptr(struct sk_buff *, skb);
> +	__u32 classid;
> +	__u64 expire;
> +	__u64 delta_ns;
> +};
> +
> +enum {
> +	SCH_BPF_QUEUED,
> +	SCH_BPF_DEQUEUED = SCH_BPF_QUEUED,
> +	SCH_BPF_DROP,
> +	SCH_BPF_CN,
> +	SCH_BPF_THROTTLE,
> +	SCH_BPF_PASS,
> +	SCH_BPF_BYPASS,
> +	SCH_BPF_STOLEN,
> +};
> +
>   #endif /* _UAPI__LINUX_BPF_H__ */

[ ... ]

> +static bool tc_qdisc_is_valid_access(int off, int size,
> +				     enum bpf_access_type type,
> +				     const struct bpf_prog *prog,
> +				     struct bpf_insn_access_aux *info)
> +{
> +	struct btf *btf;
> +
> +	if (off < 0 || off >= sizeof(struct bpf_qdisc_ctx))
> +		return false;
> +
> +	switch (off) {
> +	case offsetof(struct bpf_qdisc_ctx, skb):
> +		if (type == BPF_WRITE ||
> +		    size != sizeof_field(struct bpf_qdisc_ctx, skb))
> +			return false;
> +
> +		if (prog->expected_attach_type != BPF_QDISC_ENQUEUE)
> +			return false;
> +
> +		btf = bpf_get_btf_vmlinux();
> +		if (IS_ERR_OR_NULL(btf))
> +			return false;
> +
> +		info->btf = btf;
> +		info->btf_id = tc_qdisc_ctx_access_btf_ids[0];
> +		info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
> +		return true;
> +	case bpf_ctx_range(struct bpf_qdisc_ctx, classid):
> +		return size == sizeof_field(struct bpf_qdisc_ctx, classid);
> +	case bpf_ctx_range(struct bpf_qdisc_ctx, expire):
> +		return size == sizeof_field(struct bpf_qdisc_ctx, expire);
> +	case bpf_ctx_range(struct bpf_qdisc_ctx, delta_ns):
> +		return size == sizeof_field(struct bpf_qdisc_ctx, delta_ns);
> +	default:
> +		return false;
> +	}
> +
> +	return false;
> +}
> +

[ ... ]

> +static int sch_bpf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> +			   struct sk_buff **to_free)
> +{
> +	struct bpf_sched_data *q = qdisc_priv(sch);
> +	unsigned int len = qdisc_pkt_len(skb);
> +	struct bpf_qdisc_ctx ctx = {};
> +	int res = NET_XMIT_SUCCESS;
> +	struct sch_bpf_class *cl;
> +	struct bpf_prog *enqueue;
> +
> +	enqueue = rcu_dereference(q->enqueue_prog.prog);
> +	if (!enqueue)
> +		return NET_XMIT_DROP;
> +
> +	ctx.skb = skb;
> +	ctx.classid = sch->handle;
> +	res = bpf_prog_run(enqueue, &ctx);
> +	switch (res) {
> +	case SCH_BPF_THROTTLE:
> +		qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> +		qdisc_qstats_overlimit(sch);
> +		fallthrough;
> +	case SCH_BPF_QUEUED:
> +		qdisc_qstats_backlog_inc(sch, skb);
> +		return NET_XMIT_SUCCESS;
> +	case SCH_BPF_BYPASS:
> +		qdisc_qstats_drop(sch);
> +		__qdisc_drop(skb, to_free);
> +		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> +	case SCH_BPF_STOLEN:
> +		__qdisc_drop(skb, to_free);
> +		return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
> +	case SCH_BPF_CN:
> +		return NET_XMIT_CN;
> +	case SCH_BPF_PASS:
> +		break;
> +	default:
> +		return qdisc_drop(skb, sch, to_free);
> +	}
> +
> +	cl = sch_bpf_find(sch, ctx.classid);
> +	if (!cl || !cl->qdisc)
> +		return qdisc_drop(skb, sch, to_free);
> +
> +	res = qdisc_enqueue(skb, cl->qdisc, to_free);
> +	if (res != NET_XMIT_SUCCESS) {
> +		if (net_xmit_drop_count(res)) {
> +			qdisc_qstats_drop(sch);
> +			cl->drops++;
> +		}
> +		return res;
> +	}
> +
> +	sch->qstats.backlog += len;
> +	sch->q.qlen++;
> +	return res;
> +}
> +
> +DEFINE_PER_CPU(struct sk_buff*, bpf_skb_dequeue);
> +
> +static struct sk_buff *sch_bpf_dequeue(struct Qdisc *sch)
> +{
> +	struct bpf_sched_data *q = qdisc_priv(sch);
> +	struct bpf_qdisc_ctx ctx = {};
> +	struct sk_buff *skb = NULL;
> +	struct bpf_prog *dequeue;
> +	struct sch_bpf_class *cl;
> +	int res;
> +
> +	dequeue = rcu_dereference(q->dequeue_prog.prog);
> +	if (!dequeue)
> +		return NULL;
> +
> +	__this_cpu_write(bpf_skb_dequeue, NULL);
> +	ctx.classid = sch->handle;
> +	res = bpf_prog_run(dequeue, &ctx);
> +	switch (res) {
> +	case SCH_BPF_DEQUEUED:
> +		skb = __this_cpu_read(bpf_skb_dequeue);
> +		qdisc_bstats_update(sch, skb);
> +		qdisc_qstats_backlog_dec(sch, skb);
> +		break;
> +	case SCH_BPF_THROTTLE:
> +		qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> +		qdisc_qstats_overlimit(sch);
> +		cl = sch_bpf_find(sch, ctx.classid);
> +		if (cl)
> +			cl->overlimits++;
> +		return NULL;
> +	case SCH_BPF_PASS:
> +		cl = sch_bpf_find(sch, ctx.classid);
> +		if (!cl || !cl->qdisc)
> +			return NULL;
> +		skb = qdisc_dequeue_peeked(cl->qdisc);
> +		if (skb) {
> +			bstats_update(&cl->bstats, skb);
> +			qdisc_bstats_update(sch, skb);
> +			qdisc_qstats_backlog_dec(sch, skb);
> +			sch->q.qlen--;
> +		}
> +		break;
> +	}
> +
> +	return skb;
> +}

[ ... ]

> +static int sch_bpf_init(struct Qdisc *sch, struct nlattr *opt,
> +			struct netlink_ext_ack *extack)
> +{
> +	struct bpf_sched_data *q = qdisc_priv(sch);
> +	int err;
> +
> +	qdisc_watchdog_init(&q->watchdog, sch);
> +	if (opt) {
> +		err = sch_bpf_change(sch, opt, extack);
> +		if (err)
> +			return err;
> +	}
> +
> +	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
> +	if (err)
> +		return err;
> +
> +	return qdisc_class_hash_init(&q->clhash);
> +}
> +
> +static void sch_bpf_reset(struct Qdisc *sch)
> +{
> +	struct bpf_sched_data *q = qdisc_priv(sch);
> +	struct sch_bpf_class *cl;
> +	unsigned int i;
> +
> +	for (i = 0; i < q->clhash.hashsize; i++) {
> +		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
> +			if (cl->qdisc)
> +				qdisc_reset(cl->qdisc);
> +		}
> +	}
> +
> +	qdisc_watchdog_cancel(&q->watchdog);
> +}
> +

[ ... ]

> +static const struct Qdisc_class_ops sch_bpf_class_ops = {
> +	.graft		=	sch_bpf_graft,
> +	.leaf		=	sch_bpf_leaf,
> +	.find		=	sch_bpf_search,
> +	.change		=	sch_bpf_change_class,
> +	.delete		=	sch_bpf_delete,
> +	.tcf_block	=	sch_bpf_tcf_block,
> +	.bind_tcf	=	sch_bpf_bind,
> +	.unbind_tcf	=	sch_bpf_unbind,
> +	.dump		=	sch_bpf_dump_class,
> +	.dump_stats	=	sch_bpf_dump_class_stats,
> +	.walk		=	sch_bpf_walk,
> +};
> +
> +static struct Qdisc_ops sch_bpf_qdisc_ops __read_mostly = {
> +	.cl_ops		=	&sch_bpf_class_ops,
> +	.id		=	"bpf",
> +	.priv_size	=	sizeof(struct bpf_sched_data),
> +	.enqueue	=	sch_bpf_enqueue,
> +	.dequeue	=	sch_bpf_dequeue,

I looked at the patchset at a high level. The major ops that it wants to make 
programmable in bpf are ".enqueue" and ".dequeue" (plus ".init" and ".reset" in 
patch 4 and patch 5).

This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach types (one each 
for ".enqueue", ".dequeue", ".init", and ".reset"), and a new "bpf_qdisc_ctx" in 
the uapi. It is no longer an acceptable way to add a new bpf extension.

Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely implemented 
in bpf (with the help of new kfuncs if needed)? Then a struct_ops for Qdisc_ops 
can be created. The bpf Qdisc_ops can be loaded through the existing struct_ops api.

If other ops (like ".dump", ".dump_stats"...) do not have good use case to be 
programmable in bpf, it can stay with the kernel implementation for now and only 
allows the userspace to load the a bpf Qdisc_ops with .equeue/dequeue/init/reset 
implemented.

You mentioned in the cover letter that:
"Current struct_ops attachment model does not seem to support replacing only 
functions of a specific instance of a module, but I might be wrong."

I assumed you meant allowing bpf to replace only "some" ops of the Qdisc_ops? Yes, 
it can be done through struct_ops's ".init_member". Take a look at 
bpf_tcp_ca_init_member. The kernel can assign the kernel implementation for 
".dump" (for example) when loading the bpf Qdisc_ops, as sketched below.

> +	.peek		=	qdisc_peek_dequeued,
> +	.init		=	sch_bpf_init,
> +	.reset		=	sch_bpf_reset,
> +	.destroy	=	sch_bpf_destroy,
> +	.change		=	sch_bpf_change,
> +	.dump		=	sch_bpf_dump,
> +	.dump_stats	=	sch_bpf_dump_stats,
> +	.owner		=	THIS_MODULE,
> +};
> +
> +static int __init sch_bpf_mod_init(void)
> +{
> +	return register_qdisc(&sch_bpf_qdisc_ops);
> +}
> +
> +static void __exit sch_bpf_mod_exit(void)
> +{
> +	unregister_qdisc(&sch_bpf_qdisc_ops);
> +}
Amery Hung Jan. 24, 2024, 5:22 a.m. UTC | #2
On Tue, Jan 23, 2024 at 3:51 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 1/17/24 1:56 PM, Amery Hung wrote:
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 0bb92414c036..df280bbb7c0d 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -997,6 +997,7 @@ enum bpf_prog_type {
> >       BPF_PROG_TYPE_SK_LOOKUP,
> >       BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> >       BPF_PROG_TYPE_NETFILTER,
> > +     BPF_PROG_TYPE_QDISC,
> >   };
> >
> >   enum bpf_attach_type {
> > @@ -1056,6 +1057,8 @@ enum bpf_attach_type {
> >       BPF_CGROUP_UNIX_GETSOCKNAME,
> >       BPF_NETKIT_PRIMARY,
> >       BPF_NETKIT_PEER,
> > +     BPF_QDISC_ENQUEUE,
> > +     BPF_QDISC_DEQUEUE,
> >       __MAX_BPF_ATTACH_TYPE
> >   };
> >
> > @@ -7357,4 +7360,22 @@ struct bpf_iter_num {
> >       __u64 __opaque[1];
> >   } __attribute__((aligned(8)));
> >
> > +struct bpf_qdisc_ctx {
> > +     __bpf_md_ptr(struct sk_buff *, skb);
> > +     __u32 classid;
> > +     __u64 expire;
> > +     __u64 delta_ns;
> > +};
> > +
> > +enum {
> > +     SCH_BPF_QUEUED,
> > +     SCH_BPF_DEQUEUED = SCH_BPF_QUEUED,
> > +     SCH_BPF_DROP,
> > +     SCH_BPF_CN,
> > +     SCH_BPF_THROTTLE,
> > +     SCH_BPF_PASS,
> > +     SCH_BPF_BYPASS,
> > +     SCH_BPF_STOLEN,
> > +};
> > +
> >   #endif /* _UAPI__LINUX_BPF_H__ */
>
> [ ... ]
>
> > +static bool tc_qdisc_is_valid_access(int off, int size,
> > +                                  enum bpf_access_type type,
> > +                                  const struct bpf_prog *prog,
> > +                                  struct bpf_insn_access_aux *info)
> > +{
> > +     struct btf *btf;
> > +
> > +     if (off < 0 || off >= sizeof(struct bpf_qdisc_ctx))
> > +             return false;
> > +
> > +     switch (off) {
> > +     case offsetof(struct bpf_qdisc_ctx, skb):
> > +             if (type == BPF_WRITE ||
> > +                 size != sizeof_field(struct bpf_qdisc_ctx, skb))
> > +                     return false;
> > +
> > +             if (prog->expected_attach_type != BPF_QDISC_ENQUEUE)
> > +                     return false;
> > +
> > +             btf = bpf_get_btf_vmlinux();
> > +             if (IS_ERR_OR_NULL(btf))
> > +                     return false;
> > +
> > +             info->btf = btf;
> > +             info->btf_id = tc_qdisc_ctx_access_btf_ids[0];
> > +             info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
> > +             return true;
> > +     case bpf_ctx_range(struct bpf_qdisc_ctx, classid):
> > +             return size == sizeof_field(struct bpf_qdisc_ctx, classid);
> > +     case bpf_ctx_range(struct bpf_qdisc_ctx, expire):
> > +             return size == sizeof_field(struct bpf_qdisc_ctx, expire);
> > +     case bpf_ctx_range(struct bpf_qdisc_ctx, delta_ns):
> > +             return size == sizeof_field(struct bpf_qdisc_ctx, delta_ns);
> > +     default:
> > +             return false;
> > +     }
> > +
> > +     return false;
> > +}
> > +
>
> [ ... ]
>
> > +static int sch_bpf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> > +                        struct sk_buff **to_free)
> > +{
> > +     struct bpf_sched_data *q = qdisc_priv(sch);
> > +     unsigned int len = qdisc_pkt_len(skb);
> > +     struct bpf_qdisc_ctx ctx = {};
> > +     int res = NET_XMIT_SUCCESS;
> > +     struct sch_bpf_class *cl;
> > +     struct bpf_prog *enqueue;
> > +
> > +     enqueue = rcu_dereference(q->enqueue_prog.prog);
> > +     if (!enqueue)
> > +             return NET_XMIT_DROP;
> > +
> > +     ctx.skb = skb;
> > +     ctx.classid = sch->handle;
> > +     res = bpf_prog_run(enqueue, &ctx);
> > +     switch (res) {
> > +     case SCH_BPF_THROTTLE:
> > +             qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> > +             qdisc_qstats_overlimit(sch);
> > +             fallthrough;
> > +     case SCH_BPF_QUEUED:
> > +             qdisc_qstats_backlog_inc(sch, skb);
> > +             return NET_XMIT_SUCCESS;
> > +     case SCH_BPF_BYPASS:
> > +             qdisc_qstats_drop(sch);
> > +             __qdisc_drop(skb, to_free);
> > +             return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> > +     case SCH_BPF_STOLEN:
> > +             __qdisc_drop(skb, to_free);
> > +             return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
> > +     case SCH_BPF_CN:
> > +             return NET_XMIT_CN;
> > +     case SCH_BPF_PASS:
> > +             break;
> > +     default:
> > +             return qdisc_drop(skb, sch, to_free);
> > +     }
> > +
> > +     cl = sch_bpf_find(sch, ctx.classid);
> > +     if (!cl || !cl->qdisc)
> > +             return qdisc_drop(skb, sch, to_free);
> > +
> > +     res = qdisc_enqueue(skb, cl->qdisc, to_free);
> > +     if (res != NET_XMIT_SUCCESS) {
> > +             if (net_xmit_drop_count(res)) {
> > +                     qdisc_qstats_drop(sch);
> > +                     cl->drops++;
> > +             }
> > +             return res;
> > +     }
> > +
> > +     sch->qstats.backlog += len;
> > +     sch->q.qlen++;
> > +     return res;
> > +}
> > +
> > +DEFINE_PER_CPU(struct sk_buff*, bpf_skb_dequeue);
> > +
> > +static struct sk_buff *sch_bpf_dequeue(struct Qdisc *sch)
> > +{
> > +     struct bpf_sched_data *q = qdisc_priv(sch);
> > +     struct bpf_qdisc_ctx ctx = {};
> > +     struct sk_buff *skb = NULL;
> > +     struct bpf_prog *dequeue;
> > +     struct sch_bpf_class *cl;
> > +     int res;
> > +
> > +     dequeue = rcu_dereference(q->dequeue_prog.prog);
> > +     if (!dequeue)
> > +             return NULL;
> > +
> > +     __this_cpu_write(bpf_skb_dequeue, NULL);
> > +     ctx.classid = sch->handle;
> > +     res = bpf_prog_run(dequeue, &ctx);
> > +     switch (res) {
> > +     case SCH_BPF_DEQUEUED:
> > +             skb = __this_cpu_read(bpf_skb_dequeue);
> > +             qdisc_bstats_update(sch, skb);
> > +             qdisc_qstats_backlog_dec(sch, skb);
> > +             break;
> > +     case SCH_BPF_THROTTLE:
> > +             qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> > +             qdisc_qstats_overlimit(sch);
> > +             cl = sch_bpf_find(sch, ctx.classid);
> > +             if (cl)
> > +                     cl->overlimits++;
> > +             return NULL;
> > +     case SCH_BPF_PASS:
> > +             cl = sch_bpf_find(sch, ctx.classid);
> > +             if (!cl || !cl->qdisc)
> > +                     return NULL;
> > +             skb = qdisc_dequeue_peeked(cl->qdisc);
> > +             if (skb) {
> > +                     bstats_update(&cl->bstats, skb);
> > +                     qdisc_bstats_update(sch, skb);
> > +                     qdisc_qstats_backlog_dec(sch, skb);
> > +                     sch->q.qlen--;
> > +             }
> > +             break;
> > +     }
> > +
> > +     return skb;
> > +}
>
> [ ... ]
>
> > +static int sch_bpf_init(struct Qdisc *sch, struct nlattr *opt,
> > +                     struct netlink_ext_ack *extack)
> > +{
> > +     struct bpf_sched_data *q = qdisc_priv(sch);
> > +     int err;
> > +
> > +     qdisc_watchdog_init(&q->watchdog, sch);
> > +     if (opt) {
> > +             err = sch_bpf_change(sch, opt, extack);
> > +             if (err)
> > +                     return err;
> > +     }
> > +
> > +     err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
> > +     if (err)
> > +             return err;
> > +
> > +     return qdisc_class_hash_init(&q->clhash);
> > +}
> > +
> > +static void sch_bpf_reset(struct Qdisc *sch)
> > +{
> > +     struct bpf_sched_data *q = qdisc_priv(sch);
> > +     struct sch_bpf_class *cl;
> > +     unsigned int i;
> > +
> > +     for (i = 0; i < q->clhash.hashsize; i++) {
> > +             hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
> > +                     if (cl->qdisc)
> > +                             qdisc_reset(cl->qdisc);
> > +             }
> > +     }
> > +
> > +     qdisc_watchdog_cancel(&q->watchdog);
> > +}
> > +
>
> [ ... ]
>
> > +static const struct Qdisc_class_ops sch_bpf_class_ops = {
> > +     .graft          =       sch_bpf_graft,
> > +     .leaf           =       sch_bpf_leaf,
> > +     .find           =       sch_bpf_search,
> > +     .change         =       sch_bpf_change_class,
> > +     .delete         =       sch_bpf_delete,
> > +     .tcf_block      =       sch_bpf_tcf_block,
> > +     .bind_tcf       =       sch_bpf_bind,
> > +     .unbind_tcf     =       sch_bpf_unbind,
> > +     .dump           =       sch_bpf_dump_class,
> > +     .dump_stats     =       sch_bpf_dump_class_stats,
> > +     .walk           =       sch_bpf_walk,
> > +};
> > +
> > +static struct Qdisc_ops sch_bpf_qdisc_ops __read_mostly = {
> > +     .cl_ops         =       &sch_bpf_class_ops,
> > +     .id             =       "bpf",
> > +     .priv_size      =       sizeof(struct bpf_sched_data),
> > +     .enqueue        =       sch_bpf_enqueue,
> > +     .dequeue        =       sch_bpf_dequeue,
>
> I looked at the high level of the patchset. The major ops that it wants to be
> programmable in bpf is the ".enqueue" and ".dequeue" (+ ".init" and ".reset" in
> patch 4 and patch 5).
>
> This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach types (each for
> ".enqueue", ".dequeue", ".init", and ".reset"), and a new "bpf_qdisc_ctx" in the
> uapi. It is no long an acceptable way to add new bpf extension.
>
> Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely implemented
> in bpf (with the help of new kfuncs if needed)? Then a struct_ops for Qdisc_ops
> can be created. The bpf Qdisc_ops can be loaded through the existing struct_ops api.
>

Partially. If using struct_ops, I think we'll need another structure
like the following in the bpf qdisc, to be implemented with struct_ops bpf:

struct bpf_qdisc_ops {
    int (*enqueue)(struct sk_buff *skb);
    void (*dequeue)(void);
    void (*init)(void);
    void (*reset)(void);
};

Then, Qdisc_ops will wrap around them to handle things that cannot be
implemented with bpf (e.g., sch_tree_lock, returning a skb ptr).
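
A minimal sketch of that wrapping, assuming a hypothetical bpf_ops
pointer stashed in the qdisc's private data (all names here are
illustrative only):

static int sch_bpf_enqueue_wrapper(struct sk_buff *skb, struct Qdisc *sch,
				   struct sk_buff **to_free)
{
	struct bpf_sched_data *q = qdisc_priv(sch);
	int verdict;

	/* Delegate only the decision to the struct_ops callback; stats,
	 * child handling, etc. stay in the kernel wrapper.
	 */
	verdict = q->bpf_ops->enqueue(skb);
	switch (verdict) {
	case SCH_BPF_QUEUED:
		qdisc_qstats_backlog_inc(sch, skb);
		return NET_XMIT_SUCCESS;
	default:
		return qdisc_drop(skb, sch, to_free);
	}
}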

> If other ops (like ".dump", ".dump_stats"...) do not have good use case to be
> programmable in bpf, it can stay with the kernel implementation for now and only
> allows the userspace to load the a bpf Qdisc_ops with .equeue/dequeue/init/reset
> implemented.
>
> You mentioned in the cover letter that:
> "Current struct_ops attachment model does not seem to support replacing only
> functions of a specific instance of a module, but I might be wrong."
>
> I assumed you meant allow bpf to replace only "some" ops of the Qdisc_ops? Yes,
> it can be done through the struct_ops's ".init_member". Take a look at
> bpf_tcp_ca_init_member. The kernel can assign the kernel implementation for
> ".dump" (for example) when loading the bpf Qdisc_ops.
>

I have no problem with partially replacing a struct, which, like you
mentioned, has been demonstrated by congestion control and sched_ext.
What I am not sure about is the ability to create multiple bpf qdiscs,
where each has a different ".enqueue", ".dequeue", and so on. I like the
struct_ops approach and would love to try it if struct_ops supports
this.

Thanks,
Amery



> > +     .peek           =       qdisc_peek_dequeued,
> > +     .init           =       sch_bpf_init,
> > +     .reset          =       sch_bpf_reset,
> > +     .destroy        =       sch_bpf_destroy,
> > +     .change         =       sch_bpf_change,
> > +     .dump           =       sch_bpf_dump,
> > +     .dump_stats     =       sch_bpf_dump_stats,
> > +     .owner          =       THIS_MODULE,
> > +};
> > +
> > +static int __init sch_bpf_mod_init(void)
> > +{
> > +     return register_qdisc(&sch_bpf_qdisc_ops);
> > +}
> > +
> > +static void __exit sch_bpf_mod_exit(void)
> > +{
> > +     unregister_qdisc(&sch_bpf_qdisc_ops);
> > +}
>
Martin KaFai Lau Jan. 26, 2024, 2:22 a.m. UTC | #3
On 1/23/24 9:22 PM, Amery Hung wrote:
>> I looked at the high level of the patchset. The major ops that it wants to be
>> programmable in bpf is the ".enqueue" and ".dequeue" (+ ".init" and ".reset" in
>> patch 4 and patch 5).
>>
>> This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach types (each for
>> ".enqueue", ".dequeue", ".init", and ".reset"), and a new "bpf_qdisc_ctx" in the
>> uapi. It is no long an acceptable way to add new bpf extension.
>>
>> Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely implemented
>> in bpf (with the help of new kfuncs if needed)? Then a struct_ops for Qdisc_ops
>> can be created. The bpf Qdisc_ops can be loaded through the existing struct_ops api.
>>
> Partially. If using struct_ops, I think we'll need another structure
> like the following in bpf qdisc to be implemented with struct_ops bpf:
> 
> struct bpf_qdisc_ops {
>      int (*enqueue) (struct sk_buff *skb)
>      void (*dequeue) (void)
>      void (*init) (void)
>      void (*reset) (void)
> };
> 
> Then, Qdisc_ops will wrap around them to handle things that cannot be
> implemented with bpf (e.g., sch_tree_lock, returning a skb ptr).

We can see how those limitations (calling sch_tree_lock() and returning a ptr) 
can be addressed in bpf. This will also help other similar use cases.

Other than sch_tree_lock and returning a ptr from a bpf prog, what else do you 
see that blocks directly implementing the enqueue/dequeue/init/reset in the 
struct Qdisc_ops?

Have you thought above ".priv_size"? It is now fixed to sizeof(struct 
bpf_sched_data). It should be useful to allow the bpf prog to store its own data 
there?

> 
>> If other ops (like ".dump", ".dump_stats"...) do not have good use case to be
>> programmable in bpf, it can stay with the kernel implementation for now and only
>> allows the userspace to load the a bpf Qdisc_ops with .equeue/dequeue/init/reset
>> implemented.
>>
>> You mentioned in the cover letter that:
>> "Current struct_ops attachment model does not seem to support replacing only
>> functions of a specific instance of a module, but I might be wrong."
>>
>> I assumed you meant allow bpf to replace only "some" ops of the Qdisc_ops? Yes,
>> it can be done through the struct_ops's ".init_member". Take a look at
>> bpf_tcp_ca_init_member. The kernel can assign the kernel implementation for
>> ".dump" (for example) when loading the bpf Qdisc_ops.
>>
> I have no problem with partially replacing a struct, which like you
> mentioned has been demonstrated by congestion control or sched_ext.
> What I am not sure about is the ability to create multiple bpf qdiscs,
> where each has different ".enqueue", ".dequeue", and so on. I like the
> struct_ops approach and would love to try it if struct_ops support
> this.

The need to allow different ".enqueue/.dequeue/..." bpf (BPF_PROG_TYPE_QDISC) 
programs to be loaded into different qdisc instances exists because there is 
only one native ".id == bpf" Qdisc_ops kernel implementation, which in turn is 
because of the limitation you mentioned above?

Am I understanding correctly why you need to load a different bpf prog for 
different qdisc instances?

If the ".enqueue/.dequeue/..." in the "struct Qdisc_ops" can be directly 
implemented in bpf prog itself, it can just load another bpf struct_ops which 
has a different ".enqueue/.dequeue/..." implementation:

#> bpftool struct_ops register bpf_simple_fq_v1.bpf.o
#> bpftool struct_ops register bpf_simple_fq_v2.bpf.o
#> bpftool struct_ops register bpf_simple_fq_xyz.bpf.o

From reading the test bpf prog, I think the set is on a good footing. Instead 
of working around the limitations by wrapping the bpf prog in a predefined 
"struct Qdisc_ops sch_bpf_qdisc_ops", let's first understand what is missing in 
bpf and see how we could address it.
Amery Hung Jan. 27, 2024, 1:17 a.m. UTC | #4
On Thu, Jan 25, 2024 at 6:22 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 1/23/24 9:22 PM, Amery Hung wrote:
> >> I looked at the high level of the patchset. The major ops that it wants to be
> >> programmable in bpf is the ".enqueue" and ".dequeue" (+ ".init" and ".reset" in
> >> patch 4 and patch 5).
> >>
> >> This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach types (each for
> >> ".enqueue", ".dequeue", ".init", and ".reset"), and a new "bpf_qdisc_ctx" in the
> >> uapi. It is no long an acceptable way to add new bpf extension.
> >>
> >> Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely implemented
> >> in bpf (with the help of new kfuncs if needed)? Then a struct_ops for Qdisc_ops
> >> can be created. The bpf Qdisc_ops can be loaded through the existing struct_ops api.
> >>
> > Partially. If using struct_ops, I think we'll need another structure
> > like the following in bpf qdisc to be implemented with struct_ops bpf:
> >
> > struct bpf_qdisc_ops {
> >      int (*enqueue) (struct sk_buff *skb)
> >      void (*dequeue) (void)
> >      void (*init) (void)
> >      void (*reset) (void)
> > };
> >
> > Then, Qdisc_ops will wrap around them to handle things that cannot be
> > implemented with bpf (e.g., sch_tree_lock, returning a skb ptr).
>
> We can see how those limitations (calling sch_tree_lock() and returning a ptr)
> can be addressed in bpf. This will also help other similar use cases.
>

For kptr, I wonder if we can support the following semantics in bpf if
they make sense:
1. Passing a referenced kptr into a bpf program, which will also need
to be released, or exchanged into maps or allocated objects.
2. Returning a kptr from a program and treating it as releasing the reference.

> Other than sch_tree_lock and returning a ptr from a bpf prog. What else do you
> see that blocks directly implementing the enqueue/dequeue/init/reset in the
> struct Qdisc_ops?
>

Not much! We can deal with sch_tree_lock later since
enqueue/dequeue/init/reset are unlikely to use it.

> Have you thought above ".priv_size"? It is now fixed to sizeof(struct
> bpf_sched_data). It should be useful to allow the bpf prog to store its own data
> there?
>

Maybe we can let bpf qdiscs store statistics here and make it work
with netlink. I haven't explored much of how bpf qdiscs would record and
share statistics with user space.

> >
> >> If other ops (like ".dump", ".dump_stats"...) do not have good use case to be
> >> programmable in bpf, it can stay with the kernel implementation for now and only
> >> allows the userspace to load the a bpf Qdisc_ops with .equeue/dequeue/init/reset
> >> implemented.
> >>
> >> You mentioned in the cover letter that:
> >> "Current struct_ops attachment model does not seem to support replacing only
> >> functions of a specific instance of a module, but I might be wrong."
> >>
> >> I assumed you meant allow bpf to replace only "some" ops of the Qdisc_ops? Yes,
> >> it can be done through the struct_ops's ".init_member". Take a look at
> >> bpf_tcp_ca_init_member. The kernel can assign the kernel implementation for
> >> ".dump" (for example) when loading the bpf Qdisc_ops.
> >>
> > I have no problem with partially replacing a struct, which like you
> > mentioned has been demonstrated by congestion control or sched_ext.
> > What I am not sure about is the ability to create multiple bpf qdiscs,
> > where each has different ".enqueue", ".dequeue", and so on. I like the
> > struct_ops approach and would love to try it if struct_ops support
> > this.
>
> The need for allowing different ".enqueue/.dequeue/..." bpf
> (BPF_PROG_TYPE_QDISC) programs loaded into different qdisc instances is because
> there is only one ".id == bpf" Qdisc_ops native kernel implementation which is
> then because of the limitation you mentioned above?
>
> Am I understanding your reason correctly on why it requires to load different
> bpf prog for different qdisc instances?
>
> If the ".enqueue/.dequeue/..." in the "struct Qdisc_ops" can be directly
> implemented in bpf prog itself, it can just load another bpf struct_ops which
> has a different ".enqueue/.dequeue/..." implementation:
>
> #> bpftool struct_ops register bpf_simple_fq_v1.bpf.o
> #> bpftool struct_ops register bpf_simple_fq_v2.bpf.o
> #> bpftool struct_ops register bpf_simple_fq_xyz.bpf.o
>
>  From reading the test bpf prog, I think the set is on a good footing. Instead
> of working around the limitation by wrapping the bpf prog in a predefined
> "struct Qdisc_ops sch_bpf_qdisc_ops", lets first understand what is missing in
> bpf and see how we could address them.
>

Thank you so much for the clarification. I had the wrong impression
since I was thinking about using a structure in the bpf qdisc for
struct_ops. It makes sense to try making "struct Qdisc_ops" work with
struct_ops. I will send the next patch set with struct_ops.

Thanks,
Amery

>
Martin KaFai Lau Jan. 30, 2024, 6:39 a.m. UTC | #5
On 1/26/24 5:17 PM, Amery Hung wrote:
> On Thu, Jan 25, 2024 at 6:22 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 1/23/24 9:22 PM, Amery Hung wrote:
>>>> I looked at the high level of the patchset. The major ops that it wants to be
>>>> programmable in bpf is the ".enqueue" and ".dequeue" (+ ".init" and ".reset" in
>>>> patch 4 and patch 5).
>>>>
>>>> This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach types (each for
>>>> ".enqueue", ".dequeue", ".init", and ".reset"), and a new "bpf_qdisc_ctx" in the
>>>> uapi. It is no long an acceptable way to add new bpf extension.
>>>>
>>>> Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely implemented
>>>> in bpf (with the help of new kfuncs if needed)? Then a struct_ops for Qdisc_ops
>>>> can be created. The bpf Qdisc_ops can be loaded through the existing struct_ops api.
>>>>
>>> Partially. If using struct_ops, I think we'll need another structure
>>> like the following in bpf qdisc to be implemented with struct_ops bpf:
>>>
>>> struct bpf_qdisc_ops {
>>>       int (*enqueue) (struct sk_buff *skb)
>>>       void (*dequeue) (void)
>>>       void (*init) (void)
>>>       void (*reset) (void)
>>> };
>>>
>>> Then, Qdisc_ops will wrap around them to handle things that cannot be
>>> implemented with bpf (e.g., sch_tree_lock, returning a skb ptr).
>>
>> We can see how those limitations (calling sch_tree_lock() and returning a ptr)
>> can be addressed in bpf. This will also help other similar use cases.
>>
> 
> For kptr, I wonder if we can support the following semantics in bpf if
> they make sense:

I think they are useful but they are not fully supported now.

Some thoughts below.

> 1. Passing a referenced kptr into a bpf program, which will also need
> to be released, or exchanged into maps or allocated objects.

"enqueue" should be the one considering here:

struct Qdisc_ops {
	/* ... */
	int                     (*enqueue)(struct sk_buff *skb,
					   struct Qdisc *sch,
					   struct sk_buff **to_free);

};

The verifier only marks the skb as a trusted kptr but does not mark its 
reg->ref_obj_id. Take a look at btf_ctx_access(). In particular:

	if (prog_args_trusted(prog))
		info->reg_type |= PTR_TRUSTED;

The verifier does not know the skb ownership is passed into the ".enqueue" ops 
and does not know the bpf prog needs to release it or store it in a map.

The verifier tracks the reference state when a KF_ACQUIRE kfunc is called (just 
an example, not saying we need to use KF_ACQUIRE kfunc). Take a look at 
acquire_reference_state() which is the useful one here.

Whenever the verifier is loading the ".enqueue" bpf_prog, the verifier can 
always acquire_reference_state() for the "struct sk_buff *skb" argument.

Take a look at a recent RFC: 
https://lore.kernel.org/bpf/20240122212217.1391878-1-thinker.li@gmail.com/
which is tagging the argument of an ops (e.g. ".enqueue" here). That RFC patch 
is tagging the argument could be NULL by appending "__nullable" to the argument 
name. The verifier will enforce that the bpf prog must check for NULL first.

A similar idea can be used here but with a different tag (for example, 
"__must_release", admittedly not a good name). While the RFC patch is 
in progress, for now we could hardcode the ".enqueue" ops in 
check_struct_ops_btf_id() and always acquire_reference_state() for the skb. This 
part can be adjusted later once the RFC patch is in shape.
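
For example, the struct_ops stub function (the one the verifier inspects
for argument names, as in the __nullable RFC) could spell out which
argument carries a reference the prog must release. "__must_release" is
only the placeholder name used in this thread, not an existing
annotation:

static int bpf_qdisc_enqueue_stub(struct sk_buff *skb__must_release,
				  struct Qdisc *sch,
				  struct sk_buff **to_free)
{
	return 0;
}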


Then one more thing is to track when the struct_ops bpf prog is actually reading 
the value of the skb pointer. One thing worth mentioning here, e.g. a 
struct_ops prog for enqueue:

SEC("struct_ops")
int BPF_PROG(bpf_dropall_enqueue, struct sk_buff *skb, struct Qdisc *sch,
	     struct sk_buff **to_free)
{
	return bpf_qdisc_drop(skb, sch, to_free);
}

Take a look at the BPF_PROG macro: the bpf prog gets a pointer to an array 
of __u64 as its only argument. The skb is actually in ctx[0], sch is in 
ctx[1]... etc. When ctx[0] is read to get the skb pointer (e.g. r1 = ctx[0]), 
btf_ctx_access() marks the reg_type as PTR_TRUSTED. It also needs to somehow 
initialize reg->ref_obj_id with the id obtained earlier from 
acquire_reference_state() during check_struct_ops_btf_id().
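
Roughly what the BPF_PROG() usage above boils down to (simplified from
libbpf's bpf_tracing.h, details elided):

SEC("struct_ops")
int bpf_dropall_enqueue(unsigned long long *ctx)
{
	/* BPF_PROG() casts the __u64 ctx slots back to typed arguments */
	struct sk_buff *skb = (void *)ctx[0];
	struct Qdisc *sch = (void *)ctx[1];
	struct sk_buff **to_free = (void *)ctx[2];

	return bpf_qdisc_drop(skb, sch, to_free);
}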


> 2. Returning a kptr from a program and treating it as releasing the reference.

e.g. for dequeue:

struct Qdisc_ops {
	/* ... */
	struct sk_buff *        (*dequeue)(struct Qdisc *);
};


Right now the verifier should complain on check_reference_leak() if the 
struct_ops bpf prog is returning a referenced kptr.

Unlike an argument, the return type of a function does not have a name to tag. 
It is the first case of a struct_ops bpf_prog returning a pointer. One idea is 
to assume it must be a trusted pointer (PTR_TRUSTED), and the verifier should 
check that it indeed has the PTR_TRUSTED flag.

Maybe release_reference_state() can be called to assume the kernel will release 
it, as long as the returned pointer is PTR_TRUSTED and the type matches the 
return type of the ops. Take a look at check_return_code().
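
A hedged sketch of what a struct_ops dequeue could then look like, where
bpf_skb_collection_pop() stands in for whatever reference-acquiring
kfunc/collection ends up holding the queued skbs (it is not an existing
kfunc):

SEC("struct_ops")
struct sk_buff *BPF_PROG(bpf_fifo_dequeue, struct Qdisc *sch)
{
	struct sk_buff *skb;

	skb = bpf_skb_collection_pop();		/* acquires a reference */
	if (!skb)
		return NULL;			/* nothing queued */

	return skb;	/* ownership (and the release) passes to the kernel */
}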
Kui-Feng Lee Jan. 30, 2024, 5:49 p.m. UTC | #6
On 1/29/24 22:39, Martin KaFai Lau wrote:
> On 1/26/24 5:17 PM, Amery Hung wrote:
>> On Thu, Jan 25, 2024 at 6:22 PM Martin KaFai Lau 
>> <martin.lau@linux.dev> wrote:
>>>
>>> On 1/23/24 9:22 PM, Amery Hung wrote:
>>>>> I looked at the high level of the patchset. The major ops that it 
>>>>> wants to be
>>>>> programmable in bpf is the ".enqueue" and ".dequeue" (+ ".init" and 
>>>>> ".reset" in
>>>>> patch 4 and patch 5).
>>>>>
>>>>> This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach 
>>>>> types (each for
>>>>> ".enqueue", ".dequeue", ".init", and ".reset"), and a new 
>>>>> "bpf_qdisc_ctx" in the
>>>>> uapi. It is no long an acceptable way to add new bpf extension.
>>>>>
>>>>> Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely 
>>>>> implemented
>>>>> in bpf (with the help of new kfuncs if needed)? Then a struct_ops 
>>>>> for Qdisc_ops
>>>>> can be created. The bpf Qdisc_ops can be loaded through the 
>>>>> existing struct_ops api.
>>>>>
>>>> Partially. If using struct_ops, I think we'll need another structure
>>>> like the following in bpf qdisc to be implemented with struct_ops bpf:
>>>>
>>>> struct bpf_qdisc_ops {
>>>>       int (*enqueue) (struct sk_buff *skb)
>>>>       void (*dequeue) (void)
>>>>       void (*init) (void)
>>>>       void (*reset) (void)
>>>> };
>>>>
>>>> Then, Qdisc_ops will wrap around them to handle things that cannot be
>>>> implemented with bpf (e.g., sch_tree_lock, returning a skb ptr).
>>>
>>> We can see how those limitations (calling sch_tree_lock() and 
>>> returning a ptr)
>>> can be addressed in bpf. This will also help other similar use cases.
>>>
>>
>> For kptr, I wonder if we can support the following semantics in bpf if
>> they make sense:
> 
> I think they are useful but they are not fully supported now.
> 
> Some thoughts below.
> 
>> 1. Passing a referenced kptr into a bpf program, which will also need
>> to be released, or exchanged into maps or allocated objects.
> 
> "enqueue" should be the one considering here:
> 
> struct Qdisc_ops {
>      /* ... */
>      int                     (*enqueue)(struct sk_buff *skb,
>                         struct Qdisc *sch,
>                         struct sk_buff **to_free);
> 
> };
> 
> The verifier only marks the skb as a trusted kptr but does not mark its 
> reg->ref_obj_id. Take a look at btf_ctx_access(). In particular:
> 
>      if (prog_args_trusted(prog))
>          info->reg_type |= PTR_TRUSTED;
> 
> The verifier does not know the skb ownership is passed into the 
> ".enqueue" ops and does not know the bpf prog needs to release it or 
> store it in a map.
> 
> The verifier tracks the reference state when a KF_ACQUIRE kfunc is 
> called (just an example, not saying we need to use KF_ACQUIRE kfunc). 
> Take a look at acquire_reference_state() which is the useful one here.
> 
> Whenever the verifier is loading the ".enqueue" bpf_prog, the verifier 
> can always acquire_reference_state() for the "struct sk_buff *skb" 
> argument.
> 
> Take a look at a recent RFC: 
> https://lore.kernel.org/bpf/20240122212217.1391878-1-thinker.li@gmail.com/
> which is tagging the argument of an ops (e.g. ".enqueue" here). That RFC 
> patch is tagging the argument could be NULL by appending "__nullable" to 
> the argument name. The verifier will enforce that the bpf prog must 
> check for NULL first.
> 
> The similar idea can be used here but with a different tagging (for 
> example, "__must_release", admittedly not a good name). While the RFC 
> patch is in-progress, for now, may be hardcode for the ".enqueue" ops in 
> check_struct_ops_btf_id() and always acquire_reference_state() for the 
> skb. This part can be adjusted later once the RFC patch will be in shape.
> 
> 
> Then one more thing is to track when the struct_ops bpf prog is actually 
> reading the value of the skb pointer. One thing is worth to mention 
> here, e.g. a struct_ops prog for enqueue:
> 
> SEC("struct_ops")
> int BPF_PROG(bpf_dropall_enqueue, struct sk_buff *skb, struct Qdisc *sch,
>           struct sk_buff **to_free)
> {
>      return bpf_qdisc_drop(skb, sch, to_free);
> }
> 
> Take a look at the BPF_PROG macro, the bpf prog is getting a pointer to 
> an array of __u64 as the only argument. The skb is actually in ctx[0], 
> sch is in ctx[1]...etc. When ctx[0] is read to get the skb pointer (e.g. 
> r1 = ctx[0]), btf_ctx_access() marks the reg_type to PTR_TRUSTED. It 
> needs to also initialize the reg->ref_obj_id by the id obtained earlier 
> from acquire_reference_state() during check_struct_ops_btf_id() somehow.
> 
> 
>> 2. Returning a kptr from a program and treating it as releasing the 
>> reference.
> 
> e.g. for dequeue:
> 
> struct Qdisc_ops {
>      /* ... */
>      struct sk_buff *        (*dequeue)(struct Qdisc *);
> };
> 
> 
> Right now the verifier should complain on check_reference_leak() if the 
> struct_ops bpf prog is returning a referenced kptr.
> 
> Unlike an argument, the return type of a function does not have a name 
> to tag. It is the first case that a struct_ops bpf_prog returning a 

We may tag the stub functions instead, right?
Is the purpose here to return a referenced pointer from a struct_ops
operator without the verifier complaining?

> pointer. One idea is to assume it must be a trusted pointer 
> (PTR_TRUSTED) and the verifier should check it is indeed with 
> PTR_TRUSTED flag.
> 
> May be release_reference_state() can be called to assume the kernel will 
> release it as long as the return pointer type is PTR_TRUSTED and the 
> type matches the return type of the ops. Take a look at 
> check_return_code().
>
Martin KaFai Lau Jan. 31, 2024, 1:01 a.m. UTC | #7
On 1/30/24 9:49 AM, Kui-Feng Lee wrote:
>>> 2. Returning a kptr from a program and treating it as releasing the reference.
>>
>> e.g. for dequeue:
>>
>> struct Qdisc_ops {
>>      /* ... */
>>      struct sk_buff *        (*dequeue)(struct Qdisc *);
>> };
>>
>>
>> Right now the verifier should complain on check_reference_leak() if the 
>> struct_ops bpf prog is returning a referenced kptr.
>>
>> Unlike an argument, the return type of a function does not have a name to tag. 
>> It is the first case that a struct_ops bpf_prog returning a 
> 
> We may tag the stub functions instead, right?

What is the suggestion on how to tag the return type?

I was suggesting it doesn't need a tag and the verifier should by default require 
a trusted ptr for the pointer returned by struct_ops. The pointer arguments and 
the return pointer of a struct_ops should both be trusted ptrs.

> Is the purpose here to return a referenced pointer from a struct_ops
> operator without verifier complaining?

Yes, we basically need to teach the verifier that the kernel will do the reference release.

> 
>> pointer. One idea is to assume it must be a trusted pointer (PTR_TRUSTED) and 
>> the verifier should check it is indeed with PTR_TRUSTED flag.
>>
>> May be release_reference_state() can be called to assume the kernel will 
>> release it as long as the return pointer type is PTR_TRUSTED and the type 
>> matches the return type of the ops. Take a look at check_return_code().
Amery Hung Jan. 31, 2024, 4:23 p.m. UTC | #8
On Mon, Jan 29, 2024 at 10:39 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >> We can see how those limitations (calling sch_tree_lock() and returning a ptr)
> >> can be addressed in bpf. This will also help other similar use cases.
> >>
> >
> > For kptr, I wonder if we can support the following semantics in bpf if
> > they make sense:
>
> I think they are useful but they are not fully supported now.
>
> Some thoughts below.
>
> > 1. Passing a referenced kptr into a bpf program, which will also need
> > to be released, or exchanged into maps or allocated objects.
>
> "enqueue" should be the one considering here:
>
> struct Qdisc_ops {
>         /* ... */
>         int                     (*enqueue)(struct sk_buff *skb,
>                                            struct Qdisc *sch,
>                                            struct sk_buff **to_free);
>
> };
>
> The verifier only marks the skb as a trusted kptr but does not mark its
> reg->ref_obj_id. Take a look at btf_ctx_access(). In particular:
>
>         if (prog_args_trusted(prog))
>                 info->reg_type |= PTR_TRUSTED;
>
> The verifier does not know the skb ownership is passed into the ".enqueue" ops
> and does not know the bpf prog needs to release it or store it in a map.
>
> The verifier tracks the reference state when a KF_ACQUIRE kfunc is called (just
> an example, not saying we need to use KF_ACQUIRE kfunc). Take a look at
> acquire_reference_state() which is the useful one here.
>
> Whenever the verifier is loading the ".enqueue" bpf_prog, the verifier can
> always acquire_reference_state() for the "struct sk_buff *skb" argument.
>
> Take a look at a recent RFC:
> https://lore.kernel.org/bpf/20240122212217.1391878-1-thinker.li@gmail.com/
> which is tagging the argument of an ops (e.g. ".enqueue" here). That RFC patch
> is tagging the argument could be NULL by appending "__nullable" to the argument
> name. The verifier will enforce that the bpf prog must check for NULL first.
>
> The similar idea can be used here but with a different tagging (for example,
> "__must_release", admittedly not a good name). While the RFC patch is
> in-progress, for now, may be hardcode for the ".enqueue" ops in
> check_struct_ops_btf_id() and always acquire_reference_state() for the skb. This
> part can be adjusted later once the RFC patch will be in shape.
>

Makes sense. One more thing to consider here is that .enqueue is
actually both a reference-acquiring and a reference-releasing function
at the same time. Assuming a ctx written to by a struct_ops program can
be seen by the kernel, another new tag for the "to_free" argument will
still be needed so that the verifier can recognize when the skb is
written to "to_free".

>
> Then one more thing is to track when the struct_ops bpf prog is actually reading
> the value of the skb pointer. One thing is worth to mention here, e.g. a
> struct_ops prog for enqueue:
>
> SEC("struct_ops")
> int BPF_PROG(bpf_dropall_enqueue, struct sk_buff *skb, struct Qdisc *sch,
>              struct sk_buff **to_free)
> {
>         return bpf_qdisc_drop(skb, sch, to_free);
> }
>
> Take a look at the BPF_PROG macro, the bpf prog is getting a pointer to an array
> of __u64 as the only argument. The skb is actually in ctx[0], sch is in
> ctx[1]...etc. When ctx[0] is read to get the skb pointer (e.g. r1 = ctx[0]),
> btf_ctx_access() marks the reg_type to PTR_TRUSTED. It needs to also initialize
> the reg->ref_obj_id by the id obtained earlier from acquire_reference_state()
> during check_struct_ops_btf_id() somehow.
>
>
> > 2. Returning a kptr from a program and treating it as releasing the reference.
>
> e.g. for dequeue:
>
> struct Qdisc_ops {
>         /* ... */
>         struct sk_buff *        (*dequeue)(struct Qdisc *);
> };
>
>
> Right now the verifier should complain on check_reference_leak() if the
> struct_ops bpf prog is returning a referenced kptr.
>
> Unlike an argument, the return type of a function does not have a name to tag.
> It is the first case that a struct_ops bpf_prog returning a pointer. One idea is
> to assume it must be a trusted pointer (PTR_TRUSTED) and the verifier should
> check it is indeed with PTR_TRUSTED flag.
>
> May be release_reference_state() can be called to assume the kernel will release
> it as long as the return pointer type is PTR_TRUSTED and the type matches the
> return type of the ops. Take a look at check_return_code().
Kui-Feng Lee Jan. 31, 2024, 4:49 p.m. UTC | #9
On 1/30/24 17:01, Martin KaFai Lau wrote:
> On 1/30/24 9:49 AM, Kui-Feng Lee wrote:
>>>> 2. Returning a kptr from a program and treating it as releasing the 
>>>> reference.
>>>
>>> e.g. for dequeue:
>>>
>>> struct Qdisc_ops {
>>>      /* ... */
>>>      struct sk_buff *        (*dequeue)(struct Qdisc *);
>>> };
>>>
>>>
>>> Right now the verifier should complain on check_reference_leak() if 
>>> the struct_ops bpf prog is returning a referenced kptr.
>>>
>>> Unlike an argument, the return type of a function does not have a 
>>> name to tag. It is the first case that a struct_ops bpf_prog returning a 
>>
>> We may tag the stub functions instead, right?
> 
> What is the suggestion on how to tag the return type?
> 
> I was suggesting it doesn't need to tag and it should by default require 
> a trusted ptr for the pointer returned by struct_ops. The pointer 
> argument and the return pointer of a struct_ops should be a trusted ptr.


That makes sense to me. Should we also allow operators to return a null
pointer?

> 
>> Is the purpose here to return a referenced pointer from a struct_ops
>> operator without verifier complaining?
> 
> Yes, basically need to teach the verifier the kernel will do the 
> reference release.
> 
>>
>>> pointer. One idea is to assume it must be a trusted pointer 
>>> (PTR_TRUSTED) and the verifier should check it is indeed with 
>>> PTR_TRUSTED flag.
>>>
>>> May be release_reference_state() can be called to assume the kernel 
>>> will release it as long as the return pointer type is PTR_TRUSTED and 
>>> the type matches the return type of the ops. Take a look at 
>>> check_return_code(). 
>
Amery Hung Jan. 31, 2024, 4:59 p.m. UTC | #10
On Wed, Jan 31, 2024 at 8:49 AM Kui-Feng Lee <sinquersw@gmail.com> wrote:
>
>
>
> On 1/30/24 17:01, Martin KaFai Lau wrote:
> > On 1/30/24 9:49 AM, Kui-Feng Lee wrote:
> >>>> 2. Returning a kptr from a program and treating it as releasing the
> >>>> reference.
> >>>
> >>> e.g. for dequeue:
> >>>
> >>> struct Qdisc_ops {
> >>>      /* ... */
> >>>      struct sk_buff *        (*dequeue)(struct Qdisc *);
> >>> };
> >>>
> >>>
> >>> Right now the verifier should complain on check_reference_leak() if
> >>> the struct_ops bpf prog is returning a referenced kptr.
> >>>
> >>> Unlike an argument, the return type of a function does not have a
> >>> name to tag. It is the first case that a struct_ops bpf_prog returning a
> >>
> >> We may tag the stub functions instead, right?
> >
> > What is the suggestion on how to tag the return type?
> >
> > I was suggesting it doesn't need to tag and it should by default require
> > a trusted ptr for the pointer returned by struct_ops. The pointer
> > argument and the return pointer of a struct_ops should be a trusted ptr.
>
>
> That make sense to me. Should we also allow operators to return a null
> pointer?
>

.dequeue in Qdisc_ops can return a null pointer when there is no skb
to be dequeued, so I think that should be allowed.
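
For illustration, a minimal sketch of such a dequeue prog (untested; it
assumes sk_buff can be stashed as a referenced kptr in a map, which would
need an skb dtor registered, and that the verifier releases the reference
state for the returned trusted pointer):

/* vmlinux.h, bpf/bpf_helpers.h and bpf/bpf_tracing.h assumed */

struct skb_slot {
	struct sk_buff __kptr *skb;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct skb_slot);
} skb_stash SEC(".maps");

SEC("struct_ops/dequeue")
struct sk_buff *BPF_PROG(test_dequeue, struct Qdisc *sch)
{
	struct skb_slot *slot;
	__u32 zero = 0;

	slot = bpf_map_lookup_elem(&skb_stash, &zero);
	if (!slot)
		return NULL;

	/* Either a referenced skb stashed earlier by .enqueue, or NULL
	 * when the queue is empty, so a NULL return must be accepted.
	 */
	return bpf_kptr_xchg(&slot->skb, NULL);
}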

> >
> >> Is the purpose here to return a referenced pointer from a struct_ops
> >> operator without verifier complaining?
> >
> > Yes, basically need to teach the verifier the kernel will do the
> > reference release.
> >
> >>
> >>> pointer. One idea is to assume it must be a trusted pointer
> >>> (PTR_TRUSTED) and the verifier should check it is indeed with
> >>> PTR_TRUSTED flag.
> >>>
> >>> May be release_reference_state() can be called to assume the kernel
> >>> will release it as long as the return pointer type is PTR_TRUSTED and
> >>> the type matches the return type of the ops. Take a look at
> >>> check_return_code().
> >
Martin KaFai Lau Feb. 2, 2024, 1:47 a.m. UTC | #11
On 1/31/24 8:23 AM, Amery Hung wrote:
>>> 1. Passing a referenced kptr into a bpf program, which will also need
>>> to be released, or exchanged into maps or allocated objects.
>> "enqueue" should be the one considering here:
>>
>> struct Qdisc_ops {
>>          /* ... */
>>          int                     (*enqueue)(struct sk_buff *skb,
>>                                             struct Qdisc *sch,
>>                                             struct sk_buff **to_free);
>>
>> };
>>
>> The verifier only marks the skb as a trusted kptr but does not mark its
>> reg->ref_obj_id. Take a look at btf_ctx_access(). In particular:
>>
>>          if (prog_args_trusted(prog))
>>                  info->reg_type |= PTR_TRUSTED;
>>
>> The verifier does not know the skb ownership is passed into the ".enqueue" ops
>> and does not know the bpf prog needs to release it or store it in a map.
>>
>> The verifier tracks the reference state when a KF_ACQUIRE kfunc is called (just
>> an example, not saying we need to use KF_ACQUIRE kfunc). Take a look at
>> acquire_reference_state() which is the useful one here.
>>
>> Whenever the verifier is loading the ".enqueue" bpf_prog, the verifier can
>> always acquire_reference_state() for the "struct sk_buff *skb" argument.
>>
>> Take a look at a recent RFC:
>> https://lore.kernel.org/bpf/20240122212217.1391878-1-thinker.li@gmail.com/
>> which is tagging the argument of an ops (e.g. ".enqueue" here). That RFC patch
>> is tagging the argument could be NULL by appending "__nullable" to the argument
>> name. The verifier will enforce that the bpf prog must check for NULL first.
>>
>> The similar idea can be used here but with a different tagging (for example,
>> "__must_release", admittedly not a good name). While the RFC patch is
>> in-progress, for now, may be hardcode for the ".enqueue" ops in
>> check_struct_ops_btf_id() and always acquire_reference_state() for the skb. This
>> part can be adjusted later once the RFC patch will be in shape.
>>
> Make sense. One more thing to consider here is that .enqueue is
> actually a reference acquiring and releasing function at the same
> time. Assuming ctx written to by a struct_ops program can be seen by
> the kernel, another new tag for the "to_free" argument will still be
> needed so that the verifier can recognize when writing skb to
> "to_free".

I don't think "to_free" needs special tagging. I was thinking the 
"bpf_qdisc_drop" kfunc could be a KF_RELEASE. Ideally, it should be like

__bpf_kfunc int bpf_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
	                       struct sk_buff **to_free)
{
	return qdisc_drop(skb, sch, to_free);
}

However, I don't think the verifier supports pointer to pointer now, meaning
"struct sk_buff **to_free" does not work.

If the ptr indirection spinning in my head is sound, one possible solution to 
unblock the qdisc work is to introduce:

struct bpf_sk_buff_ptr {
	struct sk_buff *skb;
};

and the bpf_qdisc_drop kfunc:

__bpf_kfunc int bpf_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
                                struct bpf_sk_buff_ptr *to_free_list)

and the enqueue prog:

SEC("struct_ops/enqueue")
int BPF_PROG(test_enqueue, struct sk_buff *skb,
              struct Qdisc *sch,
              struct bpf_sk_buff_ptr *to_free_list)
{
	return bpf_qdisc_drop(skb, sch, to_free_list);
}

and the ".is_valid_access" needs to change the btf_type from "struct sk_buff **"
to "struct bpf_sk_buff_ptr *", which is sort of similar to how bpf_tcp_ca.c
changes the "struct sock *" type to the "struct tcp_sock *" type.

I have the compiler-tested idea here: 
https://git.kernel.org/pub/scm/linux/kernel/git/martin.lau/bpf-next.git/log/?h=qdisc-ideas
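
For completeness, the kfunc side of that idea could be wired up roughly like
below (only a sketch of the KF_RELEASE registration; it assumes the skb
pointer is the first member of "struct bpf_sk_buff_ptr" so the cast back to
"struct sk_buff **" is fine):

__bpf_kfunc int bpf_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
			       struct bpf_sk_buff_ptr *to_free)
{
	return qdisc_drop(skb, sch, (struct sk_buff **)to_free);
}

BTF_SET8_START(qdisc_kfunc_ids)
BTF_ID_FLAGS(func, bpf_qdisc_drop, KF_RELEASE)
BTF_SET8_END(qdisc_kfunc_ids)

static const struct btf_kfunc_id_set qdisc_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &qdisc_kfunc_ids,
};

/* and at init time:
 *	register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &qdisc_kfunc_set);
 */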


> 
>> Then one more thing is to track when the struct_ops bpf prog is actually reading
>> the value of the skb pointer. One thing is worth to mention here, e.g. a
>> struct_ops prog for enqueue:
>>
>> SEC("struct_ops")
>> int BPF_PROG(bpf_dropall_enqueue, struct sk_buff *skb, struct Qdisc *sch,
>>               struct sk_buff **to_free)
>> {
>>          return bpf_qdisc_drop(skb, sch, to_free);
>> }
>>
>> Take a look at the BPF_PROG macro, the bpf prog is getting a pointer to an array
>> of __u64 as the only argument. The skb is actually in ctx[0], sch is in
>> ctx[1]...etc. When ctx[0] is read to get the skb pointer (e.g. r1 = ctx[0]),
>> btf_ctx_access() marks the reg_type to PTR_TRUSTED. It needs to also initialize
>> the reg->ref_obj_id by the id obtained earlier from acquire_reference_state()
>> during check_struct_ops_btf_id() somehow.
Amery Hung Feb. 9, 2024, 8:14 p.m. UTC | #12
On Thu, Feb 1, 2024 at 5:47 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 1/31/24 8:23 AM, Amery Hung wrote:
> >>> 1. Passing a referenced kptr into a bpf program, which will also need
> >>> to be released, or exchanged into maps or allocated objects.
> >> "enqueue" should be the one considering here:
> >>
> >> struct Qdisc_ops {
> >>          /* ... */
> >>          int                     (*enqueue)(struct sk_buff *skb,
> >>                                             struct Qdisc *sch,
> >>                                             struct sk_buff **to_free);
> >>
> >> };
> >>
> >> The verifier only marks the skb as a trusted kptr but does not mark its
> >> reg->ref_obj_id. Take a look at btf_ctx_access(). In particular:
> >>
> >>          if (prog_args_trusted(prog))
> >>                  info->reg_type |= PTR_TRUSTED;
> >>
> >> The verifier does not know the skb ownership is passed into the ".enqueue" ops
> >> and does not know the bpf prog needs to release it or store it in a map.
> >>
> >> The verifier tracks the reference state when a KF_ACQUIRE kfunc is called (just
> >> an example, not saying we need to use KF_ACQUIRE kfunc). Take a look at
> >> acquire_reference_state() which is the useful one here.
> >>
> >> Whenever the verifier is loading the ".enqueue" bpf_prog, the verifier can
> >> always acquire_reference_state() for the "struct sk_buff *skb" argument.
> >>
> >> Take a look at a recent RFC:
> >> https://lore.kernel.org/bpf/20240122212217.1391878-1-thinker.li@gmail.com/
> >> which is tagging the argument of an ops (e.g. ".enqueue" here). That RFC patch
> >> is tagging the argument could be NULL by appending "__nullable" to the argument
> >> name. The verifier will enforce that the bpf prog must check for NULL first.
> >>
> >> The similar idea can be used here but with a different tagging (for example,
> >> "__must_release", admittedly not a good name). While the RFC patch is
> >> in-progress, for now, may be hardcode for the ".enqueue" ops in
> >> check_struct_ops_btf_id() and always acquire_reference_state() for the skb. This
> >> part can be adjusted later once the RFC patch will be in shape.
> >>
> > Make sense. One more thing to consider here is that .enqueue is
> > actually a reference acquiring and releasing function at the same
> > time. Assuming ctx written to by a struct_ops program can be seen by
> > the kernel, another new tag for the "to_free" argument will still be
> > needed so that the verifier can recognize when writing skb to
> > "to_free".
>
> I don't think "to_free" needs special tagging. I was thinking the
> "bpf_qdisc_drop" kfunc could be a KF_RELEASE. Ideally, it should be like
>
> __bpf_kfunc int bpf_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
>                                struct sk_buff **to_free)
> {
>         return qdisc_drop(skb, sch, to_free);
> }
>
> However, I don't think the verifier supports pointer to pointer now. Meaning
> "struct sk_buff **to_free" does not work.
>
> If the ptr indirection spinning in my head is sound, one possible solution to
> unblock the qdisc work is to introduce:
>
> struct bpf_sk_buff_ptr {
>         struct sk_buff *skb;
> };
>
> and the bpf_qdisc_drop kfunc:
>
> __bpf_kfunc int bpf_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
>                                 struct bpf_sk_buff_ptr *to_free_list)
>
> and the enqueue prog:
>
> SEC("struct_ops/enqueue")
> int BPF_PROG(test_enqueue, struct sk_buff *skb,
>               struct Qdisc *sch,
>               struct bpf_sk_buff_ptr *to_free_list)
> {
>         return bpf_qdisc_drop(skb, sch, to_free_list);
> }
>
> and the ".is_valid_access" needs to change the btf_type from "struct sk_buff **"
> to "struct bpf_sk_buff_ptr *" which is sort of similar to the bpf_tcp_ca.c that
> is changing the "struct sock *" type to the "struct tcp_sock *" type.
>
> I have the compiler-tested idea here:
> https://git.kernel.org/pub/scm/linux/kernel/git/martin.lau/bpf-next.git/log/?h=qdisc-ideas
>
>
> >
> >> Then one more thing is to track when the struct_ops bpf prog is actually reading
> >> the value of the skb pointer. One thing is worth to mention here, e.g. a
> >> struct_ops prog for enqueue:
> >>
> >> SEC("struct_ops")
> >> int BPF_PROG(bpf_dropall_enqueue, struct sk_buff *skb, struct Qdisc *sch,
> >>               struct sk_buff **to_free)
> >> {
> >>          return bpf_qdisc_drop(skb, sch, to_free);
> >> }
> >>
> >> Take a look at the BPF_PROG macro, the bpf prog is getting a pointer to an array
> >> of __u64 as the only argument. The skb is actually in ctx[0], sch is in
> >> ctx[1]...etc. When ctx[0] is read to get the skb pointer (e.g. r1 = ctx[0]),
> >> btf_ctx_access() marks the reg_type to PTR_TRUSTED. It needs to also initialize
> >> the reg->ref_obj_id by the id obtained earlier from acquire_reference_state()
> >> during check_struct_ops_btf_id() somehow.
>

I appreciate the idea. The pointer indirection works without problems.
I now have a working fifo bpf qdisc using struct_ops. I will explore
how other parts of qdisc work with struct_ops.
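
FWIW, the struct_ops declaration on the BPF side can look roughly like this
(section name and field assignments are assumptions following how other
struct_ops users do it, with the test_enqueue/test_dequeue progs from
earlier in the thread), and userspace registers it with
bpf_map__attach_struct_ops() from the skeleton:

SEC(".struct_ops")
struct Qdisc_ops fifo_qdisc_ops = {
	.enqueue	= (void *)test_enqueue,
	.dequeue	= (void *)test_dequeue,
	.id		= "bpf_fifo",
};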

Thanks,
Amery

Patch

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a4247377e951..3e35033a9126 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -83,6 +83,10 @@  BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
 	      struct bpf_nf_ctx, struct bpf_nf_ctx)
 #endif
+#ifdef CONFIG_NET
+BPF_PROG_TYPE(BPF_PROG_TYPE_QDISC, tc_qdisc,
+	      struct bpf_qdisc_ctx, struct bpf_qdisc_ctx)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0bb92414c036..df280bbb7c0d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -997,6 +997,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_QDISC,
 };
 
 enum bpf_attach_type {
@@ -1056,6 +1057,8 @@  enum bpf_attach_type {
 	BPF_CGROUP_UNIX_GETSOCKNAME,
 	BPF_NETKIT_PRIMARY,
 	BPF_NETKIT_PEER,
+	BPF_QDISC_ENQUEUE,
+	BPF_QDISC_DEQUEUE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -7357,4 +7360,22 @@  struct bpf_iter_num {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_qdisc_ctx {
+	__bpf_md_ptr(struct sk_buff *, skb);
+	__u32 classid;
+	__u64 expire;
+	__u64 delta_ns;
+};
+
+enum {
+	SCH_BPF_QUEUED,
+	SCH_BPF_DEQUEUED = SCH_BPF_QUEUED,
+	SCH_BPF_DROP,
+	SCH_BPF_CN,
+	SCH_BPF_THROTTLE,
+	SCH_BPF_PASS,
+	SCH_BPF_BYPASS,
+	SCH_BPF_STOLEN,
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index f762a10bfb78..d05462309f5a 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1317,4 +1317,20 @@  enum {
 
 #define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
 
+#define TCA_SCH_BPF_FLAG_DIRECT _BITUL(0)
+enum {
+	TCA_SCH_BPF_UNSPEC,
+	TCA_SCH_BPF_ENQUEUE_PROG_NAME,	/* string */
+	TCA_SCH_BPF_ENQUEUE_PROG_FD,	/* u32 */
+	TCA_SCH_BPF_ENQUEUE_PROG_ID,	/* u32 */
+	TCA_SCH_BPF_ENQUEUE_PROG_TAG,	/* data */
+	TCA_SCH_BPF_DEQUEUE_PROG_NAME,	/* string */
+	TCA_SCH_BPF_DEQUEUE_PROG_FD,	/* u32 */
+	TCA_SCH_BPF_DEQUEUE_PROG_ID,	/* u32 */
+	TCA_SCH_BPF_DEQUEUE_PROG_TAG,	/* data */
+	__TCA_SCH_BPF_MAX,
+};
+
+#define TCA_SCH_BPF_MAX (__TCA_SCH_BPF_MAX - 1)
+
 #endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 15d71d2986d3..ee8d6c127b04 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -217,6 +217,7 @@  enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_SOCKET_FILTER,
 	BTF_KFUNC_HOOK_LWT,
 	BTF_KFUNC_HOOK_NETFILTER,
+	BTF_KFUNC_HOOK_QDISC,
 	BTF_KFUNC_HOOK_MAX,
 };
 
@@ -5928,6 +5929,8 @@  static bool prog_args_trusted(const struct bpf_prog *prog)
 		return bpf_lsm_is_trusted(prog);
 	case BPF_PROG_TYPE_STRUCT_OPS:
 		return true;
+	case BPF_PROG_TYPE_QDISC:
+		return true;
 	default:
 		return false;
 	}
@@ -7865,6 +7868,8 @@  static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 		return BTF_KFUNC_HOOK_LWT;
 	case BPF_PROG_TYPE_NETFILTER:
 		return BTF_KFUNC_HOOK_NETFILTER;
+	case BPF_PROG_TYPE_QDISC:
+		return BTF_KFUNC_HOOK_QDISC;
 	default:
 		return BTF_KFUNC_HOOK_MAX;
 	}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 56b0c1f678ee..d5e581ccd9a0 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2610,6 +2610,7 @@  static int __init kfunc_init(void)
 
 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_QDISC, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
 	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
 						  ARRAY_SIZE(generic_dtors),
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 13eb50446e7a..1838bddd8526 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2502,6 +2502,14 @@  bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		if (expected_attach_type == BPF_NETFILTER)
 			return 0;
 		return -EINVAL;
+	case BPF_PROG_TYPE_QDISC:
+		switch (expected_attach_type) {
+		case BPF_QDISC_ENQUEUE:
+		case BPF_QDISC_DEQUEUE:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	case BPF_PROG_TYPE_SYSCALL:
 	case BPF_PROG_TYPE_EXT:
 		if (expected_attach_type)
diff --git a/net/core/filter.c b/net/core/filter.c
index 383f96b0a1c7..f25a0b6b5d56 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8889,6 +8889,90 @@  static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
 	return ret;
 }
 
+static int tc_qdisc_prologue(struct bpf_insn *insn_buf, bool direct_write,
+			     const struct bpf_prog *prog)
+{
+	return bpf_unclone_prologue(insn_buf, direct_write, prog,
+				    SCH_BPF_DROP);
+}
+
+BTF_ID_LIST_SINGLE(tc_qdisc_ctx_access_btf_ids, struct, sk_buff)
+
+static bool tc_qdisc_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     const struct bpf_prog *prog,
+				     struct bpf_insn_access_aux *info)
+{
+	struct btf *btf;
+
+	if (off < 0 || off >= sizeof(struct bpf_qdisc_ctx))
+		return false;
+
+	switch (off) {
+	case offsetof(struct bpf_qdisc_ctx, skb):
+		if (type == BPF_WRITE ||
+		    size != sizeof_field(struct bpf_qdisc_ctx, skb))
+			return false;
+
+		if (prog->expected_attach_type != BPF_QDISC_ENQUEUE)
+			return false;
+
+		btf = bpf_get_btf_vmlinux();
+		if (IS_ERR_OR_NULL(btf))
+			return false;
+
+		info->btf = btf;
+		info->btf_id = tc_qdisc_ctx_access_btf_ids[0];
+		info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+		return true;
+	case bpf_ctx_range(struct bpf_qdisc_ctx, classid):
+		return size == sizeof_field(struct bpf_qdisc_ctx, classid);
+	case bpf_ctx_range(struct bpf_qdisc_ctx, expire):
+		return size == sizeof_field(struct bpf_qdisc_ctx, expire);
+	case bpf_ctx_range(struct bpf_qdisc_ctx, delta_ns):
+		return size == sizeof_field(struct bpf_qdisc_ctx, delta_ns);
+	default:
+		return false;
+	}
+
+	return false;
+}
+
+static int tc_qdisc_btf_struct_access(struct bpf_verifier_log *log,
+				      const struct bpf_reg_state *reg,
+				      int off, int size)
+{
+	const struct btf_type *skbt, *t;
+	size_t end;
+
+	skbt = btf_type_by_id(reg->btf, tc_qdisc_ctx_access_btf_ids[0]);
+	t = btf_type_by_id(reg->btf, reg->btf_id);
+	if (t != skbt)
+		return -EACCES;
+
+	switch (off) {
+	case offsetof(struct sk_buff, cb) ...
+	     offsetofend(struct sk_buff, cb) - 1:
+		end = offsetofend(struct sk_buff, cb);
+		break;
+	case offsetof(struct sk_buff, tstamp):
+		end = offsetofend(struct sk_buff, tstamp);
+		break;
+	default:
+		bpf_log(log, "no write support to skb at off %d\n", off);
+		return -EACCES;
+	}
+
+	if (off + size > end) {
+		bpf_log(log,
+			"write access at off %d with size %d beyond the member of sk_buff ended at %zu\n",
+			off, size, end);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
 static bool __is_valid_xdp_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct xdp_md))
@@ -10890,6 +10974,18 @@  const struct bpf_prog_ops tc_cls_act_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
+const struct bpf_verifier_ops tc_qdisc_verifier_ops = {
+	.get_func_proto		= tc_cls_act_func_proto,
+	.is_valid_access	= tc_qdisc_is_valid_access,
+	.gen_prologue		= tc_qdisc_prologue,
+	.gen_ld_abs		= bpf_gen_ld_abs,
+	.btf_struct_access	= tc_qdisc_btf_struct_access,
+};
+
+const struct bpf_prog_ops tc_qdisc_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
+};
+
 const struct bpf_verifier_ops xdp_verifier_ops = {
 	.get_func_proto		= xdp_func_proto,
 	.is_valid_access	= xdp_is_valid_access,
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 470c70deffe2..e4ece091af4d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -403,6 +403,21 @@  config NET_SCH_ETS
 
 	  If unsure, say N.
 
+config NET_SCH_BPF
+	tristate "eBPF based programmable queue discipline"
+	help
+	  This eBPF based queue discipline offers a way to program your
+	  own packet scheduling algorithm. This is a classful qdisc which
+	  also allows you to define the class hierarchy.
+
+	  Say Y here if you want to use the eBPF based programmable queue
+	  discipline.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_bpf.
+
+	  If unsure, say N.
+
 menuconfig NET_SCH_DEFAULT
 	bool "Allow override default queue discipline"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index b5fd49641d91..4e24c6c79cb8 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -63,6 +63,7 @@  obj-$(CONFIG_NET_SCH_FQ_PIE)	+= sch_fq_pie.o
 obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
 obj-$(CONFIG_NET_SCH_ETF)	+= sch_etf.o
 obj-$(CONFIG_NET_SCH_TAPRIO)	+= sch_taprio.o
+obj-$(CONFIG_NET_SCH_BPF)	+= sch_bpf.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_bpf.c b/net/sched/sch_bpf.c
new file mode 100644
index 000000000000..56f3ab9c6059
--- /dev/null
+++ b/net/sched/sch_bpf.c
@@ -0,0 +1,537 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Programmable Qdisc with eBPF
+ *
+ * Copyright (C) 2022, ByteDance, Cong Wang <cong.wang@bytedance.com>
+ */
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+#define ACT_BPF_NAME_LEN	256
+
+struct sch_bpf_prog {
+	struct bpf_prog *prog;
+	const char *name;
+};
+
+struct sch_bpf_class {
+	struct Qdisc_class_common common;
+	struct Qdisc *qdisc;
+
+	unsigned int drops;
+	unsigned int overlimits;
+	struct gnet_stats_basic_sync bstats;
+};
+
+struct bpf_sched_data {
+	struct tcf_proto __rcu *filter_list; /* optional external classifier */
+	struct tcf_block *block;
+	struct Qdisc_class_hash clhash;
+	struct sch_bpf_prog __rcu enqueue_prog;
+	struct sch_bpf_prog __rcu dequeue_prog;
+
+	struct qdisc_watchdog watchdog;
+};
+
+static int sch_bpf_dump_prog(const struct sch_bpf_prog *prog, struct sk_buff *skb,
+			     int name, int id, int tag)
+{
+	struct nlattr *nla;
+
+	if (prog->name &&
+	    nla_put_string(skb, name, prog->name))
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, id, prog->prog->aux->id))
+		return -EMSGSIZE;
+
+	nla = nla_reserve(skb, tag, sizeof(prog->prog->tag));
+	if (!nla)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(nla), prog->prog->tag, nla_len(nla));
+	return 0;
+}
+
+static int sch_bpf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
+	if (!opts)
+		goto nla_put_failure;
+
+	if (sch_bpf_dump_prog(&q->enqueue_prog, skb, TCA_SCH_BPF_ENQUEUE_PROG_NAME,
+			      TCA_SCH_BPF_ENQUEUE_PROG_ID, TCA_SCH_BPF_ENQUEUE_PROG_TAG))
+		goto nla_put_failure;
+	if (sch_bpf_dump_prog(&q->dequeue_prog, skb, TCA_SCH_BPF_DEQUEUE_PROG_NAME,
+			      TCA_SCH_BPF_DEQUEUE_PROG_ID, TCA_SCH_BPF_DEQUEUE_PROG_TAG))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	return -1;
+}
+
+static int sch_bpf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	return 0;
+}
+
+static struct sch_bpf_class *sch_bpf_find(struct Qdisc *sch, u32 classid)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+	struct Qdisc_class_common *clc;
+
+	clc = qdisc_class_find(&q->clhash, classid);
+	if (!clc)
+		return NULL;
+	return container_of(clc, struct sch_bpf_class, common);
+}
+
+static int sch_bpf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+			   struct sk_buff **to_free)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+	unsigned int len = qdisc_pkt_len(skb);
+	struct bpf_qdisc_ctx ctx = {};
+	int res = NET_XMIT_SUCCESS;
+	struct sch_bpf_class *cl;
+	struct bpf_prog *enqueue;
+
+	enqueue = rcu_dereference(q->enqueue_prog.prog);
+	if (!enqueue)
+		return NET_XMIT_DROP;
+
+	ctx.skb = skb;
+	ctx.classid = sch->handle;
+	res = bpf_prog_run(enqueue, &ctx);
+	switch (res) {
+	case SCH_BPF_THROTTLE:
+		qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
+		qdisc_qstats_overlimit(sch);
+		fallthrough;
+	case SCH_BPF_QUEUED:
+		qdisc_qstats_backlog_inc(sch, skb);
+		return NET_XMIT_SUCCESS;
+	case SCH_BPF_BYPASS:
+		qdisc_qstats_drop(sch);
+		__qdisc_drop(skb, to_free);
+		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	case SCH_BPF_STOLEN:
+		__qdisc_drop(skb, to_free);
+		return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+	case SCH_BPF_CN:
+		return NET_XMIT_CN;
+	case SCH_BPF_PASS:
+		break;
+	default:
+		return qdisc_drop(skb, sch, to_free);
+	}
+
+	cl = sch_bpf_find(sch, ctx.classid);
+	if (!cl || !cl->qdisc)
+		return qdisc_drop(skb, sch, to_free);
+
+	res = qdisc_enqueue(skb, cl->qdisc, to_free);
+	if (res != NET_XMIT_SUCCESS) {
+		if (net_xmit_drop_count(res)) {
+			qdisc_qstats_drop(sch);
+			cl->drops++;
+		}
+		return res;
+	}
+
+	sch->qstats.backlog += len;
+	sch->q.qlen++;
+	return res;
+}
+
+DEFINE_PER_CPU(struct sk_buff*, bpf_skb_dequeue);
+
+static struct sk_buff *sch_bpf_dequeue(struct Qdisc *sch)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+	struct bpf_qdisc_ctx ctx = {};
+	struct sk_buff *skb = NULL;
+	struct bpf_prog *dequeue;
+	struct sch_bpf_class *cl;
+	int res;
+
+	dequeue = rcu_dereference(q->dequeue_prog.prog);
+	if (!dequeue)
+		return NULL;
+
+	__this_cpu_write(bpf_skb_dequeue, NULL);
+	ctx.classid = sch->handle;
+	res = bpf_prog_run(dequeue, &ctx);
+	switch (res) {
+	case SCH_BPF_DEQUEUED:
+		skb = __this_cpu_read(bpf_skb_dequeue);
+		qdisc_bstats_update(sch, skb);
+		qdisc_qstats_backlog_dec(sch, skb);
+		break;
+	case SCH_BPF_THROTTLE:
+		qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
+		qdisc_qstats_overlimit(sch);
+		cl = sch_bpf_find(sch, ctx.classid);
+		if (cl)
+			cl->overlimits++;
+		return NULL;
+	case SCH_BPF_PASS:
+		cl = sch_bpf_find(sch, ctx.classid);
+		if (!cl || !cl->qdisc)
+			return NULL;
+		skb = qdisc_dequeue_peeked(cl->qdisc);
+		if (skb) {
+			bstats_update(&cl->bstats, skb);
+			qdisc_bstats_update(sch, skb);
+			qdisc_qstats_backlog_dec(sch, skb);
+			sch->q.qlen--;
+		}
+		break;
+	}
+
+	return skb;
+}
+
+static struct Qdisc *sch_bpf_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct sch_bpf_class *cl = (struct sch_bpf_class *)arg;
+
+	return cl->qdisc;
+}
+
+static int sch_bpf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+			 struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+	struct sch_bpf_class *cl = (struct sch_bpf_class *)arg;
+
+	if (new)
+		*old = qdisc_replace(sch, new, &cl->qdisc);
+	return 0;
+}
+
+static unsigned long sch_bpf_bind(struct Qdisc *sch, unsigned long parent,
+				  u32 classid)
+{
+	return 0;
+}
+
+static void sch_bpf_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long sch_bpf_search(struct Qdisc *sch, u32 handle)
+{
+	return (unsigned long)sch_bpf_find(sch, handle);
+}
+
+static struct tcf_block *sch_bpf_tcf_block(struct Qdisc *sch, unsigned long cl,
+					   struct netlink_ext_ack *extack)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return q->block;
+}
+
+static const struct nla_policy sch_bpf_policy[TCA_SCH_BPF_MAX + 1] = {
+	[TCA_SCH_BPF_ENQUEUE_PROG_FD]	= { .type = NLA_U32 },
+	[TCA_SCH_BPF_ENQUEUE_PROG_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = ACT_BPF_NAME_LEN },
+	[TCA_SCH_BPF_DEQUEUE_PROG_FD]	= { .type = NLA_U32 },
+	[TCA_SCH_BPF_DEQUEUE_PROG_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = ACT_BPF_NAME_LEN },
+};
+
+static int bpf_init_prog(struct nlattr *fd, struct nlattr *name, struct sch_bpf_prog *prog)
+{
+	struct bpf_prog *fp, *old_fp;
+	char *prog_name = NULL;
+	u32 bpf_fd;
+
+	if (!fd)
+		return -EINVAL;
+	bpf_fd = nla_get_u32(fd);
+
+	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_QDISC);
+	if (IS_ERR(fp))
+		return PTR_ERR(fp);
+
+	if (name) {
+		prog_name = nla_memdup(name, GFP_KERNEL);
+		if (!prog_name) {
+			bpf_prog_put(fp);
+			return -ENOMEM;
+		}
+	}
+
+	prog->name = prog_name;
+
+	/* updates to prog->prog are prevented since the caller holds
+	 * sch_tree_lock
+	 */
+	old_fp = rcu_replace_pointer(prog->prog, fp, 1);
+	if (old_fp)
+		bpf_prog_put(old_fp);
+
+	return 0;
+}
+
+static void bpf_cleanup_prog(struct sch_bpf_prog *prog)
+{
+	struct bpf_prog *old_fp = NULL;
+
+	/* updates to prog->prog are prevented since the caller holds
+	 * sch_tree_lock
+	 */
+	old_fp = rcu_replace_pointer(prog->prog, old_fp, 1);
+	if (old_fp)
+		bpf_prog_put(old_fp);
+
+	kfree(prog->name);
+}
+
+static int sch_bpf_change(struct Qdisc *sch, struct nlattr *opt,
+			  struct netlink_ext_ack *extack)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_SCH_BPF_MAX + 1];
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested_deprecated(tb, TCA_SCH_BPF_MAX, opt,
+					  sch_bpf_policy, NULL);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+
+	err = bpf_init_prog(tb[TCA_SCH_BPF_ENQUEUE_PROG_FD],
+			    tb[TCA_SCH_BPF_ENQUEUE_PROG_NAME], &q->enqueue_prog);
+	if (err)
+		goto failure;
+	err = bpf_init_prog(tb[TCA_SCH_BPF_DEQUEUE_PROG_FD],
+			    tb[TCA_SCH_BPF_DEQUEUE_PROG_NAME], &q->dequeue_prog);
+failure:
+	sch_tree_unlock(sch);
+	return err;
+}
+
+static int sch_bpf_init(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+	int err;
+
+	qdisc_watchdog_init(&q->watchdog, sch);
+	if (opt) {
+		err = sch_bpf_change(sch, opt, extack);
+		if (err)
+			return err;
+	}
+
+	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+	if (err)
+		return err;
+
+	return qdisc_class_hash_init(&q->clhash);
+}
+
+static void sch_bpf_reset(struct Qdisc *sch)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+	struct sch_bpf_class *cl;
+	unsigned int i;
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+			if (cl->qdisc)
+				qdisc_reset(cl->qdisc);
+		}
+	}
+
+	qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static void sch_bpf_destroy_class(struct Qdisc *sch, struct sch_bpf_class *cl)
+{
+	qdisc_put(cl->qdisc);
+	kfree(cl);
+}
+
+static void sch_bpf_destroy(struct Qdisc *sch)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+	struct sch_bpf_class *cl;
+	unsigned int i;
+
+	qdisc_watchdog_cancel(&q->watchdog);
+	tcf_block_put(q->block);
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+			sch_bpf_destroy_class(sch, cl);
+		}
+	}
+
+	qdisc_class_hash_destroy(&q->clhash);
+
+	sch_tree_lock(sch);
+	bpf_cleanup_prog(&q->enqueue_prog);
+	bpf_cleanup_prog(&q->dequeue_prog);
+	sch_tree_unlock(sch);
+}
+
+static int sch_bpf_change_class(struct Qdisc *sch, u32 classid,
+				u32 parentid, struct nlattr **tca,
+				unsigned long *arg,
+				struct netlink_ext_ack *extack)
+{
+	struct sch_bpf_class *cl = (struct sch_bpf_class *)*arg;
+	struct bpf_sched_data *q = qdisc_priv(sch);
+
+	if (!cl) {
+		if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0 ||
+		    sch_bpf_find(sch, classid))
+			return -EINVAL;
+
+		cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+		if (!cl)
+			return -ENOBUFS;
+
+		cl->common.classid = classid;
+		gnet_stats_basic_sync_init(&cl->bstats);
+		qdisc_class_hash_insert(&q->clhash, &cl->common);
+	}
+
+	qdisc_class_hash_grow(sch, &q->clhash);
+	*arg = (unsigned long)cl;
+	return 0;
+}
+
+static int sch_bpf_delete(struct Qdisc *sch, unsigned long arg,
+			  struct netlink_ext_ack *extack)
+{
+	struct sch_bpf_class *cl = (struct sch_bpf_class *)arg;
+	struct bpf_sched_data *q = qdisc_priv(sch);
+
+	qdisc_class_hash_remove(&q->clhash, &cl->common);
+	if (cl->qdisc)
+		qdisc_put(cl->qdisc);
+	return 0;
+}
+
+static int sch_bpf_dump_class(struct Qdisc *sch, unsigned long arg,
+			      struct sk_buff *skb, struct tcmsg *tcm)
+{
+	return 0;
+}
+
+static int
+sch_bpf_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
+{
+	struct sch_bpf_class *cl = (struct sch_bpf_class *)arg;
+	struct gnet_stats_queue qs = {
+		.drops = cl->drops,
+		.overlimits = cl->overlimits,
+	};
+	__u32 qlen = 0;
+
+	if (cl->qdisc)
+		qdisc_qstats_qlen_backlog(cl->qdisc, &qlen, &qs.backlog);
+	else
+		qlen = 0;
+
+	if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 ||
+	    gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
+		return -1;
+	return 0;
+}
+
+static void sch_bpf_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct bpf_sched_data *q = qdisc_priv(sch);
+	struct sch_bpf_class *cl;
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+			if (arg->count < arg->skip) {
+				arg->count++;
+				continue;
+			}
+			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+static const struct Qdisc_class_ops sch_bpf_class_ops = {
+	.graft		=	sch_bpf_graft,
+	.leaf		=	sch_bpf_leaf,
+	.find		=	sch_bpf_search,
+	.change		=	sch_bpf_change_class,
+	.delete		=	sch_bpf_delete,
+	.tcf_block	=	sch_bpf_tcf_block,
+	.bind_tcf	=	sch_bpf_bind,
+	.unbind_tcf	=	sch_bpf_unbind,
+	.dump		=	sch_bpf_dump_class,
+	.dump_stats	=	sch_bpf_dump_class_stats,
+	.walk		=	sch_bpf_walk,
+};
+
+static struct Qdisc_ops sch_bpf_qdisc_ops __read_mostly = {
+	.cl_ops		=	&sch_bpf_class_ops,
+	.id		=	"bpf",
+	.priv_size	=	sizeof(struct bpf_sched_data),
+	.enqueue	=	sch_bpf_enqueue,
+	.dequeue	=	sch_bpf_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	sch_bpf_init,
+	.reset		=	sch_bpf_reset,
+	.destroy	=	sch_bpf_destroy,
+	.change		=	sch_bpf_change,
+	.dump		=	sch_bpf_dump,
+	.dump_stats	=	sch_bpf_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init sch_bpf_mod_init(void)
+{
+	return register_qdisc(&sch_bpf_qdisc_ops);
+}
+
+static void __exit sch_bpf_mod_exit(void)
+{
+	unregister_qdisc(&sch_bpf_qdisc_ops);
+}
+
+module_init(sch_bpf_mod_init)
+module_exit(sch_bpf_mod_exit)
+MODULE_AUTHOR("Cong Wang");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("eBPF queue discipline");
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0bb92414c036..df280bbb7c0d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -997,6 +997,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_QDISC,
 };
 
 enum bpf_attach_type {
@@ -1056,6 +1057,8 @@  enum bpf_attach_type {
 	BPF_CGROUP_UNIX_GETSOCKNAME,
 	BPF_NETKIT_PRIMARY,
 	BPF_NETKIT_PEER,
+	BPF_QDISC_ENQUEUE,
+	BPF_QDISC_DEQUEUE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -7357,4 +7360,22 @@  struct bpf_iter_num {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
 
+struct bpf_qdisc_ctx {
+	__bpf_md_ptr(struct sk_buff *, skb);
+	__u32 classid;
+	__u64 expire;
+	__u64 delta_ns;
+};
+
+enum {
+	SCH_BPF_QUEUED,
+	SCH_BPF_DEQUEUED = SCH_BPF_QUEUED,
+	SCH_BPF_DROP,
+	SCH_BPF_CN,
+	SCH_BPF_THROTTLE,
+	SCH_BPF_PASS,
+	SCH_BPF_BYPASS,
+	SCH_BPF_STOLEN,
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */