diff mbox series

[RFC,v7,4/8] net_sched: Add reset program

Message ID a45e9b29b616fdfb71cb6920aaecc6d22b1540b4.1705432850.git.amery.hung@bytedance.com (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series net_sched: Introduce eBPF based Qdisc | expand

Checks

Context Check Description
netdev/tree_selection success Guessing tree name failed - patch did not apply, async
bpf/vmtest-bpf-PR fail merge-conflict
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-13 pending Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-42 success Logs for x86_64-llvm-18 / veristat

Commit Message

Amery Hung Jan. 17, 2024, 9:56 p.m. UTC
Allow developers to implement customized reset logic through an optional
reset program. The program also takes bpf_qdisc_ctx as context, but
currently cannot access any field.

To release skbs, the program can release all references to bpf list or
rbtree serving as skb queues. The destructor kfunc bpf_skb_destroy()
will be called by bpf_map_free_deferred(). This prevents the qdisc from
holding the sch_tree_lock for too long when there are many packets in
the qdisc.

Signed-off-by: Amery Hung <amery.hung@bytedance.com>
---
 include/uapi/linux/bpf.h       |  1 +
 include/uapi/linux/pkt_sched.h |  4 ++++
 kernel/bpf/syscall.c           |  1 +
 net/core/filter.c              |  3 +++
 net/sched/sch_bpf.c            | 30 ++++++++++++++++++++++++++----
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 36 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index df280bbb7c0d..84669886a493 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1059,6 +1059,7 @@  enum bpf_attach_type {
 	BPF_NETKIT_PEER,
 	BPF_QDISC_ENQUEUE,
 	BPF_QDISC_DEQUEUE,
+	BPF_QDISC_RESET,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index d05462309f5a..e9e1a83c22f7 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1328,6 +1328,10 @@  enum {
 	TCA_SCH_BPF_DEQUEUE_PROG_FD,	/* u32 */
 	TCA_SCH_BPF_DEQUEUE_PROG_ID,	/* u32 */
 	TCA_SCH_BPF_DEQUEUE_PROG_TAG,	/* data */
+	TCA_SCH_BPF_RESET_PROG_NAME,	/* string */
+	TCA_SCH_BPF_RESET_PROG_FD,	/* u32 */
+	TCA_SCH_BPF_RESET_PROG_ID,	/* u32 */
+	TCA_SCH_BPF_RESET_PROG_TAG,	/* data */
 	__TCA_SCH_BPF_MAX,
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1838bddd8526..9af6fa542f2e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2506,6 +2506,7 @@  bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		switch (expected_attach_type) {
 		case BPF_QDISC_ENQUEUE:
 		case BPF_QDISC_DEQUEUE:
+		case BPF_QDISC_RESET:
 			return 0;
 		default:
 			return -EINVAL;
diff --git a/net/core/filter.c b/net/core/filter.c
index f25a0b6b5d56..f8e17465377f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8905,6 +8905,9 @@  static bool tc_qdisc_is_valid_access(int off, int size,
 {
 	struct btf *btf;
 
+	if (prog->expected_attach_type == BPF_QDISC_RESET)
+		return false;
+
 	if (off < 0 || off >= sizeof(struct bpf_qdisc_ctx))
 		return false;
 
diff --git a/net/sched/sch_bpf.c b/net/sched/sch_bpf.c
index 1910a58a3352..3f0f809dced6 100644
--- a/net/sched/sch_bpf.c
+++ b/net/sched/sch_bpf.c
@@ -42,6 +42,7 @@  struct bpf_sched_data {
 	struct Qdisc_class_hash clhash;
 	struct sch_bpf_prog __rcu enqueue_prog;
 	struct sch_bpf_prog __rcu dequeue_prog;
+	struct sch_bpf_prog __rcu reset_prog;
 
 	struct qdisc_watchdog watchdog;
 };
@@ -51,6 +52,9 @@  static int sch_bpf_dump_prog(const struct sch_bpf_prog *prog, struct sk_buff *sk
 {
 	struct nlattr *nla;
 
+	if (!prog->prog)
+		return 0;
+
 	if (prog->name &&
 	    nla_put_string(skb, name, prog->name))
 		return -EMSGSIZE;
@@ -81,6 +85,9 @@  static int sch_bpf_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (sch_bpf_dump_prog(&q->dequeue_prog, skb, TCA_SCH_BPF_DEQUEUE_PROG_NAME,
 			      TCA_SCH_BPF_DEQUEUE_PROG_ID, TCA_SCH_BPF_DEQUEUE_PROG_TAG))
 		goto nla_put_failure;
+	if (sch_bpf_dump_prog(&q->reset_prog, skb, TCA_SCH_BPF_RESET_PROG_NAME,
+			      TCA_SCH_BPF_RESET_PROG_ID, TCA_SCH_BPF_RESET_PROG_TAG))
+		goto nla_put_failure;
 
 	return nla_nest_end(skb, opts);
 
@@ -259,16 +266,21 @@  static const struct nla_policy sch_bpf_policy[TCA_SCH_BPF_MAX + 1] = {
 	[TCA_SCH_BPF_DEQUEUE_PROG_FD]	= { .type = NLA_U32 },
 	[TCA_SCH_BPF_DEQUEUE_PROG_NAME]	= { .type = NLA_NUL_STRING,
 					    .len = ACT_BPF_NAME_LEN },
+	[TCA_SCH_BPF_RESET_PROG_FD]	= { .type = NLA_U32 },
+	[TCA_SCH_BPF_RESET_PROG_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = ACT_BPF_NAME_LEN },
 };
 
-static int bpf_init_prog(struct nlattr *fd, struct nlattr *name, struct sch_bpf_prog *prog)
+static int bpf_init_prog(struct nlattr *fd, struct nlattr *name,
+			 struct sch_bpf_prog *prog, bool optional)
 {
 	struct bpf_prog *fp, *old_fp;
 	char *prog_name = NULL;
 	u32 bpf_fd;
 
 	if (!fd)
-		return -EINVAL;
+		return optional ? 0 : -EINVAL;
+
 	bpf_fd = nla_get_u32(fd);
 
 	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_QDISC);
@@ -327,11 +339,15 @@  static int sch_bpf_change(struct Qdisc *sch, struct nlattr *opt,
 	sch_tree_lock(sch);
 
 	err = bpf_init_prog(tb[TCA_SCH_BPF_ENQUEUE_PROG_FD],
-			    tb[TCA_SCH_BPF_ENQUEUE_PROG_NAME], &q->enqueue_prog);
+			    tb[TCA_SCH_BPF_ENQUEUE_PROG_NAME], &q->enqueue_prog, false);
 	if (err)
 		goto failure;
 	err = bpf_init_prog(tb[TCA_SCH_BPF_DEQUEUE_PROG_FD],
-			    tb[TCA_SCH_BPF_DEQUEUE_PROG_NAME], &q->dequeue_prog);
+			    tb[TCA_SCH_BPF_DEQUEUE_PROG_NAME], &q->dequeue_prog, false);
+	if (err)
+		goto failure;
+	err = bpf_init_prog(tb[TCA_SCH_BPF_RESET_PROG_FD],
+			    tb[TCA_SCH_BPF_RESET_PROG_NAME], &q->reset_prog, true);
 failure:
 	sch_tree_unlock(sch);
 	return err;
@@ -360,7 +376,9 @@  static int sch_bpf_init(struct Qdisc *sch, struct nlattr *opt,
 static void sch_bpf_reset(struct Qdisc *sch)
 {
 	struct bpf_sched_data *q = qdisc_priv(sch);
+	struct bpf_qdisc_ctx ctx = {};
 	struct sch_bpf_class *cl;
+	struct bpf_prog *reset;
 	unsigned int i;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
@@ -371,6 +389,9 @@  static void sch_bpf_reset(struct Qdisc *sch)
 	}
 
 	qdisc_watchdog_cancel(&q->watchdog);
+	reset = rcu_dereference(q->reset_prog.prog);
+	if (reset)
+		bpf_prog_run(reset, &ctx);
 }
 
 static void sch_bpf_destroy_class(struct Qdisc *sch, struct sch_bpf_class *cl)
@@ -398,6 +419,7 @@  static void sch_bpf_destroy(struct Qdisc *sch)
 	sch_tree_lock(sch);
 	bpf_cleanup_prog(&q->enqueue_prog);
 	bpf_cleanup_prog(&q->dequeue_prog);
+	bpf_cleanup_prog(&q->reset_prog);
 	sch_tree_unlock(sch);
 }
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index df280bbb7c0d..84669886a493 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1059,6 +1059,7 @@  enum bpf_attach_type {
 	BPF_NETKIT_PEER,
 	BPF_QDISC_ENQUEUE,
 	BPF_QDISC_DEQUEUE,
+	BPF_QDISC_RESET,
 	__MAX_BPF_ATTACH_TYPE
 };