Message ID | 20241007184130.3960565-1-edumazet@google.com (mailing list archive) |
---|---|
State | Accepted |
Commit | 3cb7cf1540ddff5473d6baeb530228d19bc97b8a |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [net] net/sched: accept TCA_STAB only for root qdisc | expand |
On Mon, Oct 7, 2024 at 8:41 PM Eric Dumazet <edumazet@google.com> wrote: > > Most qdiscs maintain their backlog using qdisc_pkt_len(skb) > on the assumption it is invariant between the enqueue() > and dequeue() handlers. > > Unfortunately syzbot can crash a host rather easily using > a TBF + SFQ combination, with an STAB on SFQ [1] > > We can't support TCA_STAB on arbitrary level, this would > require to maintain per-qdisc storage. > > [1] > [ 88.796496] BUG: kernel NULL pointer dereference, address: 0000000000000000 > [ 88.798611] #PF: supervisor read access in kernel mode > [ 88.799014] #PF: error_code(0x0000) - not-present page > [ 88.799506] PGD 0 P4D 0 > [ 88.799829] Oops: Oops: 0000 [#1] SMP NOPTI > [ 88.800569] CPU: 14 UID: 0 PID: 2053 Comm: b371744477 Not tainted 6.12.0-rc1-virtme #1117 > [ 88.801107] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 > [ 88.801779] RIP: 0010:sfq_dequeue (net/sched/sch_sfq.c:272 net/sched/sch_sfq.c:499) sch_sfq > [ 88.802544] Code: 0f b7 50 12 48 8d 04 d5 00 00 00 00 48 89 d6 48 29 d0 48 8b 91 c0 01 00 00 48 c1 e0 03 48 01 c2 66 83 7a 1a 00 7e c0 48 8b 3a <4c> 8b 07 4c 89 02 49 89 50 08 48 c7 47 08 00 00 00 00 48 c7 07 00 > All code > ======== > 0: 0f b7 50 12 movzwl 0x12(%rax),%edx > 4: 48 8d 04 d5 00 00 00 lea 0x0(,%rdx,8),%rax > b: 00 > c: 48 89 d6 mov %rdx,%rsi > f: 48 29 d0 sub %rdx,%rax > 12: 48 8b 91 c0 01 00 00 mov 0x1c0(%rcx),%rdx > 19: 48 c1 e0 03 shl $0x3,%rax > 1d: 48 01 c2 add %rax,%rdx > 20: 66 83 7a 1a 00 cmpw $0x0,0x1a(%rdx) > 25: 7e c0 jle 0xffffffffffffffe7 > 27: 48 8b 3a mov (%rdx),%rdi > 2a:* 4c 8b 07 mov (%rdi),%r8 <-- trapping instruction > 2d: 4c 89 02 mov %r8,(%rdx) > 30: 49 89 50 08 mov %rdx,0x8(%r8) > 34: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi) > 3b: 00 > 3c: 48 rex.W > 3d: c7 .byte 0xc7 > 3e: 07 (bad) > ... > > Code starting with the faulting instruction > =========================================== > 0: 4c 8b 07 mov (%rdi),%r8 > 3: 4c 89 02 mov %r8,(%rdx) > 6: 49 89 50 08 mov %rdx,0x8(%r8) > a: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi) > 11: 00 > 12: 48 rex.W > 13: c7 .byte 0xc7 > 14: 07 (bad) > ... > [ 88.803721] RSP: 0018:ffff9a1f892b7d58 EFLAGS: 00000206 > [ 88.804032] RAX: 0000000000000000 RBX: ffff9a1f8420c800 RCX: ffff9a1f8420c800 > [ 88.804560] RDX: ffff9a1f81bc1440 RSI: 0000000000000000 RDI: 0000000000000000 > [ 88.805056] RBP: ffffffffc04bb0e0 R08: 0000000000000001 R09: 00000000ff7f9a1f > [ 88.805473] R10: 000000000001001b R11: 0000000000009a1f R12: 0000000000000140 > [ 88.806194] R13: 0000000000000001 R14: ffff9a1f886df400 R15: ffff9a1f886df4ac > [ 88.806734] FS: 00007f445601a740(0000) GS:ffff9a2e7fd80000(0000) knlGS:0000000000000000 > [ 88.807225] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 88.807672] CR2: 0000000000000000 CR3: 000000050cc46000 CR4: 00000000000006f0 > [ 88.808165] Call Trace: > [ 88.808459] <TASK> > [ 88.808710] ? __die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434) > [ 88.809261] ? page_fault_oops (arch/x86/mm/fault.c:715) > [ 88.809561] ? exc_page_fault (./arch/x86/include/asm/irqflags.h:26 ./arch/x86/include/asm/irqflags.h:87 ./arch/x86/include/asm/irqflags.h:147 arch/x86/mm/fault.c:1489 arch/x86/mm/fault.c:1539) > [ 88.809806] ? asm_exc_page_fault (./arch/x86/include/asm/idtentry.h:623) > [ 88.810074] ? sfq_dequeue (net/sched/sch_sfq.c:272 net/sched/sch_sfq.c:499) sch_sfq > [ 88.810411] sfq_reset (net/sched/sch_sfq.c:525) sch_sfq > [ 88.810671] qdisc_reset (./include/linux/skbuff.h:2135 ./include/linux/skbuff.h:2441 ./include/linux/skbuff.h:3304 ./include/linux/skbuff.h:3310 net/sched/sch_generic.c:1036) > [ 88.810950] tbf_reset (./include/linux/timekeeping.h:169 net/sched/sch_tbf.c:334) sch_tbf > [ 88.811208] qdisc_reset (./include/linux/skbuff.h:2135 ./include/linux/skbuff.h:2441 ./include/linux/skbuff.h:3304 ./include/linux/skbuff.h:3310 net/sched/sch_generic.c:1036) > [ 88.811484] netif_set_real_num_tx_queues (./include/linux/spinlock.h:396 ./include/net/sch_generic.h:768 net/core/dev.c:2958) > [ 88.811870] __tun_detach (drivers/net/tun.c:590 drivers/net/tun.c:673) > [ 88.812271] tun_chr_close (drivers/net/tun.c:702 drivers/net/tun.c:3517) > [ 88.812505] __fput (fs/file_table.c:432 (discriminator 1)) > [ 88.812735] task_work_run (kernel/task_work.c:230) > [ 88.813016] do_exit (kernel/exit.c:940) > [ 88.813372] ? trace_hardirqs_on (kernel/trace/trace_preemptirq.c:58 (discriminator 4)) > [ 88.813639] ? handle_mm_fault (./arch/x86/include/asm/irqflags.h:42 ./arch/x86/include/asm/irqflags.h:97 ./arch/x86/include/asm/irqflags.h:155 ./include/linux/memcontrol.h:1022 ./include/linux/memcontrol.h:1045 ./include/linux/memcontrol.h:1052 mm/memory.c:5928 mm/memory.c:6088) > [ 88.813867] do_group_exit (kernel/exit.c:1070) > [ 88.814138] __x64_sys_exit_group (kernel/exit.c:1099) > [ 88.814490] x64_sys_call (??:?) > [ 88.814791] do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) > [ 88.815012] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) > [ 88.815495] RIP: 0033:0x7f44560f1975 > > Fixes: 175f9c1bba9b ("Jussi Kivilinna <jussi.kivilinna@mbnet.fi>") Copy/paste error, this should be : Fixes: 175f9c1bba9b ("net_sched: Add size table for qdiscs") > Reported-by: syzbot <syzkaller@googlegroups.com> > Signed-off-by: Eric Dumazet <edumazet@google.com> > Cc: Daniel Borkmann <daniel@iogearbox.net> > --- > include/net/sch_generic.h | 1 - > net/sched/sch_api.c | 7 ++++++- > 2 files changed, 6 insertions(+), 2 deletions(-) > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h > index 79edd5b5e3c9139cac0af251f95cc63e173d05f5..5d74fa7e694cc85be91dbf01f0876b9feaa29115 100644 > --- a/include/net/sch_generic.h > +++ b/include/net/sch_generic.h > @@ -848,7 +848,6 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, > static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, > struct sk_buff **to_free) > { > - qdisc_calculate_pkt_len(skb, sch); > return sch->enqueue(skb, sch, to_free); > } > > diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c > index 74afc210527d237cca3b48166be5918f802eb326..2eefa4783879971c557ca3d98b74ac1218ea2bd1 100644 > --- a/net/sched/sch_api.c > +++ b/net/sched/sch_api.c > @@ -593,7 +593,6 @@ void __qdisc_calculate_pkt_len(struct sk_buff *skb, > pkt_len = 1; > qdisc_skb_cb(skb)->pkt_len = pkt_len; > } > -EXPORT_SYMBOL(__qdisc_calculate_pkt_len); > > void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) > { > @@ -1201,6 +1200,12 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, > return -EINVAL; > } > > + if (new && > + !(parent->flags & TCQ_F_MQROOT) && > + rcu_access_pointer(new->stab)) { > + NL_SET_ERR_MSG(extack, "STAB not supported on a non root"); > + return -EINVAL; > + } > err = cops->graft(parent, cl, new, &old, extack); > if (err) > return err; > -- > 2.47.0.rc0.187.ge670bccf7e-goog >
Hello: This patch was applied to netdev/net.git (main) by Jakub Kicinski <kuba@kernel.org>: On Mon, 7 Oct 2024 18:41:30 +0000 you wrote: > Most qdiscs maintain their backlog using qdisc_pkt_len(skb) > on the assumption it is invariant between the enqueue() > and dequeue() handlers. > > Unfortunately syzbot can crash a host rather easily using > a TBF + SFQ combination, with an STAB on SFQ [1] > > [...] Here is the summary with links: - [net] net/sched: accept TCA_STAB only for root qdisc https://git.kernel.org/netdev/net/c/3cb7cf1540dd You are awesome, thank you!
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 79edd5b5e3c9139cac0af251f95cc63e173d05f5..5d74fa7e694cc85be91dbf01f0876b9feaa29115 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -848,7 +848,6 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - qdisc_calculate_pkt_len(skb, sch); return sch->enqueue(skb, sch, to_free); } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 74afc210527d237cca3b48166be5918f802eb326..2eefa4783879971c557ca3d98b74ac1218ea2bd1 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -593,7 +593,6 @@ void __qdisc_calculate_pkt_len(struct sk_buff *skb, pkt_len = 1; qdisc_skb_cb(skb)->pkt_len = pkt_len; } -EXPORT_SYMBOL(__qdisc_calculate_pkt_len); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) { @@ -1201,6 +1200,12 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, return -EINVAL; } + if (new && + !(parent->flags & TCQ_F_MQROOT) && + rcu_access_pointer(new->stab)) { + NL_SET_ERR_MSG(extack, "STAB not supported on a non root"); + return -EINVAL; + } err = cops->graft(parent, cl, new, &old, extack); if (err) return err;
Most qdiscs maintain their backlog using qdisc_pkt_len(skb) on the assumption it is invariant between the enqueue() and dequeue() handlers. Unfortunately syzbot can crash a host rather easily using a TBF + SFQ combination, with an STAB on SFQ [1] We can't support TCA_STAB on arbitrary level, this would require to maintain per-qdisc storage. [1] [ 88.796496] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 88.798611] #PF: supervisor read access in kernel mode [ 88.799014] #PF: error_code(0x0000) - not-present page [ 88.799506] PGD 0 P4D 0 [ 88.799829] Oops: Oops: 0000 [#1] SMP NOPTI [ 88.800569] CPU: 14 UID: 0 PID: 2053 Comm: b371744477 Not tainted 6.12.0-rc1-virtme #1117 [ 88.801107] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 88.801779] RIP: 0010:sfq_dequeue (net/sched/sch_sfq.c:272 net/sched/sch_sfq.c:499) sch_sfq [ 88.802544] Code: 0f b7 50 12 48 8d 04 d5 00 00 00 00 48 89 d6 48 29 d0 48 8b 91 c0 01 00 00 48 c1 e0 03 48 01 c2 66 83 7a 1a 00 7e c0 48 8b 3a <4c> 8b 07 4c 89 02 49 89 50 08 48 c7 47 08 00 00 00 00 48 c7 07 00 All code ======== 0: 0f b7 50 12 movzwl 0x12(%rax),%edx 4: 48 8d 04 d5 00 00 00 lea 0x0(,%rdx,8),%rax b: 00 c: 48 89 d6 mov %rdx,%rsi f: 48 29 d0 sub %rdx,%rax 12: 48 8b 91 c0 01 00 00 mov 0x1c0(%rcx),%rdx 19: 48 c1 e0 03 shl $0x3,%rax 1d: 48 01 c2 add %rax,%rdx 20: 66 83 7a 1a 00 cmpw $0x0,0x1a(%rdx) 25: 7e c0 jle 0xffffffffffffffe7 27: 48 8b 3a mov (%rdx),%rdi 2a:* 4c 8b 07 mov (%rdi),%r8 <-- trapping instruction 2d: 4c 89 02 mov %r8,(%rdx) 30: 49 89 50 08 mov %rdx,0x8(%r8) 34: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi) 3b: 00 3c: 48 rex.W 3d: c7 .byte 0xc7 3e: 07 (bad) ... Code starting with the faulting instruction =========================================== 0: 4c 8b 07 mov (%rdi),%r8 3: 4c 89 02 mov %r8,(%rdx) 6: 49 89 50 08 mov %rdx,0x8(%r8) a: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi) 11: 00 12: 48 rex.W 13: c7 .byte 0xc7 14: 07 (bad) ... [ 88.803721] RSP: 0018:ffff9a1f892b7d58 EFLAGS: 00000206 [ 88.804032] RAX: 0000000000000000 RBX: ffff9a1f8420c800 RCX: ffff9a1f8420c800 [ 88.804560] RDX: ffff9a1f81bc1440 RSI: 0000000000000000 RDI: 0000000000000000 [ 88.805056] RBP: ffffffffc04bb0e0 R08: 0000000000000001 R09: 00000000ff7f9a1f [ 88.805473] R10: 000000000001001b R11: 0000000000009a1f R12: 0000000000000140 [ 88.806194] R13: 0000000000000001 R14: ffff9a1f886df400 R15: ffff9a1f886df4ac [ 88.806734] FS: 00007f445601a740(0000) GS:ffff9a2e7fd80000(0000) knlGS:0000000000000000 [ 88.807225] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 88.807672] CR2: 0000000000000000 CR3: 000000050cc46000 CR4: 00000000000006f0 [ 88.808165] Call Trace: [ 88.808459] <TASK> [ 88.808710] ? __die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434) [ 88.809261] ? page_fault_oops (arch/x86/mm/fault.c:715) [ 88.809561] ? exc_page_fault (./arch/x86/include/asm/irqflags.h:26 ./arch/x86/include/asm/irqflags.h:87 ./arch/x86/include/asm/irqflags.h:147 arch/x86/mm/fault.c:1489 arch/x86/mm/fault.c:1539) [ 88.809806] ? asm_exc_page_fault (./arch/x86/include/asm/idtentry.h:623) [ 88.810074] ? sfq_dequeue (net/sched/sch_sfq.c:272 net/sched/sch_sfq.c:499) sch_sfq [ 88.810411] sfq_reset (net/sched/sch_sfq.c:525) sch_sfq [ 88.810671] qdisc_reset (./include/linux/skbuff.h:2135 ./include/linux/skbuff.h:2441 ./include/linux/skbuff.h:3304 ./include/linux/skbuff.h:3310 net/sched/sch_generic.c:1036) [ 88.810950] tbf_reset (./include/linux/timekeeping.h:169 net/sched/sch_tbf.c:334) sch_tbf [ 88.811208] qdisc_reset (./include/linux/skbuff.h:2135 ./include/linux/skbuff.h:2441 ./include/linux/skbuff.h:3304 ./include/linux/skbuff.h:3310 net/sched/sch_generic.c:1036) [ 88.811484] netif_set_real_num_tx_queues (./include/linux/spinlock.h:396 ./include/net/sch_generic.h:768 net/core/dev.c:2958) [ 88.811870] __tun_detach (drivers/net/tun.c:590 drivers/net/tun.c:673) [ 88.812271] tun_chr_close (drivers/net/tun.c:702 drivers/net/tun.c:3517) [ 88.812505] __fput (fs/file_table.c:432 (discriminator 1)) [ 88.812735] task_work_run (kernel/task_work.c:230) [ 88.813016] do_exit (kernel/exit.c:940) [ 88.813372] ? trace_hardirqs_on (kernel/trace/trace_preemptirq.c:58 (discriminator 4)) [ 88.813639] ? handle_mm_fault (./arch/x86/include/asm/irqflags.h:42 ./arch/x86/include/asm/irqflags.h:97 ./arch/x86/include/asm/irqflags.h:155 ./include/linux/memcontrol.h:1022 ./include/linux/memcontrol.h:1045 ./include/linux/memcontrol.h:1052 mm/memory.c:5928 mm/memory.c:6088) [ 88.813867] do_group_exit (kernel/exit.c:1070) [ 88.814138] __x64_sys_exit_group (kernel/exit.c:1099) [ 88.814490] x64_sys_call (??:?) [ 88.814791] do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) [ 88.815012] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) [ 88.815495] RIP: 0033:0x7f44560f1975 Fixes: 175f9c1bba9b ("Jussi Kivilinna <jussi.kivilinna@mbnet.fi>") Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Daniel Borkmann <daniel@iogearbox.net> --- include/net/sch_generic.h | 1 - net/sched/sch_api.c | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-)