Message ID | 20211105232330.1936330-2-songliubraving@fb.com |
---|---|
State | Accepted |
Delegated to: | BPF |
Series | introduce bpf_find_vma |
On 11/5/21 4:23 PM, Song Liu wrote:
> In some profiler use cases, it is necessary to map an address to the
> backing file, e.g., a shared library. The bpf_find_vma helper provides a
> flexible way to achieve this. bpf_find_vma maps an address of a task to
> the vma (vm_area_struct) for this address, and feeds the vma to a
> callback BPF function. The callback function is necessary here, as we
> need to ensure mmap_sem is unlocked.
>
> It is necessary to lock mmap_sem for find_vma. To lock and unlock
> mmap_sem safely when irqs are disabled, we use the same mechanism as
> stackmap with build_id. Specifically, when irqs are disabled, the unlock
> is postponed to an irq_work. Refactor stackmap.c so that the irq_work is
> shared among bpf_find_vma and stackmap helpers.
>
> Acked-by: Yonghong Song <yhs@fb.com>
> Tested-by: Hengqi Chen <hengqi.chen@gmail.com>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---

...

> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index dbc3ad07e21b6..cdb0fba656006 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -6342,7 +6342,10 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
>  	.arg4_type	= ARG_ANYTHING,
>  };
>
> -BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
> +BTF_ID_LIST_GLOBAL(btf_task_struct_ids)
> +BTF_ID(struct, task_struct)
> +BTF_ID(struct, file)
> +BTF_ID(struct, vm_area_struct)

$ nm -v vmlinux |grep -A3 btf_task_struct_ids
ffffffff82adfd9c R btf_task_struct_ids
ffffffff82adfda0 r __BTF_ID__struct__file__715
ffffffff82adfda4 r __BTF_ID__struct__vm_area_struct__716
ffffffff82adfda8 r bpf_skb_output_btf_ids

KASAN thinks btf_task_struct_ids has 4 bytes only.

BUG: KASAN: global-out-of-bounds in task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661
Read of size 4 at addr ffffffff90297404 by task swapper/0/1

CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.15.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
 print_address_description.constprop.0.cold+0xf/0x309 mm/kasan/report.c:256
 __kasan_report mm/kasan/report.c:442 [inline]
 kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
 task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661
 do_one_initcall+0x103/0x650 init/main.c:1295
 do_initcall_level init/main.c:1368 [inline]
 do_initcalls init/main.c:1384 [inline]
 do_basic_setup init/main.c:1403 [inline]
 kernel_init_freeable+0x6b1/0x73a init/main.c:1606
 kernel_init+0x1a/0x1d0 init/main.c:1497
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
 </TASK>

The buggy address belongs to the variable:
 btf_task_struct_ids+0x4/0x40
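For reference, the flagged read is in task_iter_init(), which fills in the
iterator ctx_arg_info entries from the new global list (condensed from the
task_iter.c hunk in the full patch at the bottom of this page):

	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[1];
	...
	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[2];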
> On Nov 8, 2021, at 10:36 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> On 11/5/21 4:23 PM, Song Liu wrote:
>> [...]
>
>> -BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
>> +BTF_ID_LIST_GLOBAL(btf_task_struct_ids)
>> +BTF_ID(struct, task_struct)
>> +BTF_ID(struct, file)
>> +BTF_ID(struct, vm_area_struct)
>
> $ nm -v vmlinux |grep -A3 btf_task_struct_ids
> ffffffff82adfd9c R btf_task_struct_ids
> ffffffff82adfda0 r __BTF_ID__struct__file__715
> ffffffff82adfda4 r __BTF_ID__struct__vm_area_struct__716
> ffffffff82adfda8 r bpf_skb_output_btf_ids
>
> KASAN thinks btf_task_struct_ids has 4 bytes only.

I have KASAN enabled, but couldn't repro this issue. I think
btf_task_struct_ids looks correct:

nm -v vmlinux | grep -A3 -B1 btf_task_struct_ids
ffffffff83cf8260 r __BTF_ID__struct__task_struct__1026
ffffffff83cf8260 R btf_task_struct_ids
ffffffff83cf8264 r __BTF_ID__struct__file__1027
ffffffff83cf8268 r __BTF_ID__struct__vm_area_struct__1028
ffffffff83cf826c r bpf_skb_output_btf_ids

Did I miss something?

Thanks,
Song
On 11/8/21 1:59 PM, Song Liu wrote:
> [...]
>
> I have KASAN enabled, but couldn't repro this issue. I think
> btf_task_struct_ids looks correct:
>
> nm -v vmlinux | grep -A3 -B1 btf_task_struct_ids
> ffffffff83cf8260 r __BTF_ID__struct__task_struct__1026
> ffffffff83cf8260 R btf_task_struct_ids
> ffffffff83cf8264 r __BTF_ID__struct__file__1027
> ffffffff83cf8268 r __BTF_ID__struct__vm_area_struct__1028
> ffffffff83cf826c r bpf_skb_output_btf_ids
>
> Did I miss something?

I will release the syzbot bug, so that you can use its .config

Basically, we have

u32 btf_task_struct_ids[1];

xxxx = btf_task_struct_ids[2]; /* trap */
On 11/8/21 2:27 PM, Eric Dumazet wrote:
> [...]
>
> I will release the syzbot bug, so that you can use its .config
>
> Basically, we have
>
> u32 btf_task_struct_ids[1];

That is, if

# CONFIG_DEBUG_INFO_BTF is not set

> xxxx = btf_task_struct_ids[2]; /* trap */
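The stub definitions in include/linux/btf_ids.h explain the 4-byte object:
when CONFIG_DEBUG_INFO_BTF is not set, BTF_ID_LIST_GLOBAL() declares a
one-element array and BTF_ID() expands to nothing (condensed from the
existing stubs; see the btf_ids.h diff later in the thread for the exact
lines):

#define BTF_ID_LIST_GLOBAL(name) u32 name[1];
#define BTF_ID(prefix, name)

BTF_ID_LIST_GLOBAL(btf_task_struct_ids) /* -> u32 btf_task_struct_ids[1]; */
BTF_ID(struct, task_struct)             /* -> nothing */
BTF_ID(struct, file)                    /* -> nothing */
BTF_ID(struct, vm_area_struct)          /* -> nothing */

so reading btf_task_struct_ids[1] or [2] goes past the end of the array.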
On 11/8/21 2:43 PM, Eric Dumazet wrote:
> [...]
>
>> Basically, we have
>>
>> u32 btf_task_struct_ids[1];
>
> That is, if
>
> # CONFIG_DEBUG_INFO_BTF is not set

This is how btf_sock_ids gets defined:

#ifdef CONFIG_DEBUG_INFO_BTF
BTF_ID_LIST_GLOBAL(btf_sock_ids)
#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
BTF_SOCK_TYPE_xxx
#undef BTF_SOCK_TYPE
#else
u32 btf_sock_ids[MAX_BTF_SOCK_TYPE];
#endif

Perhaps do the same for btf_task_struct_ids?

Thanks.
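Applied to btf_task_struct_ids, that suggestion would look roughly like the
sketch below (illustrative only, not a hunk posted in this thread; the stub
array size has to match the number of BTF_ID() entries by hand):

#ifdef CONFIG_DEBUG_INFO_BTF
BTF_ID_LIST_GLOBAL(btf_task_struct_ids)
BTF_ID(struct, task_struct)
BTF_ID(struct, file)
BTF_ID(struct, vm_area_struct)
#else
u32 btf_task_struct_ids[3];
#endif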
> On Nov 8, 2021, at 2:56 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> [...]
>
> This is how btf_sock_ids gets defined:
>
> #ifdef CONFIG_DEBUG_INFO_BTF
> BTF_ID_LIST_GLOBAL(btf_sock_ids)
> #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
> BTF_SOCK_TYPE_xxx
> #undef BTF_SOCK_TYPE
> #else
> u32 btf_sock_ids[MAX_BTF_SOCK_TYPE];
> #endif
>
> Perhaps do the same for btf_task_struct_ids?

Yeah, I was testing something below, but this one looks better. Shall I
include the syzbot link for the fix?
Thanks,
Song

diff --git i/include/linux/btf_ids.h w/include/linux/btf_ids.h
index 47d9abfbdb556..4153264c1236b 100644
--- i/include/linux/btf_ids.h
+++ w/include/linux/btf_ids.h
@@ -149,7 +149,7 @@ extern struct btf_id_set name;
 #define BTF_ID_LIST(name) static u32 name[5];
 #define BTF_ID(prefix, name)
 #define BTF_ID_UNUSED
-#define BTF_ID_LIST_GLOBAL(name) u32 name[1];
+#define BTF_ID_LIST_GLOBAL(name) u32 name[3];
 #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1];
 #define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1];
 #define BTF_SET_START(name) static struct btf_id_set name = { 0 };
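Both variants hardcode a size (1 or 3) far from the BTF_ID() entries it has
to match, much like the existing BTF_ID_LIST() stub hardcodes 5. One
hypothetical way to keep the stub and the list in sync is an explicit count
parameter, at the cost of updating the CONFIG_DEBUG_INFO_BTF definition and
every caller as well (a sketch only, not a patch from this thread):

#define BTF_ID_LIST_GLOBAL(name, n) u32 name[n];

/* each use site then states its own size */
BTF_ID_LIST_GLOBAL(btf_task_struct_ids, 3)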
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2be6dfd68df99..df3410bff4b06 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2157,6 +2157,7 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
+extern const struct bpf_func_proto bpf_find_vma_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ba5af15e25f5c..509eee5f0393d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4938,6 +4938,25 @@ union bpf_attr {
  *		**-ENOENT** if symbol is not found.
  *
  *		**-EPERM** if caller does not have permission to obtain kernel address.
+ *
+ * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		Find vma of *task* that contains *addr*, call *callback_fn*
+ *		function with *task*, *vma*, and *callback_ctx*.
+ *		The *callback_fn* should be a static function and
+ *		the *callback_ctx* should be a pointer to the stack.
+ *		The *flags* is used to control certain aspects of the helper.
+ *		Currently, the *flags* must be 0.
+ *
+ *		The expected callback signature is
+ *
+ *		long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx);
+ *
+ *	Return
+ *		0 on success.
+ *		**-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
+ *		**-EBUSY** if failed to try lock mmap_lock.
+ *		**-EINVAL** for invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5120,6 +5139,7 @@ union bpf_attr {
 	FN(trace_vprintk),		\
 	FN(skc_to_unix_sock),		\
 	FN(kallsyms_lookup_name),	\
+	FN(find_vma),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index dbc3ad07e21b6..cdb0fba656006 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6342,7 +6342,10 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
+BTF_ID_LIST_GLOBAL(btf_task_struct_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(struct, file)
+BTF_ID(struct, vm_area_struct)
 
 /* BTF ID set registration API for modules */
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
new file mode 100644
index 0000000000000..5d18d7d85bef9
--- /dev/null
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021 Facebook
+ */
+
+#ifndef __MMAP_UNLOCK_WORK_H__
+#define __MMAP_UNLOCK_WORK_H__
+#include <linux/irq_work.h>
+
+/* irq_work to run mmap_read_unlock() in irq_work */
+struct mmap_unlock_irq_work {
+	struct irq_work irq_work;
+	struct mm_struct *mm;
+};
+
+DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+/*
+ * We cannot do mmap_read_unlock() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To look up vma when the irqs are
+ * disabled, we need to run mmap_read_unlock() in irq_work. We use a
+ * percpu variable to do the irq_work. If the irq_work is already used
+ * by another lookup, we fall over.
+ */
+static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr)
+{
+	struct mmap_unlock_irq_work *work = NULL;
+	bool irq_work_busy = false;
+
+	if (irqs_disabled()) {
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+			work = this_cpu_ptr(&mmap_unlock_work);
+			if (irq_work_is_busy(&work->irq_work)) {
+				/* cannot queue more up_read, fallback */
+				irq_work_busy = true;
+			}
+		} else {
+			/*
+			 * PREEMPT_RT does not allow to trylock mmap sem in
+			 * interrupt disabled context. Force the fallback code.
+			 */
+			irq_work_busy = true;
+		}
+	}
+
+	*work_ptr = work;
+	return irq_work_busy;
+}
+
+static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm)
+{
+	if (!work) {
+		mmap_read_unlock(mm);
+	} else {
+		work->mm = mm;
+
+		/* The lock will be released once we're out of interrupt
+		 * context. Tell lockdep that we've released it now so
+		 * it doesn't complain that we forgot to release it.
+		 */
+		rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
+		irq_work_queue(&work->irq_work);
+	}
+}
+
+#endif /* __MMAP_UNLOCK_WORK_H__ */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6e75bbee39f0b..1de0a1b03636e 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,10 +7,10 @@
 #include <linux/kernel.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
-#include <linux/irq_work.h>
 #include <linux/btf_ids.h>
 #include <linux/buildid.h>
 #include "percpu_freelist.h"
+#include "mmap_unlock_work.h"
 
 #define STACK_CREATE_FLAG_MASK					\
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
@@ -31,25 +31,6 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
-/* irq_work to run up_read() for build_id lookup in nmi context */
-struct stack_map_irq_work {
-	struct irq_work irq_work;
-	struct mm_struct *mm;
-};
-
-static void do_up_read(struct irq_work *entry)
-{
-	struct stack_map_irq_work *work;
-
-	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
-		return;
-
-	work = container_of(entry, struct stack_map_irq_work, irq_work);
-	mmap_read_unlock_non_owner(work->mm);
-}
-
-static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
-
 static inline bool stack_map_use_build_id(struct bpf_map *map)
 {
 	return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -149,35 +130,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
+	struct mmap_unlock_irq_work *work = NULL;
+	bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
 	struct vm_area_struct *vma;
-	bool irq_work_busy = false;
-	struct stack_map_irq_work *work = NULL;
-
-	if (irqs_disabled()) {
-		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
-			work = this_cpu_ptr(&up_read_work);
-			if (irq_work_is_busy(&work->irq_work)) {
-				/* cannot queue more up_read, fallback */
-				irq_work_busy = true;
-			}
-		} else {
-			/*
-			 * PREEMPT_RT does not allow to trylock mmap sem in
-			 * interrupt disabled context. Force the fallback code.
-			 */
-			irq_work_busy = true;
-		}
-	}
 
-	/*
-	 * We cannot do up_read() when the irq is disabled, because of
-	 * risk to deadlock with rq_lock. To do build_id lookup when the
-	 * irqs are disabled, we need to run up_read() in irq_work. We use
-	 * a percpu variable to do the irq_work. If the irq_work is
-	 * already used by another lookup, we fall back to report ips.
-	 *
-	 * Same fallback is used for kernel stack (!user) on a stackmap
-	 * with build_id.
+	/* If the irq_work is in use, fall back to report ips. Same
+	 * fallback is used for kernel stack (!user) on a stackmap with
+	 * build_id.
 	 */
 	if (!user || !current || !current->mm || irq_work_busy ||
 	    !mmap_read_trylock(current->mm)) {
@@ -203,19 +162,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 			- vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
-
-	if (!work) {
-		mmap_read_unlock(current->mm);
-	} else {
-		work->mm = current->mm;
-
-		/* The lock will be released once we're out of interrupt
-		 * context. Tell lockdep that we've released it now so
-		 * it doesn't complain that we forgot to release it.
-		 */
-		rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
-		irq_work_queue(&work->irq_work);
-	}
+	bpf_mmap_unlock_mm(work, current->mm);
 }
 
 static struct perf_callchain_entry *
@@ -719,16 +666,3 @@ const struct bpf_map_ops stack_trace_map_ops = {
 	.map_btf_name = "bpf_stack_map",
 	.map_btf_id = &stack_trace_map_btf_id,
 };
-
-static int __init stack_map_init(void)
-{
-	int cpu;
-	struct stack_map_irq_work *work;
-
-	for_each_possible_cpu(cpu) {
-		work = per_cpu_ptr(&up_read_work, cpu);
-		init_irq_work(&work->irq_work, do_up_read);
-	}
-	return 0;
-}
-subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b48750bfba5aa..f171479f7dd6b 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -8,6 +8,7 @@
 #include <linux/fdtable.h>
 #include <linux/filter.h>
 #include <linux/btf_ids.h>
+#include "mmap_unlock_work.h"
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
@@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
 	.show	= task_vma_seq_show,
 };
 
-BTF_ID_LIST(btf_task_file_ids)
-BTF_ID(struct, file)
-BTF_ID(struct, vm_area_struct)
-
 static const struct bpf_iter_seq_info task_seq_info = {
 	.seq_ops		= &task_seq_ops,
 	.init_seq_private	= init_seq_pidns,
@@ -586,9 +583,74 @@ static struct bpf_iter_reg task_vma_reg_info = {
 	.seq_info		= &task_vma_seq_info,
 };
 
+BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+	   bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
+{
+	struct mmap_unlock_irq_work *work = NULL;
+	struct vm_area_struct *vma;
+	bool irq_work_busy = false;
+	struct mm_struct *mm;
+	int ret = -ENOENT;
+
+	if (flags)
+		return -EINVAL;
+
+	if (!task)
+		return -ENOENT;
+
+	mm = task->mm;
+	if (!mm)
+		return -ENOENT;
+
+	irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+
+	if (irq_work_busy || !mmap_read_trylock(mm))
+		return -EBUSY;
+
+	vma = find_vma(mm, start);
+
+	if (vma && vma->vm_start <= start && vma->vm_end > start) {
+		callback_fn((u64)(long)task, (u64)(long)vma,
+			    (u64)(long)callback_ctx, 0, 0);
+		ret = 0;
+	}
+	bpf_mmap_unlock_mm(work, mm);
+	return ret;
+}
+
+const struct bpf_func_proto bpf_find_vma_proto = {
+	.func		= bpf_find_vma,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg1_btf_id	= &btf_task_struct_ids[0],
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_FUNC,
+	.arg4_type	= ARG_PTR_TO_STACK_OR_NULL,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+static void do_mmap_read_unlock(struct irq_work *entry)
+{
+	struct mmap_unlock_irq_work *work;
+
+	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+		return;
+
+	work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
+	mmap_read_unlock_non_owner(work->mm);
+}
+
 static int __init task_iter_init(void)
 {
-	int ret;
+	struct mmap_unlock_irq_work *work;
+	int ret, cpu;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&mmap_unlock_work, cpu);
+		init_irq_work(&work->irq_work, do_mmap_read_unlock);
+	}
 
 	task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
 	ret = bpf_iter_reg_target(&task_reg_info);
@@ -596,13 +658,13 @@ static int __init task_iter_init(void)
 		return ret;
 
 	task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
-	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
+	task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[1];
 	ret = bpf_iter_reg_target(&task_file_reg_info);
 	if (ret)
 		return ret;
 
 	task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
-	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
+	task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[2];
 	return bpf_iter_reg_target(&task_vma_reg_info);
 }
 late_initcall(task_iter_init);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f0dca726ebfde..1aafb43f61d1c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6132,6 +6132,33 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_find_vma_callback_state(struct bpf_verifier_env *env,
+				       struct bpf_func_state *caller,
+				       struct bpf_func_state *callee,
+				       int insn_idx)
+{
+	/* bpf_find_vma(struct task_struct *task, u64 addr,
+	 *               void *callback_fn, void *callback_ctx, u64 flags)
+	 * (callback_fn)(struct task_struct *task,
+	 *               struct vm_area_struct *vma, void *callback_ctx);
+	 */
+	callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+	callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+	callee->regs[BPF_REG_2].btf = btf_vmlinux;
+	callee->regs[BPF_REG_2].btf_id = btf_task_struct_ids[2];
+
+	/* pointer to stack or null */
+	callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -6489,6 +6516,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EINVAL;
 	}
 
+	if (func_id == BPF_FUNC_find_vma) {
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_find_vma_callback_state);
+		if (err < 0)
+			return -EINVAL;
+	}
+
 	if (func_id == BPF_FUNC_snprintf) {
 		err = check_bpf_snprintf_call(env, regs);
 		if (err < 0)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7396488793ff7..390176a3031ab 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1208,6 +1208,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_func_ip_proto_tracing;
 	case BPF_FUNC_get_branch_snapshot:
 		return &bpf_get_branch_snapshot_proto;
+	case BPF_FUNC_find_vma:
+		return &bpf_find_vma_proto;
 	case BPF_FUNC_trace_vprintk:
 		return bpf_get_trace_vprintk_proto();
 	default:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ba5af15e25f5c..509eee5f0393d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4938,6 +4938,25 @@ union bpf_attr {
  *		**-ENOENT** if symbol is not found.
  *
  *		**-EPERM** if caller does not have permission to obtain kernel address.
+ *
+ * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags)
+ *	Description
+ *		Find vma of *task* that contains *addr*, call *callback_fn*
+ *		function with *task*, *vma*, and *callback_ctx*.
+ *		The *callback_fn* should be a static function and
+ *		the *callback_ctx* should be a pointer to the stack.
+ *		The *flags* is used to control certain aspects of the helper.
+ *		Currently, the *flags* must be 0.
+ *
+ *		The expected callback signature is
+ *
+ *		long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx);
+ *
+ *	Return
+ *		0 on success.
+ *		**-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
+ *		**-EBUSY** if failed to try lock mmap_lock.
+ *		**-EINVAL** for invalid **flags**.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5120,6 +5139,7 @@ union bpf_attr {
 	FN(trace_vprintk),		\
 	FN(skc_to_unix_sock),		\
 	FN(kallsyms_lookup_name),	\
+	FN(find_vma),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
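For context, a minimal sketch of how a tracing program might call the new
helper, based only on the signature documented above. The section, variable
and function names are illustrative, and a vmlinux.h plus bpf_helpers.h new
enough to carry the bpf_find_vma definition are assumed:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct callback_ctx {
	__u64 vm_start;
	__u64 vm_end;
};

/* must be static, per the helper documentation above */
static long find_vma_callback(struct task_struct *task,
			      struct vm_area_struct *vma,
			      struct callback_ctx *data)
{
	/* runs while mmap_lock is read-held; just copy out the bounds */
	data->vm_start = vma->vm_start;
	data->vm_end = vma->vm_end;
	return 0;
}

/* address to look up, set from user space before the program runs */
const volatile __u64 target_addr;

SEC("perf_event")
int find_vma_on_sample(void *ctx)
{
	struct task_struct *task = bpf_get_current_task_btf();
	struct callback_ctx data = {};	/* callback_ctx must be on the stack */
	long ret;

	/* flags must be 0; -EBUSY means mmap_lock could not be trylocked */
	ret = bpf_find_vma(task, target_addr, find_vma_callback, &data, 0);
	if (ret == 0)
		bpf_printk("vma [%llx, %llx)", data.vm_start, data.vm_end);
	return 0;
}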