| Message ID | 20230113031921.64716-4-bhe@redhat.com (mailing list archive) |
|---|---|
| State | New |
| Series | mm/vmalloc.c: allow vread() to read out vm_map_ram areas |
Hi Baoquan,

url:    https://github.com/intel-lab-lkp/linux/commits/Baoquan-He/mm-vmalloc-c-add-used_map-into-vmap_block-to-track-space-of-vmap_block/20230113-112149
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20230113031921.64716-4-bhe%40redhat.com
patch subject: [PATCH v3 3/7] mm/vmalloc.c: allow vread() to read out vm_map_ram areas
config: i386-randconfig-m021
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <error27@gmail.com>

smatch warnings:
mm/vmalloc.c:3682 vread() error: we previously assumed 'vm' could be null (see line 3664)

vim +/vm +3682 mm/vmalloc.c

^1da177e4c3f415 Linus Torvalds 2005-04-16  3630  long vread(char *buf, char *addr, unsigned long count)
^1da177e4c3f415 Linus Torvalds 2005-04-16  3631  {
e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3632      struct vmap_area *va;
e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3633      struct vm_struct *vm;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3634      char *vaddr, *buf_start = buf;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3635      unsigned long buflen = count;
129dbdf298d7383 Baoquan He 2023-01-13  3636      unsigned long n, size, flags;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3637  
4aff1dc4fb3a5a3 Andrey Konovalov 2022-03-24  3638      addr = kasan_reset_tag(addr);
4aff1dc4fb3a5a3 Andrey Konovalov 2022-03-24  3639  
^1da177e4c3f415 Linus Torvalds 2005-04-16  3640      /* Don't allow overflow */
^1da177e4c3f415 Linus Torvalds 2005-04-16  3641      if ((unsigned long) addr + count < count)
^1da177e4c3f415 Linus Torvalds 2005-04-16  3642          count = -(unsigned long) addr;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3643  
e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3644      spin_lock(&vmap_area_lock);
f181234a5a21fd0 Chen Wandun 2021-09-02  3645      va = find_vmap_area_exceed_addr((unsigned long)addr);
f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3646      if (!va)
f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3647          goto finished;
f181234a5a21fd0 Chen Wandun 2021-09-02  3648  
f181234a5a21fd0 Chen Wandun 2021-09-02  3649      /* no intersects with alive vmap_area */
f181234a5a21fd0 Chen Wandun 2021-09-02  3650      if ((unsigned long)addr + count <= va->va_start)
f181234a5a21fd0 Chen Wandun 2021-09-02  3651          goto finished;
f181234a5a21fd0 Chen Wandun 2021-09-02  3652  
f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3653      list_for_each_entry_from(va, &vmap_area_list, list) {
e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3654          if (!count)
e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3655              break;
e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3656  
129dbdf298d7383 Baoquan He 2023-01-13  3657          vm = va->vm;
129dbdf298d7383 Baoquan He 2023-01-13  3658          flags = va->flags & VMAP_FLAGS_MASK;
129dbdf298d7383 Baoquan He 2023-01-13  3659  
129dbdf298d7383 Baoquan He 2023-01-13  3660          if (!vm && !flags)
                                                         ^^^
vm can be NULL if a flag in VMAP_FLAGS_MASK is set.

e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3661              continue;
e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3662  
129dbdf298d7383 Baoquan He 2023-01-13  3663          vaddr = (char *) va->va_start;
129dbdf298d7383 Baoquan He 2023-01-13 @3664          size = vm ? get_vm_area_size(vm) : va_size(va);
                                                            ^^

129dbdf298d7383 Baoquan He 2023-01-13  3665  
129dbdf298d7383 Baoquan He 2023-01-13  3666          if (addr >= vaddr + size)
^1da177e4c3f415 Linus Torvalds 2005-04-16  3667              continue;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3668          while (addr < vaddr) {
^1da177e4c3f415 Linus Torvalds 2005-04-16  3669              if (count == 0)
^1da177e4c3f415 Linus Torvalds 2005-04-16  3670                  goto finished;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3671              *buf = '\0';
^1da177e4c3f415 Linus Torvalds 2005-04-16  3672              buf++;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3673              addr++;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3674              count--;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3675          }
129dbdf298d7383 Baoquan He 2023-01-13  3676          n = vaddr + size - addr;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3677          if (n > count)
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3678              n = count;
129dbdf298d7383 Baoquan He 2023-01-13  3679  
129dbdf298d7383 Baoquan He 2023-01-13  3680          if (flags & VMAP_RAM)

assume VMAP_RAM is not set

129dbdf298d7383 Baoquan He 2023-01-13  3681              vmap_ram_vread(buf, addr, n, flags);
129dbdf298d7383 Baoquan He 2023-01-13 @3682          else if (!(vm->flags & VM_IOREMAP))
                                                                ^^^^^^^^^
Unchecked dereference.  Should this be "flags" instead of "vm->flags"?

d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3683              aligned_vread(buf, addr, n);
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3684          else /* IOREMAP area is treated as memory hole */
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3685              memset(buf, 0, n);
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3686          buf += n;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3687          addr += n;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3688          count -= n;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3689      }
^1da177e4c3f415 Linus Torvalds 2005-04-16  3690  finished:
e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3691      spin_unlock(&vmap_area_lock);
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3692  
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3693      if (buf == buf_start)
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3694          return 0;
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3695      /* zero-fill memory holes */
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3696      if (buf != buf_start + buflen)
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3697          memset(buf, 0, buflen - (buf - buf_start));
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3698  
d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3699      return buflen;
^1da177e4c3f415 Linus Torvalds 2005-04-16  3700  }
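To make the warning concrete: the combination smatch worries about is vm == NULL with some VMAP_FLAGS_MASK bit set but VMAP_RAM clear. A minimal standalone sketch of that control flow (hypothetical values, not kernel code):

#define VMAP_RAM        0x1
#define VMAP_BLOCK      0x2
#define VMAP_FLAGS_MASK 0x3
#define VM_IOREMAP      0x1

struct vm_struct { unsigned long flags; };

/* Mirrors lines 3660/3680/3682 above; returns 1 on the path smatch flags. */
static int smatch_path(struct vm_struct *vm, unsigned long flags)
{
    if (!vm && !flags)              /* 3660: skips only when BOTH are unset */
        return 0;
    if (flags & VMAP_RAM)           /* 3680: assume VMAP_RAM is clear */
        return 0;
    return !(vm->flags & VM_IOREMAP);   /* 3682: NULL deref if vm == NULL */
}

With vm == NULL and flags == VMAP_BLOCK, smatch_path(NULL, VMAP_BLOCK) reaches the vm->flags dereference, which is exactly the report above. Whether that flag combination can actually occur is what the rest of the thread settles.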
Hi Dan,

On 01/14/23 at 10:57am, Dan Carpenter wrote:
> Hi Baoquan,
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Baoquan-He/mm-vmalloc-c-add-used_map-into-vmap_block-to-track-space-of-vmap_block/20230113-112149
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> patch link:    https://lore.kernel.org/r/20230113031921.64716-4-bhe%40redhat.com
> patch subject: [PATCH v3 3/7] mm/vmalloc.c: allow vread() to read out vm_map_ram areas
> config: i386-randconfig-m021
> compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
> 
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
> | Reported-by: Dan Carpenter <error27@gmail.com>
> 
> smatch warnings:
> mm/vmalloc.c:3682 vread() error: we previously assumed 'vm' could be null (see line 3664)

Thanks for checking this. I went through the code flow again, and
personally I think the issue and risk pointed out cannot exist. Please
see the comments below.

> 
> vim +/vm +3682 mm/vmalloc.c
> 
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3630  long vread(char *buf, char *addr, unsigned long count)
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3631  {
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3632      struct vmap_area *va;
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3633      struct vm_struct *vm;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3634      char *vaddr, *buf_start = buf;
> d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3635      unsigned long buflen = count;
> 129dbdf298d7383 Baoquan He 2023-01-13  3636      unsigned long n, size, flags;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3637  
> 4aff1dc4fb3a5a3 Andrey Konovalov 2022-03-24  3638      addr = kasan_reset_tag(addr);
> 4aff1dc4fb3a5a3 Andrey Konovalov 2022-03-24  3639  
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3640      /* Don't allow overflow */
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3641      if ((unsigned long) addr + count < count)
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3642          count = -(unsigned long) addr;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3643  
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3644      spin_lock(&vmap_area_lock);
> f181234a5a21fd0 Chen Wandun 2021-09-02  3645      va = find_vmap_area_exceed_addr((unsigned long)addr);
> f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3646      if (!va)
> f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3647          goto finished;
> f181234a5a21fd0 Chen Wandun 2021-09-02  3648  
> f181234a5a21fd0 Chen Wandun 2021-09-02  3649      /* no intersects with alive vmap_area */
> f181234a5a21fd0 Chen Wandun 2021-09-02  3650      if ((unsigned long)addr + count <= va->va_start)
> f181234a5a21fd0 Chen Wandun 2021-09-02  3651          goto finished;
> f181234a5a21fd0 Chen Wandun 2021-09-02  3652  
> f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3653      list_for_each_entry_from(va, &vmap_area_list, list) {
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3654          if (!count)
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3655              break;
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3656  
> 129dbdf298d7383 Baoquan He 2023-01-13  3657          vm = va->vm;
> 129dbdf298d7383 Baoquan He 2023-01-13  3658          flags = va->flags & VMAP_FLAGS_MASK;
> 129dbdf298d7383 Baoquan He 2023-01-13  3659  
> 129dbdf298d7383 Baoquan He 2023-01-13  3660          if (!vm && !flags)
>                                                          ^^^
> vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
> 
> e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3661              continue;

Right, after the 'continue;' line, only two cases can happen when
execution gets here: either vm != NULL, or (va->flags & VMAP_RAM) is true.

> e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3662  
> 129dbdf298d7383 Baoquan He 2023-01-13  3663          vaddr = (char *) va->va_start;
> 129dbdf298d7383 Baoquan He 2023-01-13 @3664          size = vm ? get_vm_area_size(vm) : va_size(va);
>                                                             ^^
> 
> 129dbdf298d7383 Baoquan He 2023-01-13  3665  
> 129dbdf298d7383 Baoquan He 2023-01-13  3666          if (addr >= vaddr + size)
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3667              continue;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3668          while (addr < vaddr) {
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3669              if (count == 0)
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3670                  goto finished;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3671              *buf = '\0';
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3672              buf++;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3673              addr++;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3674              count--;
> ^1da177e4c3f415 Linus Torvalds 2005-04-16  3675          }
> 129dbdf298d7383 Baoquan He 2023-01-13  3676          n = vaddr + size - addr;
> d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3677          if (n > count)
> d0107eb07320b5d KAMEZAWA Hiroyuki 2009-09-21  3678              n = count;
> 129dbdf298d7383 Baoquan He 2023-01-13  3679  
> 129dbdf298d7383 Baoquan He 2023-01-13  3680          if (flags & VMAP_RAM)
> 
> assume VMAP_RAM is not set

OK, then vm is not NULL.

> 
> 129dbdf298d7383 Baoquan He 2023-01-13  3681              vmap_ram_vread(buf, addr, n, flags);
> 129dbdf298d7383 Baoquan He 2023-01-13 @3682          else if (!(vm->flags & VM_IOREMAP))
>                                                                 ^^^^^^^^^
> Unchecked dereference.  Should this be "flags" instead of "vm->flags"?

Thus, here in the 'else if', vm is not NULL. And in this 'else if' we
are intending to check vm->flags. I don't see an issue or risk here.
Please help check whether I missed anything.

Thanks
Baoquan

> [...]
> 
> --
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests
On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> Currently, vread can read out vmalloc areas which is associated with
> a vm_struct. While this doesn't work for areas created by vm_map_ram()
> interface because it doesn't have an associated vm_struct. Then in vread(),
> these areas are all skipped.
> 
> Here, add a new function vmap_ram_vread() to read out vm_map_ram areas.
> The area created with vm_map_ram() interface directly can be handled
> like the other normal vmap areas with aligned_vread(). While areas
> which will be further subdivided and managed with vmap_block need
> carefully read out page-aligned small regions and zero fill holes.
> 
> Signed-off-by: Baoquan He <bhe@redhat.com>
> ---
>  mm/vmalloc.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 73 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ab4825050b5c..13875bc41e27 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3544,6 +3544,65 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
>  	return copied;
>  }
>  
> +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
> +{
> +	char *start;
> +	struct vmap_block *vb;
> +	unsigned long offset;
> +	unsigned int rs, re, n;
> +
> +	/*
> +	 * If it's area created by vm_map_ram() interface directly, but
> +	 * not further subdividing and delegating management to vmap_block,
> +	 * handle it here.
> +	 */
> +	if (!(flags & VMAP_BLOCK)) {
> +		aligned_vread(buf, addr, count);
> +		return;
> +	}
> +
> +	/*
> +	 * Area is split into regions and tracked with vmap_block, read out
> +	 * each region and zero fill the hole between regions.
> +	 */
> +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> +
> +	spin_lock(&vb->lock);
> +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
>
CPU-X invokes free_vmap_block() whereas we take the vb->lock and do
some manipulations with vb that might be already freed over RCU-core.

Should we protect it by the rcu_read_lock() also here?

--
Uladzislau Rezki
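A sketch of the interleaving behind this question (hypothetical timeline; it assumes nothing beyond the quoted hunk serializes the two sides):

/*
 * CPU-X                                CPU-Y: vread() -> vmap_ram_vread()
 * -----                                ----------------------------------
 * free_vmap_block(vb)
 *   xa_erase(&vmap_blocks, idx)
 *                                      vb = xa_load(&vmap_blocks, idx);
 *                                      ... vb is NULL, or a stale pointer
 *   kfree_rcu(vb, rcu_head)                obtained just before the erase
 *                                      spin_lock(&vb->lock);
 *                                      ... use-after-free once the RCU
 *                                          grace period has elapsed
 */

The follow-ups below conclude that vmap_area_lock, taken at the start of vread(), already serializes the two sides, leaving only the possible NULL return from xa_load() to handle.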
On Sun, Jan 15, 2023 at 10:08:55PM +0800, Baoquan He wrote:
> > f181234a5a21fd0 Chen Wandun 2021-09-02  3650      if ((unsigned long)addr + count <= va->va_start)
> > f181234a5a21fd0 Chen Wandun 2021-09-02  3651          goto finished;
> > f181234a5a21fd0 Chen Wandun 2021-09-02  3652  
> > f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3653      list_for_each_entry_from(va, &vmap_area_list, list) {
> > e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3654          if (!count)
> > e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3655              break;
> > e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3656  
> > 129dbdf298d7383 Baoquan He 2023-01-13  3657          vm = va->vm;
> > 129dbdf298d7383 Baoquan He 2023-01-13  3658          flags = va->flags & VMAP_FLAGS_MASK;
> > 129dbdf298d7383 Baoquan He 2023-01-13  3659  
> > 129dbdf298d7383 Baoquan He 2023-01-13  3660          if (!vm && !flags)
> >                                                          ^^^
> > vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
> > 
> > e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3661              continue;
> 
> Right, after the 'continue;' line, only two cases can happen when
> execution gets here: either vm != NULL, or (va->flags & VMAP_RAM) is true.
> 

You're saying VMAP_RAM, but strictly speaking the code is checking
VMAP_FLAGS_MASK and not VMAP_RAM.

+#define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
+#define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
+#define VMAP_FLAGS_MASK		0x3

If we assume that vm is NULL, VMAP_BLOCK is set and VMAP_RAM is clear
then it would lead to a NULL dereference. There might be reasons why
that combination is impossible outside the function but we can't tell
from the information we have here.

Which is fine, outside information is a common reason for false
positives with this check. But I was just concerned about the mix of
VMAP_FLAGS_MASK and VMAP_RAM.

regards,
dan carpenter

> [...]
On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> +	spin_lock(&vb->lock);
> +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> +		spin_unlock(&vb->lock);
> +		memset(buf, 0, count);
> +		return;
> +	}
> +	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> +		if (!count)
> +			break;
> +		start = vmap_block_vaddr(vb->va->va_start, rs);
> +		while (addr < start) {
> +			if (count == 0)
> +				break;
> +			*buf = '\0';
> +			buf++;
> +			addr++;
> +			count--;
> +		}
> +		/*it could start reading from the middle of used region*/
> +		offset = offset_in_page(addr);
> +		n = ((re - rs + 1) << PAGE_SHIFT) - offset;
> +		if (n > count)
> +			n = count;
> +		aligned_vread(buf, start+offset, n);

The whole vread() interface is rather suboptimal. The only user is proc,
which is trying to copy to userspace. But the vread() interface copies
to a kernel address, so kcore has to copy to a bounce buffer. That makes
this spinlock work, but the price is that we can't copy to a user address
in the future. Ideally, read_kcore() would be kcore_read_iter() and
we'd pass an iov_iter into vread(). vread() would then need to use a
mutex rather than a spinlock.

I don't think this needs to be done now, but if someone's looking for
a project ...
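A rough sketch of the direction being suggested here, with vread_iter() and vmap_area_mutex as hypothetical names (no such interface existed at the time of this thread):

#include <linux/uio.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(vmap_area_mutex);	/* hypothetical sleepable lock */

/*
 * Hypothetical iov_iter-based variant of vread(), per the suggestion
 * above: copy straight into the iterator (which may point at a user
 * buffer and can fault/sleep), so a mutex must replace vmap_area_lock.
 */
long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
	size_t copied = 0;

	mutex_lock(&vmap_area_mutex);
	/*
	 * Walk the vmap areas exactly as vread() does today, but replace
	 * each "aligned_vread(buf, kaddr, n)" step with:
	 *
	 *	copied += copy_to_iter(kaddr, n, iter);
	 *
	 * so kcore no longer needs a kernel bounce buffer.
	 */
	mutex_unlock(&vmap_area_mutex);

	return copied;
}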
On Mon, Jan 16, 2023 at 07:01:33PM +0000, Matthew Wilcox wrote:
> On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > [...]
> > +		aligned_vread(buf, start+offset, n);
> 
> The whole vread() interface is rather suboptimal. The only user is proc,
> which is trying to copy to userspace. But the vread() interface copies
> to a kernel address, so kcore has to copy to a bounce buffer. That makes
> this spinlock work, but the price is that we can't copy to a user address
> in the future. Ideally, read_kcore() would be kcore_read_iter() and
> we'd pass an iov_iter into vread(). vread() would then need to use a
> mutex rather than a spinlock.
> 
> I don't think this needs to be done now, but if someone's looking for
> a project ...

Interesting! I may take a look at this if I get the time :)
On 01/16/23 at 04:08pm, Dan Carpenter wrote:
> On Sun, Jan 15, 2023 at 10:08:55PM +0800, Baoquan He wrote:
> > > f181234a5a21fd0 Chen Wandun 2021-09-02  3650      if ((unsigned long)addr + count <= va->va_start)
> > > f181234a5a21fd0 Chen Wandun 2021-09-02  3651          goto finished;
> > > f181234a5a21fd0 Chen Wandun 2021-09-02  3652  
> > > f608788cd2d6cae Serapheim Dimitropoulos 2021-04-29  3653      list_for_each_entry_from(va, &vmap_area_list, list) {
> > > e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3654          if (!count)
> > > e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3655              break;
> > > e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3656  
> > > 129dbdf298d7383 Baoquan He 2023-01-13  3657          vm = va->vm;
> > > 129dbdf298d7383 Baoquan He 2023-01-13  3658          flags = va->flags & VMAP_FLAGS_MASK;
> > > 129dbdf298d7383 Baoquan He 2023-01-13  3659  
> > > 129dbdf298d7383 Baoquan He 2023-01-13  3660          if (!vm && !flags)
> > >                                                          ^^^
> > > vm can be NULL if a flag in VMAP_FLAGS_MASK is set.
> > > 
> > > e81ce85f960c2e2 Joonsoo Kim 2013-04-29  3661              continue;
> > 
> > Right, after the 'continue;' line, only two cases can happen when
> > execution gets here: either vm != NULL, or (va->flags & VMAP_RAM) is true.
> > 
> 
> You're saying VMAP_RAM, but strictly speaking the code is checking
> VMAP_FLAGS_MASK and not VMAP_RAM.
> 
> +#define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
> +#define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
> +#define VMAP_FLAGS_MASK	0x3
> 
> If we assume that vm is NULL, VMAP_BLOCK is set and VMAP_RAM is clear
> then it would lead to a NULL dereference. There might be reasons why
> that combination is impossible outside the function but we can't tell
> from the information we have here.

VMAP_BLOCK never gets set alone; it has to be set together with
VMAP_RAM when needed.

> 
> Which is fine, outside information is a common reason for false
> positives with this check. But I was just concerned about the mix of
> VMAP_FLAGS_MASK and VMAP_RAM.

Thanks, I see your point now; I will consider how to improve it.
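For readers following the thread: the "never set alone" claim matches how the earlier patches in this series assign the flags at allocation time. A paraphrased sketch of the two call sites (not the exact hunks from the series):

/* vm_map_ram(), large request: the area is used directly */
va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END,
		     node, GFP_KERNEL, VMAP_RAM);

/* new_vmap_block(): the area is subdivided and managed via vmap_block */
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
		     VMALLOC_START, VMALLOC_END,
		     node, gfp_mask, VMAP_RAM | VMAP_BLOCK);

/* So within vread(): flags != 0 implies VMAP_RAM is set, and the
 * "vm == NULL, VMAP_BLOCK-only" combination cannot be constructed. */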
On 01/16/23 at 12:50pm, Uladzislau Rezki wrote:
> On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > [...]
> > +	/*
> > +	 * Area is split into regions and tracked with vmap_block, read out
> > +	 * each region and zero fill the hole between regions.
> > +	 */
> > +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > +
> > +	spin_lock(&vb->lock);
> > +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> >
> CPU-X invokes free_vmap_block() whereas we take the vb->lock and do
> some manipulations with vb that might be already freed over RCU-core.
>
> Should we protect it by the rcu_read_lock() also here?

I just went over the vb and vbq code again; it seems we don't need the
rcu_read_lock() here. The RCU lock is needed when operating on the
vmap_block_queue->free list. I don't see a race between the vb access
here and the RCU list additions or removals on vmap_block_queue->free.
If I'm missing some race window between them, please point it out.

However, when I check free_vmap_block(), I do find a risk. As you said,
CPU-x invokes free_vmap_block() and executes xa_erase() to remove the vb
from the vmap_blocks tree. Then vread() comes into vmap_ram_vread() and
calls xa_load(), which would return NULL. I should check the vb returned
from xa_load() in vmap_ram_vread().


static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
{
	......
	if (!(flags & VMAP_BLOCK)) {
		aligned_vread(buf, addr, count);
		return;
	}

	/*
	 * Area is split into regions and tracked with vmap_block, read out
	 * each region and zero fill the hole between regions.
	 */
	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
	if (!vb)	<-- vb needs to be checked here to avoid accessing an erased vb from the vmap_blocks tree
		memset(buf, 0, count);
	......
}
On 01/19/23 at 05:52pm, Baoquan He wrote:
> On 01/16/23 at 12:50pm, Uladzislau Rezki wrote:
> > On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > > [...]
> > > +	/*
> > > +	 * Area is split into regions and tracked with vmap_block, read out
> > > +	 * each region and zero fill the hole between regions.
> > > +	 */
> > > +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > > +
> > > +	spin_lock(&vb->lock);
> > > +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> > >
> > CPU-X invokes free_vmap_block() whereas we take the vb->lock and do
> > some manipulations with vb that might be already freed over RCU-core.
> >
> > Should we protect it by the rcu_read_lock() also here?
> 
> I just went over the vb and vbq code again; it seems we don't need the
> rcu_read_lock() here. The RCU lock is needed when operating on the
> vmap_block_queue->free list. I don't see a race between the vb access
> here and the RCU list additions or removals on vmap_block_queue->free.
> If I'm missing some race window between them, please point it out.
> 
> However, when I check free_vmap_block(), I do find a risk. As you said,

I forgot to add details about why there's no race between free_vmap_block()
and vmap_ram_vread(): we have taken vmap_area_lock at the beginning of
vread(). So, except for the missing check on the value returned from
xa_load(), free_vmap_block() is either blocked waiting for vmap_area_lock
before calling unlink_va(), or has finished calling unlink_va() to remove
the va from the vmap_area_root tree. In both cases there is no race.

> CPU-x invokes free_vmap_block() and executes xa_erase() to remove the vb
> from the vmap_blocks tree. Then vread() comes into vmap_ram_vread() and
> calls xa_load(), which would return NULL. I should check the vb returned
> from xa_load() in vmap_ram_vread().
> 
> 
> static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
> {
> 	......
> 	if (!(flags & VMAP_BLOCK)) {
> 		aligned_vread(buf, addr, count);
> 		return;
> 	}
> 
> 	/*
> 	 * Area is split into regions and tracked with vmap_block, read out
> 	 * each region and zero fill the hole between regions.
> 	 */
> 	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> 	if (!vb)	<-- vb needs to be checked here to avoid accessing an erased vb from the vmap_blocks tree
> 		memset(buf, 0, count);
> 	......
> }
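Pulling that together, one possible shape of the guarded lookup (a sketch, not the code actually posted in a later version of the series):

	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
	if (!vb) {
		/* block already erased by free_vmap_block() on another
		 * CPU: treat the whole range as a hole and bail out */
		memset(buf, 0, count);
		return;
	}

	spin_lock(&vb->lock);

Because vmap_area_lock is held across vread(), the vb that xa_load() does return cannot be freed underneath us; the NULL case is the only one left to handle.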
> 
> On 01/19/23 at 05:52pm, Baoquan He wrote:
> > On 01/16/23 at 12:50pm, Uladzislau Rezki wrote:
> > > On Fri, Jan 13, 2023 at 11:19:17AM +0800, Baoquan He wrote:
> > > > [...]
> > > > +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> > > > +
> > > > +	spin_lock(&vb->lock);
> > > > +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> > > >
> > > CPU-X invokes free_vmap_block() whereas we take the vb->lock and do
> > > some manipulations with vb that might be already freed over RCU-core.
> > >
> > > Should we protect it by the rcu_read_lock() also here?
> > 
> > I just went over the vb and vbq code again; it seems we don't need the
> > rcu_read_lock() here. The RCU lock is needed when operating on the
> > vmap_block_queue->free list. I don't see a race between the vb access
> > here and the RCU list additions or removals on vmap_block_queue->free.
> > If I'm missing some race window between them, please point it out.
> > 
> > However, when I check free_vmap_block(), I do find a risk. As you said,
> 
> I forgot to add details about why there's no race between free_vmap_block()
> and vmap_ram_vread(): we have taken vmap_area_lock at the beginning of
> vread(). So, except for the missing check on the value returned from
> xa_load(), free_vmap_block() is either blocked waiting for vmap_area_lock
> before calling unlink_va(), or has finished calling unlink_va() to remove
> the va from the vmap_area_root tree. In both cases there is no race.
> 
Agree, xa_load()'s return value should be checked: it can be that there
is no vmap_block associated with an address if xa_erase() was done
earlier.

--
Uladzislau Rezki
Currently, vread can read out vmalloc areas which are associated with a
vm_struct. However, this doesn't work for areas created by the
vm_map_ram() interface, because they have no associated vm_struct; in
vread(), these areas are all skipped.

Here, add a new function vmap_ram_vread() to read out vm_map_ram areas.
An area created directly with the vm_map_ram() interface can be handled
like the other normal vmap areas, with aligned_vread(). Areas which are
further subdivided and managed with vmap_block need to be read out
carefully in page-aligned small regions, with the holes between regions
zero-filled.

Signed-off-by: Baoquan He <bhe@redhat.com>
---
 mm/vmalloc.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 7 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ab4825050b5c..13875bc41e27 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3544,6 +3544,65 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
 	return copied;
 }
 
+static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags)
+{
+	char *start;
+	struct vmap_block *vb;
+	unsigned long offset;
+	unsigned int rs, re, n;
+
+	/*
+	 * If it's area created by vm_map_ram() interface directly, but
+	 * not further subdividing and delegating management to vmap_block,
+	 * handle it here.
+	 */
+	if (!(flags & VMAP_BLOCK)) {
+		aligned_vread(buf, addr, count);
+		return;
+	}
+
+	/*
+	 * Area is split into regions and tracked with vmap_block, read out
+	 * each region and zero fill the hole between regions.
+	 */
+	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
+
+	spin_lock(&vb->lock);
+	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
+		spin_unlock(&vb->lock);
+		memset(buf, 0, count);
+		return;
+	}
+	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
+		if (!count)
+			break;
+		start = vmap_block_vaddr(vb->va->va_start, rs);
+		while (addr < start) {
+			if (count == 0)
+				break;
+			*buf = '\0';
+			buf++;
+			addr++;
+			count--;
+		}
+		/*it could start reading from the middle of used region*/
+		offset = offset_in_page(addr);
+		n = ((re - rs + 1) << PAGE_SHIFT) - offset;
+		if (n > count)
+			n = count;
+		aligned_vread(buf, start+offset, n);
+
+		buf += n;
+		addr += n;
+		count -= n;
+	}
+	spin_unlock(&vb->lock);
+
+	/* zero-fill the left dirty or free regions */
+	if (count)
+		memset(buf, 0, count);
+}
+
 /**
  * vread() - read vmalloc area in a safe way.
  * @buf: buffer for reading data
@@ -3574,7 +3633,7 @@ long vread(char *buf, char *addr, unsigned long count)
 	struct vm_struct *vm;
 	char *vaddr, *buf_start = buf;
 	unsigned long buflen = count;
-	unsigned long n;
+	unsigned long n, size, flags;
 
 	addr = kasan_reset_tag(addr);
 
@@ -3595,12 +3654,16 @@ long vread(char *buf, char *addr, unsigned long count)
 		if (!count)
 			break;
 
-		if (!va->vm)
+		vm = va->vm;
+		flags = va->flags & VMAP_FLAGS_MASK;
+
+		if (!vm && !flags)
 			continue;
 
-		vm = va->vm;
-		vaddr = (char *) vm->addr;
-		if (addr >= vaddr + get_vm_area_size(vm))
+		vaddr = (char *) va->va_start;
+		size = vm ? get_vm_area_size(vm) : va_size(va);
+
+		if (addr >= vaddr + size)
 			continue;
 		while (addr < vaddr) {
 			if (count == 0)
@@ -3610,10 +3673,13 @@ long vread(char *buf, char *addr, unsigned long count)
 			addr++;
 			count--;
 		}
-		n = vaddr + get_vm_area_size(vm) - addr;
+		n = vaddr + size - addr;
 		if (n > count)
 			n = count;
-		if (!(vm->flags & VM_IOREMAP))
+
+		if (flags & VMAP_RAM)
+			vmap_ram_vread(buf, addr, n, flags);
+		else if (!(vm->flags & VM_IOREMAP))
 			aligned_vread(buf, addr, n);
 		else /* IOREMAP area is treated as memory hole */
 			memset(buf, 0, n);
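For context on how this path is exercised: the main vread() caller is /proc/kcore, which reads through a kernel bounce buffer roughly as below. This is a simplified sketch of the fs/proc/kcore.c pattern, not verbatim code; the bounce buffer is the limitation Matthew Wilcox notes above.

	/* Simplified sketch of the caller pattern (not verbatim kcore code):
	 * vread() only writes to kernel addresses, so the data is staged
	 * in a bounce buffer before copy_to_user(). */
	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	while (buflen) {
		size_t tsz = min_t(size_t, buflen, PAGE_SIZE);

		vread(buf, (char *)start, tsz);	/* holes come back zeroed */
		if (copy_to_user(buffer, buf, tsz)) {
			kfree(buf);
			return -EFAULT;
		}
		buffer += tsz;
		start += tsz;
		buflen -= tsz;
	}
	kfree(buf);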