Message ID | 20221109033535.269229-4-bhe@redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | mm/vmalloc.c: allow vread() to read out vm_map_ram areas | expand |
Baoquan He <bhe@redhat.com> writes: > Currently, vread() can read out vmalloc areas which is associated with > a vm_struct. While this doesn't work for areas created by vm_map_ram() > interface because it doesn't allocate a vm_struct. Then in vread(), > these areas will be skipped. > > Here, add a new function vb_vread() to read out areas managed by > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags > and handle them respectively. > > Stephen Brennan <stephen.s.brennan@oracle.com> > Signed-off-by: Baoquan He <bhe@redhat.com> > Link: https://lore.kernel.org/all/87ilk6gos2.fsf@oracle.com/T/#u > --- > mm/vmalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 51 insertions(+), 6 deletions(-) > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c > index 41d82dc07e13..5a8d5659bfb0 100644 > --- a/mm/vmalloc.c > +++ b/mm/vmalloc.c > @@ -3518,6 +3518,46 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) > return copied; > } > > +static void vb_vread(char *buf, char *addr, int count) > +{ > + char *start; > + struct vmap_block *vb; > + unsigned long offset; > + unsigned int rs, re, n; > + > + offset = ((unsigned long)addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; > + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr)); > + > + spin_lock(&vb->lock); > + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { > + spin_unlock(&vb->lock); > + memset(buf, 0, count); > + return; > + } > + for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { > + if (!count) > + break; > + start = vmap_block_vaddr(vb->va->va_start, rs); > + if (addr < start) { > + if (count == 0) > + break; > + *buf = '\0'; > + buf++; > + addr++; > + count--; > + } > + n = (re - rs + 1) << PAGE_SHIFT; > + if (n > count) > + n = count; > + aligned_vread(buf, start, n); > + > + buf += n; > + addr += n; > + count -= n; > + } > + spin_unlock(&vb->lock); > +} > + > /** > * vread() - read vmalloc area in a safe way. > * @buf: buffer for reading data > @@ -3548,7 +3588,7 @@ long vread(char *buf, char *addr, unsigned long count) > struct vm_struct *vm; > char *vaddr, *buf_start = buf; > unsigned long buflen = count; > - unsigned long n; > + unsigned long n, size; > > addr = kasan_reset_tag(addr); > > @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count) > if (!count) > break; > > - if (!va->vm) > + if (!(va->flags & VMAP_RAM) && !va->vm) > continue; > > vm = va->vm; > - vaddr = (char *) vm->addr; > - if (addr >= vaddr + get_vm_area_size(vm)) > + vaddr = (char *) va->va_start; > + size = vm ? get_vm_area_size(vm) : va_size(va); Hi Baoquan, Thanks for working on this. I tested your patches out by using drgn to debug /proc/kcore. I have a kernel module[1] to do a vm_map_ram() call and print the virtual address to the kernel log so I can try to read that memory address in drgn. When I did this test, I got a panic on the above line of code. [ 167.101113] BUG: kernel NULL pointer dereference, address: 0000000000000013 [ 167.104538] #PF: supervisor read access in kernel mode [ 167.106446] #PF: error_code(0x0000) - not-present page [ 167.108474] PGD 0 P4D 0 [ 167.109311] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 167.111727] CPU: 3 PID: 7647 Comm: drgn Kdump: loaded Tainted: G OE 6.1.0-rc4.bugvreadtest.el8.dev02.x86_64 #1 [ 167.115076] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.5.1 06/16/2021 [ 167.117348] RIP: 0010:vread+0xaf/0x210 [ 167.118345] Code: 86 3e 01 00 00 48 85 db 0f 84 35 01 00 00 49 8d 47 28 48 3d 10 f8 a7 8f 0f 84 25 01 00 00 4d 89 f4 49 8b 57 38 48 85 d2 74 21 <48> 8b 42 10 f6 42 18 40 48 89 d6 49 8b 0f 48 8d b8 00 f0 ff ff 48 [ 167.123776] RSP: 0018:ffffaeb380a1fb90 EFLAGS: 00010206 [ 167.125669] RAX: ffff9853a1397b28 RBX: 0000000000000040 RCX: 0000000000000000 [ 167.128401] RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000000 [ 167.130948] RBP: ffffaeb382400000 R08: 0000000000000000 R09: 0000000000000000 [ 167.133372] R10: 0000000000000000 R11: 0000000000000000 R12: ffff985385877000 [ 167.135397] R13: 0000000000000040 R14: ffff985385877000 R15: ffff9853a1397b00 [ 167.137533] FS: 00007f71eae33b80(0000) GS:ffff9856afd80000(0000) knlGS:0000000000000000 [ 167.140210] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 167.142440] CR2: 0000000000000013 CR3: 000000012048a000 CR4: 00000000003506e0 [ 167.144640] Call Trace: [ 167.145494] <TASK> [ 167.146263] read_kcore+0x33a/0xa30 [ 167.147392] ? remove_entity_load_avg+0x2e/0x70 [ 167.148425] ? _raw_spin_unlock_irqrestore+0x11/0x60 [ 167.150657] ? __wake_up_common_lock+0x8b/0xd0 [ 167.152261] ? tty_set_termios+0x211/0x280 [ 167.153397] ? set_termios+0x16b/0x1d0 [ 167.154698] ? _raw_spin_unlock+0xe/0x40 [ 167.155737] ? wp_page_reuse+0x60/0x80 [ 167.157138] ? do_wp_page+0x169/0x3a0 [ 167.158752] ? pmd_pfn+0x9/0x50 [ 167.159645] ? __handle_mm_fault+0x3b0/0x690 [ 167.160837] ? inode_security+0x22/0x60 [ 167.161761] proc_reg_read+0x5a/0xb0 [ 167.162777] vfs_read+0xa7/0x320 [ 167.163512] ? handle_mm_fault+0xb6/0x2c0 [ 167.164400] __x64_sys_pread64+0x9c/0xd0 [ 167.165763] do_syscall_64+0x3f/0xa0 [ 167.167610] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 167.169951] RIP: 0033:0x7f71e9c123d7 I debugged the resulting core dump and found the reason: >>> stack_trace = prog.crashed_thread().stack_trace() >>> stack_trace #0 crash_setup_regs (./arch/x86/include/asm/kexec.h:95:3) #1 __crash_kexec (kernel/kexec_core.c:974:4) #2 panic (kernel/panic.c:330:3) #3 oops_end (arch/x86/kernel/dumpstack.c:379:3) #4 page_fault_oops (arch/x86/mm/fault.c:729:2) #5 handle_page_fault (arch/x86/mm/fault.c:1519:3) #6 exc_page_fault (arch/x86/mm/fault.c:1575:2) #7 asm_exc_page_fault+0x26/0x2b (./arch/x86/include/asm/idtentry.h:570) #8 get_vm_area_size (./include/linux/vmalloc.h:203:14) #9 vread (mm/vmalloc.c:3617:15) #10 read_kcore (fs/proc/kcore.c:510:4) #11 pde_read (fs/proc/inode.c:316:10) #12 proc_reg_read (fs/proc/inode.c:328:8) #13 vfs_read (fs/read_write.c:468:9) #14 ksys_pread64 (fs/read_write.c:665:10) #15 __do_sys_pread64 (fs/read_write.c:675:9) #16 __se_sys_pread64 (fs/read_write.c:672:1) #17 __x64_sys_pread64 (fs/read_write.c:672:1) #18 do_syscall_x64 (arch/x86/entry/common.c:50:14) #19 do_syscall_64 (arch/x86/entry/common.c:80:7) #20 entry_SYSCALL_64+0x9f/0x19b (arch/x86/entry/entry_64.S:120) #21 0x7f71e9c123d7 >>> stack_trace[9]["va"] *(struct vmap_area *)0xffff9853a1397b00 = { .va_start = (unsigned long)18446654684740452352, .va_end = (unsigned long)18446654684741500928, .rb_node = (struct rb_node){ .__rb_parent_color = (unsigned long)18446630083335569168, .rb_right = (struct rb_node *)0x0, .rb_left = (struct rb_node *)0x0, }, .list = (struct list_head){ .next = (struct list_head *)0xffff98538c403f28, .prev = (struct list_head *)0xffff98538c54e1e8, }, .subtree_max_size = (unsigned long)3, .vm = (struct vm_struct *)0x3, .flags = (unsigned long)3, } Since flags is in a union, it shadows "vm" and causes the condition to be true, and then get_vm_area_size() tries to follow the pointer defined by flags. I'm not sure if the fix is to have flags be a separate field inside vmap_area, or to have more careful handling in the vread path. Thanks, Stephen > + > + if (addr >= vaddr + size) > continue; > while (addr < vaddr) { > if (count == 0) > @@ -3584,10 +3626,13 @@ long vread(char *buf, char *addr, unsigned long count) > addr++; > count--; > } > - n = vaddr + get_vm_area_size(vm) - addr; > + n = vaddr + size - addr; > if (n > count) > n = count; > - if (!(vm->flags & VM_IOREMAP)) > + > + if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK)) > + vb_vread(buf, addr, n); > + else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP)) > aligned_vread(buf, addr, n); > else /* IOREMAP area is treated as memory hole */ > memset(buf, 0, n); > -- > 2.34.1
On 11/09/22 at 04:59pm, Stephen Brennan wrote: ...... > > @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count) > > if (!count) > > break; > > > > - if (!va->vm) > > + if (!(va->flags & VMAP_RAM) && !va->vm) > > continue; > > > > vm = va->vm; > > - vaddr = (char *) vm->addr; > > - if (addr >= vaddr + get_vm_area_size(vm)) > > + vaddr = (char *) va->va_start; > > + size = vm ? get_vm_area_size(vm) : va_size(va); > > Hi Baoquan, > > Thanks for working on this. I tested your patches out by using drgn to > debug /proc/kcore. I have a kernel module[1] to do a vm_map_ram() call > and print the virtual address to the kernel log so I can try to read > that memory address in drgn. When I did this test, I got a panic on the > above line of code. ...... > Since flags is in a union, it shadows "vm" and causes the condition to > be true, and then get_vm_area_size() tries to follow the pointer defined > by flags. I'm not sure if the fix is to have flags be a separate field > inside vmap_area, or to have more careful handling in the vread path. Sorry, my bad. Thanks for testing this and catching the error, Stephen. About the fix, both way are fine to me. I made a draft fix based on the current patchset. To me, adding flags in a separate field makes code easier, but cost extra memory. I will see what other people say about this, firstly if the solution is acceptable, then reusing the union field or adding anohter flags. Could you try below code to see if it works? diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5a8d5659bfb0..78cae59170d8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1890,6 +1890,7 @@ struct vmap_area *find_vmap_area(unsigned long addr) #define VMAP_RAM 0x1 #define VMAP_BLOCK 0x2 +#define VMAP_FLAGS_MASK 0x3 struct vmap_block_queue { spinlock_t lock; @@ -3588,7 +3589,7 @@ long vread(char *buf, char *addr, unsigned long count) struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; - unsigned long n, size; + unsigned long n, size, flags; addr = kasan_reset_tag(addr); @@ -3609,12 +3610,14 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!(va->flags & VMAP_RAM) && !va->vm) + if (!va->vm) continue; + flags = va->flags & VMAP_FLAGS_MASK; vm = va->vm; + vaddr = (char *) va->va_start; - size = vm ? get_vm_area_size(vm) : va_size(va); + size = flags ? va_size(va) : get_vm_area_size(vm); if (addr >= vaddr + size) continue; @@ -3630,9 +3633,9 @@ long vread(char *buf, char *addr, unsigned long count) if (n > count) n = count; - if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK)) + if ((flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK)) vb_vread(buf, addr, n); - else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP)) + else if ((flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n);
Baoquan He <bhe@redhat.com> writes: > On 11/09/22 at 04:59pm, Stephen Brennan wrote: > ...... >> > @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count) >> > if (!count) >> > break; >> > >> > - if (!va->vm) >> > + if (!(va->flags & VMAP_RAM) && !va->vm) >> > continue; >> > >> > vm = va->vm; >> > - vaddr = (char *) vm->addr; >> > - if (addr >= vaddr + get_vm_area_size(vm)) >> > + vaddr = (char *) va->va_start; >> > + size = vm ? get_vm_area_size(vm) : va_size(va); >> >> Hi Baoquan, >> >> Thanks for working on this. I tested your patches out by using drgn to >> debug /proc/kcore. I have a kernel module[1] to do a vm_map_ram() call >> and print the virtual address to the kernel log so I can try to read >> that memory address in drgn. When I did this test, I got a panic on the >> above line of code. > ...... >> Since flags is in a union, it shadows "vm" and causes the condition to >> be true, and then get_vm_area_size() tries to follow the pointer defined >> by flags. I'm not sure if the fix is to have flags be a separate field >> inside vmap_area, or to have more careful handling in the vread path. > > Sorry, my bad. Thanks for testing this and catching the error, Stephen. > > About the fix, both way are fine to me. I made a draft fix based on the > current patchset. To me, adding flags in a separate field makes code > easier, but cost extra memory. I will see what other people say about > this, firstly if the solution is acceptable, then reusing the union > field or adding anohter flags. > > Could you try below code to see if it works? I tried the patch below and it worked for me: reading from vm_map_ram() regions in drgn was fine, gave me the correct values, and I also tested reads which overlapped the beginning and end of the region. That said (and I don't know the vmalloc code well at all), I wonder whether you can be confident that the lower 2 bits of the va->vm pointer are always clear? It looks like it comes from kmalloc, and so it should be aligned, but can slab red zones mess up that alignment? I also tested out this patch on top of yours, to be a bit more cautious. I think we can be confident that the remaining bits are zero when used as flags, and non-zero when used as a pointer, so you can test them to avoid any overlap. But it's probably too cautious. diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78cae59170d8..911974f32b23 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3613,7 +3613,7 @@ long vread(char *buf, char *addr, unsigned long count) if (!va->vm) continue; - flags = va->flags & VMAP_FLAGS_MASK; + flags = (va->flags & ~VMAP_FLAGS_MASK) ? 0 : (va->flags & VMAP_FLAGS_MASK); vm = va->vm; vaddr = (char *) va->va_start;
On 11/10/22 at 10:48am, Stephen Brennan wrote: > Baoquan He <bhe@redhat.com> writes: > > > On 11/09/22 at 04:59pm, Stephen Brennan wrote: > > ...... > >> > @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count) > >> > if (!count) > >> > break; > >> > > >> > - if (!va->vm) > >> > + if (!(va->flags & VMAP_RAM) && !va->vm) > >> > continue; > >> > > >> > vm = va->vm; > >> > - vaddr = (char *) vm->addr; > >> > - if (addr >= vaddr + get_vm_area_size(vm)) > >> > + vaddr = (char *) va->va_start; > >> > + size = vm ? get_vm_area_size(vm) : va_size(va); > >> > >> Hi Baoquan, > >> > >> Thanks for working on this. I tested your patches out by using drgn to > >> debug /proc/kcore. I have a kernel module[1] to do a vm_map_ram() call > >> and print the virtual address to the kernel log so I can try to read > >> that memory address in drgn. When I did this test, I got a panic on the > >> above line of code. > > ...... > >> Since flags is in a union, it shadows "vm" and causes the condition to > >> be true, and then get_vm_area_size() tries to follow the pointer defined > >> by flags. I'm not sure if the fix is to have flags be a separate field > >> inside vmap_area, or to have more careful handling in the vread path. > > > > Sorry, my bad. Thanks for testing this and catching the error, Stephen. > > > > About the fix, both way are fine to me. I made a draft fix based on the > > current patchset. To me, adding flags in a separate field makes code > > easier, but cost extra memory. I will see what other people say about > > this, firstly if the solution is acceptable, then reusing the union > > field or adding anohter flags. > > > > Could you try below code to see if it works? > > I tried the patch below and it worked for me: reading from vm_map_ram() > regions in drgn was fine, gave me the correct values, and I also tested > reads which overlapped the beginning and end of the region. Thanks a lot for the testing. > > That said (and I don't know the vmalloc code well at all), I wonder > whether you can be confident that the lower 2 bits of the va->vm pointer > are always clear? It looks like it comes from kmalloc, and so it should > be aligned, but can slab red zones mess up that alignment? Hmm, it should be OK. I am also worried about the other places of va->vm checking. I will check code again to see if there's risk in the case you mentioned. I may change to add another ->flags field into vmap_area in v2 post. > > I also tested out this patch on top of yours, to be a bit more cautious. > I think we can be confident that the remaining bits are zero when used > as flags, and non-zero when used as a pointer, so you can test them to > avoid any overlap. But it's probably too cautious. > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c > index 78cae59170d8..911974f32b23 100644 > --- a/mm/vmalloc.c > +++ b/mm/vmalloc.c > @@ -3613,7 +3613,7 @@ long vread(char *buf, char *addr, unsigned long count) > if (!va->vm) > continue; > > - flags = va->flags & VMAP_FLAGS_MASK; > + flags = (va->flags & ~VMAP_FLAGS_MASK) ? 0 : (va->flags & VMAP_FLAGS_MASK); > vm = va->vm; > > vaddr = (char *) va->va_start; >
On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote: > Currently, vread() can read out vmalloc areas which is associated with > a vm_struct. While this doesn't work for areas created by vm_map_ram() > interface because it doesn't allocate a vm_struct. Then in vread(), > these areas will be skipped. > > Here, add a new function vb_vread() to read out areas managed by > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags > and handle them respectively. i don't understand how this deals with the original problem identified, that the vread() can race with an unmap.
On 11/18/22 at 08:01am, Matthew Wilcox wrote: > On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote: > > Currently, vread() can read out vmalloc areas which is associated with > > a vm_struct. While this doesn't work for areas created by vm_map_ram() > > interface because it doesn't allocate a vm_struct. Then in vread(), > > these areas will be skipped. > > > > Here, add a new function vb_vread() to read out areas managed by > > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags > > and handle them respectively. > > i don't understand how this deals with the original problem identified, > that the vread() can race with an unmap. Thanks for checking. I wrote a paragraph, then realized I misunderstood your concern. You are saying the comment from Uladzislau about my original draft patch, right? Paste the link of Uladzislau's reply here in case other people want to know the background: https://lore.kernel.org/all/Y1uKSmgURNEa3nQu@pc636/T/#u When Stephen raised the issue originally, I posted a draft patch as below trying to fix it: https://lore.kernel.org/all/Y1pHTj2wuhoWmeV3@MiWiFi-R3L-srv/T/#u In above draft patch, I tried to differentiate normal vmalloc area and vm_map_ram area with the fact that vmalloc area is associated with a vm_struct, while vm_map_ram area has ->vm as NULL. And I thought their only difference is normal vmalloc area has guard page, so its size need consider the guard page; while vm_map_ram area has no guard page, only consider its own actual size. Uladzislau's comment reminded me I was wrong. And the things we need handle are beyond that. Currently there are three kinds of vmalloc areas in kernel: 1) normal vmalloc areas, associated with a vm_struct, this is allocated in __get_vm_area_node(). When freeing, it set ->vm to NULL firstly, then unmap and free vmap_area, see remove_vm_area(). 2) areas allocated via vm_map_ram() and size is larger than VMAP_MAX_ALLOC. The entire area is not associated with vm_struct, and freed at one time in vm_unmap_ram() with unmapping and freeing vmap_area; 3) areas allocated via vm_map_ram(), then delegate to vb_alloc() when size <= VMAP_MAX_ALLOC. Its vmap_area is allocated at one time with VMAP_BLOCK_SIZE big, and split and used later through vb_alloc(), freed via vb_free(). When the entire area is dirty, it will be unmapped and freed. Based on above facts, we need add flags to differentiate the normal vmalloc area from the vm_map_ram area, namely area 1) and 2). And we also need flags to differentiate the area 2) and 3). Because area 3) are pieces of a entire vmap_area, vb_free() will unmap the piece of area and set the part dirty, but the entire vmap_area will kept there. So when we will read area 3), we need take vb->lock and only read out the still mapped part, but not dirty or free part of the vmap_area. Thanks Baoquan
On Wed, Nov 23, 2022 at 11:38:54AM +0800, Baoquan He wrote: > On 11/18/22 at 08:01am, Matthew Wilcox wrote: > > On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote: > > > Currently, vread() can read out vmalloc areas which is associated with > > > a vm_struct. While this doesn't work for areas created by vm_map_ram() > > > interface because it doesn't allocate a vm_struct. Then in vread(), > > > these areas will be skipped. > > > > > > Here, add a new function vb_vread() to read out areas managed by > > > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags > > > and handle them respectively. > > > > i don't understand how this deals with the original problem identified, > > that the vread() can race with an unmap. > > Thanks for checking. > > I wrote a paragraph, then realized I misunderstood your concern. You are > saying the comment from Uladzislau about my original draft patch, right? > Paste the link of Uladzislau's reply here in case other people want to > know the background: > https://lore.kernel.org/all/Y1uKSmgURNEa3nQu@pc636/T/#u > > When Stephen raised the issue originally, I posted a draft patch as > below trying to fix it: > https://lore.kernel.org/all/Y1pHTj2wuhoWmeV3@MiWiFi-R3L-srv/T/#u > > In above draft patch, I tried to differentiate normal vmalloc area and > vm_map_ram area with the fact that vmalloc area is associated with a > vm_struct, while vm_map_ram area has ->vm as NULL. And I thought their > only difference is normal vmalloc area has guard page, so its size need > consider the guard page; while vm_map_ram area has no guard page, only > consider its own actual size. Uladzislau's comment reminded me I was > wrong. And the things we need handle are beyond that. > > Currently there are three kinds of vmalloc areas in kernel: > > 1) normal vmalloc areas, associated with a vm_struct, this is allocated > in __get_vm_area_node(). When freeing, it set ->vm to NULL > firstly, then unmap and free vmap_area, see remove_vm_area(). > > 2) areas allocated via vm_map_ram() and size is larger than > VMAP_MAX_ALLOC. The entire area is not associated with vm_struct, and > freed at one time in vm_unmap_ram() with unmapping and freeing vmap_area; > > 3) areas allocated via vm_map_ram(), then delegate to vb_alloc() when > size <= VMAP_MAX_ALLOC. Its vmap_area is allocated at one time with > VMAP_BLOCK_SIZE big, and split and used later through vb_alloc(), freed > via vb_free(). When the entire area is dirty, it will be unmapped and > freed. > > Based on above facts, we need add flags to differentiate the normal > vmalloc area from the vm_map_ram area, namely area 1) and 2). And we > also need flags to differentiate the area 2) and 3). Because area 3) are > pieces of a entire vmap_area, vb_free() will unmap the piece of area and > set the part dirty, but the entire vmap_area will kept there. So when we > will read area 3), we need take vb->lock and only read out the still > mapped part, but not dirty or free part of the vmap_area. I don't think you understand the problem. Task A: Task B: Task C: p = vm_map_ram() vread(p); ... preempted ... vm_unmap_ram(p); q = vm_map_ram(); vread continues If C has reused the address space allocated by A, task B is now reading the memory mapped by task C instead of task A. If it hasn't, it's now trying to read from unmapped, and quite possibly freed memory. Which might have been allocated by task D. Unless there's some kind of reference count so that B knows that both the address range and the underlying memory can't be freed while it's in the middle of the vread(), this is just unsafe.
On 11/23/22 at 01:24pm, Matthew Wilcox wrote: > On Wed, Nov 23, 2022 at 11:38:54AM +0800, Baoquan He wrote: > > On 11/18/22 at 08:01am, Matthew Wilcox wrote: > > > On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote: > > > > Currently, vread() can read out vmalloc areas which is associated with > > > > a vm_struct. While this doesn't work for areas created by vm_map_ram() > > > > interface because it doesn't allocate a vm_struct. Then in vread(), > > > > these areas will be skipped. > > > > > > > > Here, add a new function vb_vread() to read out areas managed by > > > > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags > > > > and handle them respectively. > > > > > > i don't understand how this deals with the original problem identified, > > > that the vread() can race with an unmap. > > > > Thanks for checking. > > > > I wrote a paragraph, then realized I misunderstood your concern. You are > > saying the comment from Uladzislau about my original draft patch, right? > > Paste the link of Uladzislau's reply here in case other people want to > > know the background: > > https://lore.kernel.org/all/Y1uKSmgURNEa3nQu@pc636/T/#u > > > > When Stephen raised the issue originally, I posted a draft patch as > > below trying to fix it: > > https://lore.kernel.org/all/Y1pHTj2wuhoWmeV3@MiWiFi-R3L-srv/T/#u > > > > In above draft patch, I tried to differentiate normal vmalloc area and > > vm_map_ram area with the fact that vmalloc area is associated with a > > vm_struct, while vm_map_ram area has ->vm as NULL. And I thought their > > only difference is normal vmalloc area has guard page, so its size need > > consider the guard page; while vm_map_ram area has no guard page, only > > consider its own actual size. Uladzislau's comment reminded me I was > > wrong. And the things we need handle are beyond that. > > > > Currently there are three kinds of vmalloc areas in kernel: > > > > 1) normal vmalloc areas, associated with a vm_struct, this is allocated > > in __get_vm_area_node(). When freeing, it set ->vm to NULL > > firstly, then unmap and free vmap_area, see remove_vm_area(). > > > > 2) areas allocated via vm_map_ram() and size is larger than > > VMAP_MAX_ALLOC. The entire area is not associated with vm_struct, and > > freed at one time in vm_unmap_ram() with unmapping and freeing vmap_area; > > > > 3) areas allocated via vm_map_ram(), then delegate to vb_alloc() when > > size <= VMAP_MAX_ALLOC. Its vmap_area is allocated at one time with > > VMAP_BLOCK_SIZE big, and split and used later through vb_alloc(), freed > > via vb_free(). When the entire area is dirty, it will be unmapped and > > freed. > > > > Based on above facts, we need add flags to differentiate the normal > > vmalloc area from the vm_map_ram area, namely area 1) and 2). And we > > also need flags to differentiate the area 2) and 3). Because area 3) are > > pieces of a entire vmap_area, vb_free() will unmap the piece of area and > > set the part dirty, but the entire vmap_area will kept there. So when we > > will read area 3), we need take vb->lock and only read out the still > > mapped part, but not dirty or free part of the vmap_area. > > I don't think you understand the problem. > > Task A: Task B: Task C: > p = vm_map_ram() > vread(p); > ... preempted ... > vm_unmap_ram(p); > q = vm_map_ram(); > vread continues > > If C has reused the address space allocated by A, task B is now reading > the memory mapped by task C instead of task A. If it hasn't, it's now > trying to read from unmapped, and quite possibly freed memory. Which > might have been allocated by task D. Hmm, it may not be like that. Firstly, I would remind that vread() takes vmap_area_lock during the whole reading process. Before this patchset, the vread() and other area manipulation will have below status: 1) __get_vm_area_node() could only finish insert_vmap_area(), then free the vmap_area_lock, then warting; 2) __get_vm_area_node() finishs setup_vmalloc_vm() 2.1) doing mapping but not finished; 2.2) clear_vm_uninitialized_flag() is called after mapping is done; 3) remove_vm_area() is called to set -> = NULL, then free vmap_area_lock; Task A: Task B: Task C: p = __get_vm_area_node() remove_vm_area(p); vread(p); vread end q = __get_vm_area_node(); So, as you can see, the checking "if (!va->vm)" in vread() will filter out vmap_area: a) areas if only insert_vmap_area() is called, but ->vm is still NULL; b) areas if remove_vm_area() is called to clear ->vm to NULL; c) areas created through vm_map_ram() since its ->vm is always NULL; Means vread() will read out vmap_area: d) areas if setup_vmalloc_vm() is called; 1) mapping is done on areas, e.g clear_vm_uninitialized_flag() is called; 2) mapping is being handled, just after returning from setup_vmalloc_vm(); ******* after this patchset applied: Task A: Task B: Task C: p = vm_map_ram() vm_unmap_ram(p); vread(p); vb_vread() vread end q = vm_map_ram(); With this patchset applied, other than normal areas, for the vm_map_ram() areas: 1) In vm_map_ram(), set vmap_area->flags = VMAP_RAM when vmap_area_lock is taken; In vm_unmap_ram(), clear it wiht "va->flags &= ~VMAP_RAM" when vmap_area_lock is taken; 2) If vmap_block, set va->flags = VMAP_RAM|VMAP_BLOCK; And set vmap_block->used_map to track the used region, filter out the dirty and free region; 3) In vb_vread(), we take vb->lock to avoid reading out dirty regions. Please help point out what is wrong or I missed. > > Unless there's some kind of reference count so that B knows that both > the address range and the underlying memory can't be freed while it's > in the middle of the vread(), this is just unsafe. >
On Thu, Nov 24, 2022 at 05:52:34PM +0800, Baoquan He wrote: > On 11/23/22 at 01:24pm, Matthew Wilcox wrote: > > On Wed, Nov 23, 2022 at 11:38:54AM +0800, Baoquan He wrote: > > > On 11/18/22 at 08:01am, Matthew Wilcox wrote: > > > > On Wed, Nov 09, 2022 at 11:35:34AM +0800, Baoquan He wrote: > > > > > Currently, vread() can read out vmalloc areas which is associated with > > > > > a vm_struct. While this doesn't work for areas created by vm_map_ram() > > > > > interface because it doesn't allocate a vm_struct. Then in vread(), > > > > > these areas will be skipped. > > > > > > > > > > Here, add a new function vb_vread() to read out areas managed by > > > > > vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags > > > > > and handle them respectively. > > > > > > > > i don't understand how this deals with the original problem identified, > > > > that the vread() can race with an unmap. > > > > > > Thanks for checking. > > > > > > I wrote a paragraph, then realized I misunderstood your concern. You are > > > saying the comment from Uladzislau about my original draft patch, right? > > > Paste the link of Uladzislau's reply here in case other people want to > > > know the background: > > > https://lore.kernel.org/all/Y1uKSmgURNEa3nQu@pc636/T/#u > > > > > > When Stephen raised the issue originally, I posted a draft patch as > > > below trying to fix it: > > > https://lore.kernel.org/all/Y1pHTj2wuhoWmeV3@MiWiFi-R3L-srv/T/#u > > > > > > In above draft patch, I tried to differentiate normal vmalloc area and > > > vm_map_ram area with the fact that vmalloc area is associated with a > > > vm_struct, while vm_map_ram area has ->vm as NULL. And I thought their > > > only difference is normal vmalloc area has guard page, so its size need > > > consider the guard page; while vm_map_ram area has no guard page, only > > > consider its own actual size. Uladzislau's comment reminded me I was > > > wrong. And the things we need handle are beyond that. > > > > > > Currently there are three kinds of vmalloc areas in kernel: > > > > > > 1) normal vmalloc areas, associated with a vm_struct, this is allocated > > > in __get_vm_area_node(). When freeing, it set ->vm to NULL > > > firstly, then unmap and free vmap_area, see remove_vm_area(). > > > > > > 2) areas allocated via vm_map_ram() and size is larger than > > > VMAP_MAX_ALLOC. The entire area is not associated with vm_struct, and > > > freed at one time in vm_unmap_ram() with unmapping and freeing vmap_area; > > > > > > 3) areas allocated via vm_map_ram(), then delegate to vb_alloc() when > > > size <= VMAP_MAX_ALLOC. Its vmap_area is allocated at one time with > > > VMAP_BLOCK_SIZE big, and split and used later through vb_alloc(), freed > > > via vb_free(). When the entire area is dirty, it will be unmapped and > > > freed. > > > > > > Based on above facts, we need add flags to differentiate the normal > > > vmalloc area from the vm_map_ram area, namely area 1) and 2). And we > > > also need flags to differentiate the area 2) and 3). Because area 3) are > > > pieces of a entire vmap_area, vb_free() will unmap the piece of area and > > > set the part dirty, but the entire vmap_area will kept there. So when we > > > will read area 3), we need take vb->lock and only read out the still > > > mapped part, but not dirty or free part of the vmap_area. > > > > I don't think you understand the problem. > > > > Task A: Task B: Task C: > > p = vm_map_ram() > > vread(p); > > ... preempted ... > > vm_unmap_ram(p); > > q = vm_map_ram(); > > vread continues > > > > > > > If C has reused the address space allocated by A, task B is now reading > > the memory mapped by task C instead of task A. If it hasn't, it's now > > trying to read from unmapped, and quite possibly freed memory. Which > > might have been allocated by task D. > > Hmm, it may not be like that. > > Firstly, I would remind that vread() takes vmap_area_lock during the > whole reading process. Before this patchset, the vread() and other area > manipulation will have below status: > 1) __get_vm_area_node() could only finish insert_vmap_area(), then free > the vmap_area_lock, then warting; > 2) __get_vm_area_node() finishs setup_vmalloc_vm() > 2.1) doing mapping but not finished; > 2.2) clear_vm_uninitialized_flag() is called after mapping is done; > 3) remove_vm_area() is called to set -> = NULL, then free vmap_area_lock; > > Task A: Task B: Task C: > p = __get_vm_area_node() > remove_vm_area(p); > vread(p); > > vread end > q = __get_vm_area_node(); > > So, as you can see, the checking "if (!va->vm)" in vread() will filter > out vmap_area: > a) areas if only insert_vmap_area() is called, but ->vm is still NULL; > b) areas if remove_vm_area() is called to clear ->vm to NULL; > c) areas created through vm_map_ram() since its ->vm is always NULL; > > Means vread() will read out vmap_area: > d) areas if setup_vmalloc_vm() is called; > 1) mapping is done on areas, e.g clear_vm_uninitialized_flag() is > called; > 2) mapping is being handled, just after returning from setup_vmalloc_vm(); > > > ******* after this patchset applied: > > Task A: Task B: Task C: > p = vm_map_ram() > vm_unmap_ram(p); > vread(p); > vb_vread() > vread end > > q = vm_map_ram(); > > With this patchset applied, other than normal areas, for the > vm_map_ram() areas: > 1) In vm_map_ram(), set vmap_area->flags = VMAP_RAM when vmap_area_lock > is taken; In vm_unmap_ram(), clear it wiht "va->flags &= ~VMAP_RAM" > when vmap_area_lock is taken; > 2) If vmap_block, set va->flags = VMAP_RAM|VMAP_BLOCK; And set > vmap_block->used_map to track the used region, filter out the dirty > and free region; > 3) In vb_vread(), we take vb->lock to avoid reading out dirty regions. > > Please help point out what is wrong or I missed. > One thing is we still can read-out un-mapped pages, i.e. a text instead: <snip> static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned int order; struct vmap_block *vb; BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap(addr, addr + size); order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); ... <snip> or am i missing something? Is it a problem? It might be. Another thing it would be good if you upload a new patchset so it is easier to review it. Thanks! -- Uladzislau Rezki
On 11/30/22 at 02:06pm, Uladzislau Rezki wrote: > On Thu, Nov 24, 2022 at 05:52:34PM +0800, Baoquan He wrote: ...... > > > I don't think you understand the problem. > > > > > > Task A: Task B: Task C: > > > p = vm_map_ram() > > > vread(p); > > > ... preempted ... > > > vm_unmap_ram(p); > > > q = vm_map_ram(); > > > vread continues > > > > > > > > > > > > If C has reused the address space allocated by A, task B is now reading > > > the memory mapped by task C instead of task A. If it hasn't, it's now > > > trying to read from unmapped, and quite possibly freed memory. Which > > > might have been allocated by task D. > > > > Hmm, it may not be like that. > > > > Firstly, I would remind that vread() takes vmap_area_lock during the > > whole reading process. Before this patchset, the vread() and other area > > manipulation will have below status: > > 1) __get_vm_area_node() could only finish insert_vmap_area(), then free > > the vmap_area_lock, then warting; > > 2) __get_vm_area_node() finishs setup_vmalloc_vm() > > 2.1) doing mapping but not finished; > > 2.2) clear_vm_uninitialized_flag() is called after mapping is done; > > 3) remove_vm_area() is called to set -> = NULL, then free vmap_area_lock; > > > > Task A: Task B: Task C: > > p = __get_vm_area_node() > > remove_vm_area(p); > > vread(p); > > > > vread end > > q = __get_vm_area_node(); > > > > So, as you can see, the checking "if (!va->vm)" in vread() will filter > > out vmap_area: > > a) areas if only insert_vmap_area() is called, but ->vm is still NULL; > > b) areas if remove_vm_area() is called to clear ->vm to NULL; > > c) areas created through vm_map_ram() since its ->vm is always NULL; > > > > Means vread() will read out vmap_area: > > d) areas if setup_vmalloc_vm() is called; > > 1) mapping is done on areas, e.g clear_vm_uninitialized_flag() is > > called; > > 2) mapping is being handled, just after returning from setup_vmalloc_vm(); > > > > > > ******* after this patchset applied: > > > > Task A: Task B: Task C: > > p = vm_map_ram() > > vm_unmap_ram(p); > > vread(p); > > vb_vread() > > vread end > > > > q = vm_map_ram(); > > > > With this patchset applied, other than normal areas, for the > > vm_map_ram() areas: > > 1) In vm_map_ram(), set vmap_area->flags = VMAP_RAM when vmap_area_lock > > is taken; In vm_unmap_ram(), clear it wiht "va->flags &= ~VMAP_RAM" > > when vmap_area_lock is taken; > > 2) If vmap_block, set va->flags = VMAP_RAM|VMAP_BLOCK; And set > > vmap_block->used_map to track the used region, filter out the dirty > > and free region; > > 3) In vb_vread(), we take vb->lock to avoid reading out dirty regions. > > > > Please help point out what is wrong or I missed. > > > One thing is we still can read-out un-mapped pages, i.e. a text instead: > > <snip> > static void vb_free(unsigned long addr, unsigned long size) > { > unsigned long offset; > unsigned int order; > struct vmap_block *vb; > > BUG_ON(offset_in_page(size)); > BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); > > flush_cache_vunmap(addr, addr + size); > > order = get_order(size); > offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; > vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); > > vunmap_range_noflush(addr, addr + size); > > if (debug_pagealloc_enabled_static()) > flush_tlb_kernel_range(addr, addr + size); > > spin_lock(&vb->lock); > ... > <snip> > > or am i missing something? Is it a problem? It might be. Another thing > it would be good if you upload a new patchset so it is easier to review > it. Thanks for checking. Please check patch 1, vmap_block->used_map is introduced to track the vb regions allocation and free via vb_alloc/vb_free(). The vb->used_map only set for pages being used, the dirty and free regions are all cleared. In the added vb_vread() of patch 3, vb->lock is required and taken during the whole vb vmap reading, and only page of regions set in vb->used_map will be read out. So if vb_free() is called, and vb->used_map is cleared away, it won't be read out in vb_vread(). If vb_free() is requiring vb->lock and waiting, the region hasn't been unmapped and can be read out. @@ -2114,6 +2118,9 @@ static void vb_free(unsigned long addr, unsigned long size) order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); + spin_lock(&vb->lock); + bitmap_clear(vb->used_map, offset, (1UL << order)); + spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); I will work out a formal patchset for reviewing, will post and CC all reviewers. Thanks Baoquan
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 41d82dc07e13..5a8d5659bfb0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3518,6 +3518,46 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) return copied; } +static void vb_vread(char *buf, char *addr, int count) +{ + char *start; + struct vmap_block *vb; + unsigned long offset; + unsigned int rs, re, n; + + offset = ((unsigned long)addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr)); + + spin_lock(&vb->lock); + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { + spin_unlock(&vb->lock); + memset(buf, 0, count); + return; + } + for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { + if (!count) + break; + start = vmap_block_vaddr(vb->va->va_start, rs); + if (addr < start) { + if (count == 0) + break; + *buf = '\0'; + buf++; + addr++; + count--; + } + n = (re - rs + 1) << PAGE_SHIFT; + if (n > count) + n = count; + aligned_vread(buf, start, n); + + buf += n; + addr += n; + count -= n; + } + spin_unlock(&vb->lock); +} + /** * vread() - read vmalloc area in a safe way. * @buf: buffer for reading data @@ -3548,7 +3588,7 @@ long vread(char *buf, char *addr, unsigned long count) struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; - unsigned long n; + unsigned long n, size; addr = kasan_reset_tag(addr); @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!va->vm) + if (!(va->flags & VMAP_RAM) && !va->vm) continue; vm = va->vm; - vaddr = (char *) vm->addr; - if (addr >= vaddr + get_vm_area_size(vm)) + vaddr = (char *) va->va_start; + size = vm ? get_vm_area_size(vm) : va_size(va); + + if (addr >= vaddr + size) continue; while (addr < vaddr) { if (count == 0) @@ -3584,10 +3626,13 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + get_vm_area_size(vm) - addr; + n = vaddr + size - addr; if (n > count) n = count; - if (!(vm->flags & VM_IOREMAP)) + + if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK)) + vb_vread(buf, addr, n); + else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n);
Currently, vread() can read out vmalloc areas which is associated with a vm_struct. While this doesn't work for areas created by vm_map_ram() interface because it doesn't allocate a vm_struct. Then in vread(), these areas will be skipped. Here, add a new function vb_vread() to read out areas managed by vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags and handle them respectively. Stephen Brennan <stephen.s.brennan@oracle.com> Signed-off-by: Baoquan He <bhe@redhat.com> Link: https://lore.kernel.org/all/87ilk6gos2.fsf@oracle.com/T/#u --- mm/vmalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 6 deletions(-)