Message ID | 20220818224218.2399791-2-song@kernel.org (mailing list archive) |
---|---|
State | New |
Series | vmalloc_exec for modules and BPF programs |
On Thu, Aug 18, 2022 at 03:42:14PM -0700, Song Liu wrote:
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
> }
> EXPORT_SYMBOL(vm_map_pages_zero);
>
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> +	return NULL;
> +}

Well that's not so nice for no-mmu systems. Shouldn't we have a
fallback?

> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index effd1ff6a4b4..472287e71bf1 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> 	va->va_end = addr + size;
> 	va->vm = NULL;
>
> -	spin_lock(&vmap_area_lock);
> -	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> -	spin_unlock(&vmap_area_lock);
> +	if (vm_flags & VM_KERNEL_EXEC) {
> +		spin_lock(&free_text_area_lock);
> +		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
> +		/* update subtree_max_size now as we need this soon */
> +		augment_tree_propagate_from(va);

Sorry, it is not clear to me why it's needed only for exec, can you
elaborate a bit more?

> +		spin_unlock(&free_text_area_lock);
> +	} else {
> +		spin_lock(&vmap_area_lock);
> +		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +		spin_unlock(&vmap_area_lock);
> +	}
>
> 	BUG_ON(!IS_ALIGNED(va->va_start, align));
> 	BUG_ON(va->va_start < vstart);

<-- snip -->

> @@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
> }
> EXPORT_SYMBOL(vmalloc);
>
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> +	struct vmap_area *va, *tmp;
> +	unsigned long addr;
> +	enum fit_type type;
> +	int ret;
> +
> +	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
> +	if (unlikely(!va))
> +		return ERR_PTR(-ENOMEM);
> +
> +again:
> +	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
> +	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
> +			size, align, 1, false);
> +
> +	if (!tmp) {
> +		unsigned long alloc_size;
> +		void *ptr;
> +
> +		spin_unlock(&free_text_area_lock);
> +
> +		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
> +		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
> +				MODULES_END, GFP_KERNEL, PAGE_KERNEL,
> +				VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
> +				NUMA_NO_NODE, __builtin_return_address(0));

We can review the guard stuff on the other thread with Peter.

> +		if (unlikely(!ptr)) {
> +			ret = -ENOMEM;
> +			goto err_out;
> +		}
> +		memset(ptr, 0, alloc_size);
> +		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
> +		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);

I *really* like that this is now not something users have to muck with,
thanks!

> +
> +		goto again;
> +	}
> +
> +	addr = roundup(tmp->va_start, align);
> +	type = classify_va_fit_type(tmp, addr, size);
> +	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
> +		addr = -ENOMEM;
> +		goto err_out;
> +	}
> +
> +	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
> +			tmp, addr, size, type);
> +	if (ret) {
> +		addr = ret;
> +		goto err_out;
> +	}
> +	spin_unlock(&free_text_area_lock);
> +
> +	va->va_start = addr;
> +	va->va_end = addr + size;
> +	va->vm = tmp->vm;
> +
> +	spin_lock(&vmap_area_lock);
> +	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +	spin_unlock(&vmap_area_lock);
> +
> +	return (void *)addr;
> +
> +err_out:
> +	spin_unlock(&free_text_area_lock);
> +	return ERR_PTR(ret);
> +}
> +
> +void vfree_exec(const void *addr)
> +{
> +	struct vmap_area *va;
> +
> +	might_sleep();
> +
> +	spin_lock(&vmap_area_lock);
> +	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
> +	if (WARN_ON_ONCE(!va)) {
> +		spin_unlock(&vmap_area_lock);
> +		return;
> +	}
> +
> +	unlink_va(va, &vmap_area_root);

Curious why we don't memset to 0 before merge_or_add_vmap_area_augment()?
I realize other code doesn't seem to do it, though.

> +	spin_unlock(&vmap_area_lock);
> +
> +	spin_lock(&free_text_area_lock);
> +	merge_or_add_vmap_area_augment(va,
> +			&free_text_area_root, &free_text_area_list);

I have a concern that we can be using precious physically contiguous
memory from huge pages to then end up in a situation where we create our
own pool and allow things to be non-contiguous afterwards.

I'm starting to suspect that if the allocation is > PAGE_SIZE we just
give it back generally. Otherwise wouldn't the fragmentation cause us
to eventually just eat up most huge pages available? Probably not for
eBPF, but if we use this on a system with tons of module insertions /
deletions this seems like it could happen?

  Luis

> +	spin_unlock(&free_text_area_lock);
> +	/* TODO: when the whole vm_struct is not in use, free it */
> +}
> +
> /**
>  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
>  * @size: allocation size
> @@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> 			/* It is a BUG(), but trigger recovery instead. */
> 			goto recovery;
>
> -		ret = adjust_va_to_fit_type(va, start, size, type);
> +		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
> +					va, start, size, type);
> 		if (unlikely(ret))
> 			goto recovery;
>
> --
> 2.30.2
>
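For reference, a fallback along the lines Luis asks about could simply delegate to plain vmalloc() on !MMU kernels, where there are no page permissions to manage and mappings are effectively executable anyway. A minimal sketch, illustrative only and not part of the posted series (alignment beyond PAGE_SIZE would be ignored here):

void *vmalloc_exec(unsigned long size, unsigned long align)
{
	/* !MMU: no protections to flip, ordinary vmalloc memory will do */
	return vmalloc(size);
}

void vfree_exec(const void *addr)
{
	vfree(addr);
}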
> On Oct 6, 2022, at 4:15 PM, Luis Chamberlain <mcgrof@kernel.org> wrote:
>
> On Thu, Aug 18, 2022 at 03:42:14PM -0700, Song Liu wrote:
>> --- a/mm/nommu.c
>> +++ b/mm/nommu.c
>> @@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
>> }
>> EXPORT_SYMBOL(vm_map_pages_zero);
>>
>> +void *vmalloc_exec(unsigned long size, unsigned long align)
>> +{
>> +	return NULL;
>> +}
>
> Well that's not so nice for no-mmu systems. Shouldn't we have a
> fallback?

This is still an early version, so I am not quite sure whether we need
the fallback for no-MMU systems.

>
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index effd1ff6a4b4..472287e71bf1 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>> 	va->va_end = addr + size;
>> 	va->vm = NULL;
>>
>> -	spin_lock(&vmap_area_lock);
>> -	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> -	spin_unlock(&vmap_area_lock);
>> +	if (vm_flags & VM_KERNEL_EXEC) {
>> +		spin_lock(&free_text_area_lock);
>> +		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
>> +		/* update subtree_max_size now as we need this soon */
>> +		augment_tree_propagate_from(va);
>
> Sorry, it is not clear to me why it's needed only for exec, can you
> elaborate a bit more?

This version was wrong. We should use insert_vmap_area_augment() here.
Actually, I changed this in the latest version.

>
>> +		spin_unlock(&free_text_area_lock);
>> +	} else {
>> +		spin_lock(&vmap_area_lock);
>> +		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> +		spin_unlock(&vmap_area_lock);
>> +	}
>>
>> 	BUG_ON(!IS_ALIGNED(va->va_start, align));
>> 	BUG_ON(va->va_start < vstart);
>
> <-- snip -->
>
>> @@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
>> }
>> EXPORT_SYMBOL(vmalloc);
>>
>> +void *vmalloc_exec(unsigned long size, unsigned long align)
>> +{
>> +	struct vmap_area *va, *tmp;
>> +	unsigned long addr;
>> +	enum fit_type type;
>> +	int ret;
>> +
>> +	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
>> +	if (unlikely(!va))
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +again:
>> +	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
>> +	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
>> +			size, align, 1, false);
>> +
>> +	if (!tmp) {
>> +		unsigned long alloc_size;
>> +		void *ptr;
>> +
>> +		spin_unlock(&free_text_area_lock);
>> +
>> +		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
>> +		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
>> +				MODULES_END, GFP_KERNEL, PAGE_KERNEL,
>> +				VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
>> +				NUMA_NO_NODE, __builtin_return_address(0));
>
> We can review the guard stuff on the other thread with Peter.
>
>> +		if (unlikely(!ptr)) {
>> +			ret = -ENOMEM;
>> +			goto err_out;
>> +		}
>> +		memset(ptr, 0, alloc_size);
>> +		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
>> +		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
>
> I *really* like that this is now not something users have to muck with,
> thanks!

Well, this pushes some other complexity to the user side, for example,
all those hacks with text_poke in 3/5.

>
>> +
>> +		goto again;
>> +	}
>> +
>> +	addr = roundup(tmp->va_start, align);
>> +	type = classify_va_fit_type(tmp, addr, size);
>> +	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
>> +		addr = -ENOMEM;
>> +		goto err_out;
>> +	}
>> +
>> +	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
>> +			tmp, addr, size, type);
>> +	if (ret) {
>> +		addr = ret;
>> +		goto err_out;
>> +	}
>> +	spin_unlock(&free_text_area_lock);
>> +
>> +	va->va_start = addr;
>> +	va->va_end = addr + size;
>> +	va->vm = tmp->vm;
>> +
>> +	spin_lock(&vmap_area_lock);
>> +	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> +	spin_unlock(&vmap_area_lock);
>> +
>> +	return (void *)addr;
>> +
>> +err_out:
>> +	spin_unlock(&free_text_area_lock);
>> +	return ERR_PTR(ret);
>> +}
>> +
>> +void vfree_exec(const void *addr)
>> +{
>> +	struct vmap_area *va;
>> +
>> +	might_sleep();
>> +
>> +	spin_lock(&vmap_area_lock);
>> +	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
>> +	if (WARN_ON_ONCE(!va)) {
>> +		spin_unlock(&vmap_area_lock);
>> +		return;
>> +	}
>> +
>> +	unlink_va(va, &vmap_area_root);
>
> Curious why we don't memset to 0 before merge_or_add_vmap_area_augment()?
> I realize other code doesn't seem to do it, though.

We should do the memset here. We will need the text_poke version of it.

>
>> +	spin_unlock(&vmap_area_lock);
>> +
>> +	spin_lock(&free_text_area_lock);
>> +	merge_or_add_vmap_area_augment(va,
>> +			&free_text_area_root, &free_text_area_list);
>
> I have a concern that we can be using precious physically contiguous
> memory from huge pages to then end up in a situation where we create our
> own pool and allow things to be non-contiguous afterwards.
>
> I'm starting to suspect that if the allocation is > PAGE_SIZE we just
> give it back generally. Otherwise wouldn't the fragmentation cause us
> to eventually just eat up most huge pages available? Probably not for
> eBPF, but if we use this on a system with tons of module insertions /
> deletions this seems like it could happen?

Currently, bpf_prog_pack doesn't let allocations > PMD_SIZE share with
smaller allocations. I guess it is similar to the idea here? I am not
sure what the proper threshold is for modules. We can discuss this
later.

Thanks,
Song

>
> Luis
>
>> +	spin_unlock(&free_text_area_lock);
>> +	/* TODO: when the whole vm_struct is not in use, free it */
>> +}
>> +
>> /**
>>  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
>>  * @size: allocation size
>> @@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>> 			/* It is a BUG(), but trigger recovery instead. */
>> 			goto recovery;
>>
>> -		ret = adjust_va_to_fit_type(va, start, size, type);
>> +		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
>> +					va, start, size, type);
>> 		if (unlikely(ret))
>> 			goto recovery;
>>
>> --
>> 2.30.2
>>
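The bpf_prog_pack behaviour Song refers to boils down to a size threshold: allocations larger than one huge page get a dedicated mapping instead of being carved out of a shared pack. A rough sketch of that policy follows; the helper names text_alloc() and alloc_from_shared_pack() are hypothetical, this is not the actual kernel/bpf/core.c code:

void *text_alloc(unsigned long size)
{
	/* Large requests do not share: give them their own allocation. */
	if (size > PMD_SIZE)
		return module_alloc(round_up(size, PAGE_SIZE));

	/* Small requests sub-allocate from a shared, already RO+X pack. */
	return alloc_from_shared_pack(size);	/* hypothetical helper */
}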
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..691c02ffe3db 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -35,6 +35,8 @@ struct notifier_block;		/* in notifier.h */
 #define VM_DEFER_KMEMLEAK	0
 #endif
 
+#define VM_KERNEL_EXEC		0x00001000	/* kernel text mapped as RO+X */
+
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -154,6 +156,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_exec(unsigned long size, unsigned long align) __alloc_size(1);
+void vfree_exec(const void *addr);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/nommu.c b/mm/nommu.c
index 9d7afc2d959e..11e0fc996006 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
 }
 EXPORT_SYMBOL(vm_map_pages_zero);
 
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	return NULL;
+}
+
+void vfree_exec(const void *addr) { }
+
 /*
  * sys_brk() for the most part doesn't need the global kernel
  * lock, except when an application is doing something nasty
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index effd1ff6a4b4..472287e71bf1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -753,6 +753,10 @@ static LIST_HEAD(free_vmap_area_list);
  */
 static struct rb_root free_vmap_area_root = RB_ROOT;
 
+static DEFINE_SPINLOCK(free_text_area_lock);
+static LIST_HEAD(free_text_area_list);
+static struct rb_root free_text_area_root = RB_ROOT;
+
 /*
  * Preload a CPU with one object for "no edge" split case. The
  * aim is to get rid of allocations from the atomic context, thus
@@ -814,9 +818,11 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
 	return va;
 }
 
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_node *root)
 {
-	struct rb_node *n = vmap_area_root.rb_node;
+	struct rb_node *n;
+
+	n = root ? root : vmap_area_root.rb_node;
 
 	addr = (unsigned long)kasan_reset_tag((void *)addr);
 
@@ -926,7 +932,7 @@ link_va(struct vmap_area *va, struct rb_root *root,
 	/* Insert to the rb-tree */
 	rb_link_node(&va->rb_node, parent, link);
 
-	if (root == &free_vmap_area_root) {
+	if (root == &free_vmap_area_root || root == &free_text_area_root) {
 		/*
 		 * Some explanation here. Just perform simple insertion
 		 * to the tree. We do not set va->subtree_max_size to
@@ -955,7 +961,7 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
 	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
 		return;
 
-	if (root == &free_vmap_area_root)
+	if (root == &free_vmap_area_root || root == &free_text_area_root)
 		rb_erase_augmented(&va->rb_node,
 			root, &free_vmap_area_rb_augment_cb);
 	else
@@ -1198,15 +1204,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
  * overhead.
  */
 static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size, unsigned long align,
-	unsigned long vstart, bool adjust_search_size)
+find_vmap_lowest_match(struct rb_node *root, unsigned long size,
+	unsigned long align, unsigned long vstart, bool adjust_search_size)
 {
 	struct vmap_area *va;
 	struct rb_node *node;
 	unsigned long length;
 
 	/* Start from the root. */
-	node = free_vmap_area_root.rb_node;
+	node = root;
 
 	/* Adjust the search size for alignment overhead. */
 	length = adjust_search_size ? size + align - 1 : size;
@@ -1290,8 +1296,9 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
 	get_random_bytes(&rnd, sizeof(rnd));
 	vstart = VMALLOC_START + rnd;
 
-	va_1 = find_vmap_lowest_match(size, align, vstart, false);
-	va_2 = find_vmap_lowest_linear_match(size, align, vstart);
+	va_1 = find_vmap_lowest_match(free_vmap_area_root.rb_node, size,
+			align, vstart, false);
+	va_2 = find_vmap_lowest_linear_match(root, size, align, vstart);
 
 	if (va_1 != va_2)
 		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1334,7 +1341,8 @@ classify_va_fit_type(struct vmap_area *va,
 }
 
 static __always_inline int
-adjust_va_to_fit_type(struct vmap_area *va,
+adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
+	struct vmap_area *va,
 	unsigned long nva_start_addr, unsigned long size,
 	enum fit_type type)
 {
@@ -1348,7 +1356,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		 * V      NVA      V
 		 * |---------------|
 		 */
-		unlink_va(va, &free_vmap_area_root);
+		unlink_va(va, root);
 		kmem_cache_free(vmap_area_cachep, va);
 	} else if (type == LE_FIT_TYPE) {
 		/*
@@ -1426,8 +1434,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		augment_tree_propagate_from(va);
 
 		if (lva)	/* type == NE_FIT_TYPE */
-			insert_vmap_area_augment(lva, &va->rb_node,
-				&free_vmap_area_root, &free_vmap_area_list);
+			insert_vmap_area_augment(lva, &va->rb_node, root, head);
 	}
 
 	return 0;
@@ -1459,7 +1466,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
 		adjust_search_size = false;
 
-	va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
+	va = find_vmap_lowest_match(free_vmap_area_root.rb_node,
+			size, align, vstart, adjust_search_size);
 	if (unlikely(!va))
 		return vend;
 
@@ -1478,7 +1486,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 		return vend;
 
 	/* Update the free vmap_area. */
-	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+	ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+			va, nva_start_addr, size, type);
 	if (ret)
 		return vend;
 
@@ -1539,7 +1548,7 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 static struct vmap_area *alloc_vmap_area(unsigned long size,
 				unsigned long align,
 				unsigned long vstart, unsigned long vend,
-				int node, gfp_t gfp_mask)
+				int node, unsigned long vm_flags, gfp_t gfp_mask)
 {
 	struct vmap_area *va;
 	unsigned long freed;
@@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->va_end = addr + size;
 	va->vm = NULL;
 
-	spin_lock(&vmap_area_lock);
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
-	spin_unlock(&vmap_area_lock);
+	if (vm_flags & VM_KERNEL_EXEC) {
+		spin_lock(&free_text_area_lock);
+		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
+		/* update subtree_max_size now as we need this soon */
+		augment_tree_propagate_from(va);
+		spin_unlock(&free_text_area_lock);
+	} else {
+		spin_lock(&vmap_area_lock);
+		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+		spin_unlock(&vmap_area_lock);
+	}
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -1803,7 +1820,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
 	struct vmap_area *va;
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr);
+	va = __find_vmap_area(addr, vmap_area_root.rb_node);
 	spin_unlock(&vmap_area_lock);
 
 	return va;
@@ -1912,8 +1929,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 		return ERR_PTR(-ENOMEM);
 
 	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
-					VMALLOC_START, VMALLOC_END,
-					node, gfp_mask);
+					VMALLOC_START, VMALLOC_END,
+					node, 0, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(vb);
 		return ERR_CAST(va);
@@ -2209,8 +2226,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
 		addr = (unsigned long)mem;
 	} else {
 		struct vmap_area *va;
-		va = alloc_vmap_area(size, PAGE_SIZE,
-				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+		va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END,
+				node, 0, GFP_KERNEL);
 		if (IS_ERR(va))
 			return NULL;
 
@@ -2450,7 +2467,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (!(flags & VM_NO_GUARD))
 		size += PAGE_SIZE;
 
-	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+	va = alloc_vmap_area(size, align, start, end, node, flags, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(area);
 		return NULL;
@@ -2546,7 +2563,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 	might_sleep();
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area((unsigned long)addr);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
 	if (va && va->vm) {
 		struct vm_struct *vm = va->vm;
 
@@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc);
 
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	struct vmap_area *va, *tmp;
+	unsigned long addr;
+	enum fit_type type;
+	int ret;
+
+	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+again:
+	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
+	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
+			size, align, 1, false);
+
+	if (!tmp) {
+		unsigned long alloc_size;
+		void *ptr;
+
+		spin_unlock(&free_text_area_lock);
+
+		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
+		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
+				MODULES_END, GFP_KERNEL, PAGE_KERNEL,
+				VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
+				NUMA_NO_NODE, __builtin_return_address(0));
+		if (unlikely(!ptr)) {
+			ret = -ENOMEM;
+			goto err_out;
+		}
+		memset(ptr, 0, alloc_size);
+		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+
+		goto again;
+	}
+
+	addr = roundup(tmp->va_start, align);
+	type = classify_va_fit_type(tmp, addr, size);
+	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
+		addr = -ENOMEM;
+		goto err_out;
+	}
+
+	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
+			tmp, addr, size, type);
+	if (ret) {
+		addr = ret;
+		goto err_out;
+	}
+	spin_unlock(&free_text_area_lock);
+
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->vm = tmp->vm;
+
+	spin_lock(&vmap_area_lock);
+	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+	spin_unlock(&vmap_area_lock);
+
+	return (void *)addr;
+
+err_out:
+	spin_unlock(&free_text_area_lock);
+	return ERR_PTR(ret);
+}
+
+void vfree_exec(const void *addr)
+{
+	struct vmap_area *va;
+
+	might_sleep();
+
+	spin_lock(&vmap_area_lock);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
+	if (WARN_ON_ONCE(!va)) {
+		spin_unlock(&vmap_area_lock);
+		return;
+	}
+
+	unlink_va(va, &vmap_area_root);
+	spin_unlock(&vmap_area_lock);
+
+	spin_lock(&free_text_area_lock);
+	merge_or_add_vmap_area_augment(va,
+		&free_text_area_root, &free_text_area_list);
+	spin_unlock(&free_text_area_lock);
+	/* TODO: when the whole vm_struct is not in use, free it */
+}
+
 /**
  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
  * @size: allocation size
@@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 			/* It is a BUG(), but trigger recovery instead. */
 			goto recovery;
 
-		ret = adjust_va_to_fit_type(va, start, size, type);
+		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+					va, start, size, type);
 		if (unlikely(ret))
 			goto recovery;
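To illustrate how the proposed API is meant to be consumed: vmalloc_exec() hands back memory that is already mapped RO+X, so a caller cannot memcpy() into it and has to write instructions through a text-poking primitive (Song notes that patch 3/5 of the series adds text_poke-based handling for this on x86). A hedged sketch of a caller, assuming an x86 kernel where text_poke_copy() is available; install_text() is a hypothetical function, not part of the series:

static void *install_text(const void *image, size_t len)
{
	void *dst;

	/* Carve an RO+X range out of the shared huge-page-backed pool. */
	dst = vmalloc_exec(len, PAGE_SIZE);
	if (IS_ERR_OR_NULL(dst))
		return NULL;

	/* The mapping is read-only, so write via the text_poke alias. */
	text_poke_copy(dst, image, len);

	return dst;
}

On teardown the caller would hand the range back with vfree_exec(dst), which merges it into the free_text_area tree rather than unmapping it, matching the pooling behaviour discussed above.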