Message ID | 20250305152555.318159-2-ryasuoka@redhat.com (mailing list archive)
---|---
State | New
Series | Enhance drm_panic Support for Virtio-GPU
> Some drivers can use vmap in drm_panic, however, vmap is sleepable and
> takes locks. Since drm_panic will vmap in panic handler, atomic_vmap
> requests pages with GFP_ATOMIC and maps KVA without locks and sleep.

See also:
https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.14-rc5#n94

Regards,
Markus
On Thu, Mar 06, 2025 at 12:25:53AM +0900, Ryosuke Yasuoka wrote:
> Some drivers can use vmap in drm_panic, however, vmap is sleepable and
> takes locks. Since drm_panic will vmap in panic handler, atomic_vmap
> requests pages with GFP_ATOMIC and maps KVA without locks and sleep.
>
> Signed-off-by: Ryosuke Yasuoka <ryasuoka@redhat.com>
> ---
>  include/linux/vmalloc.h |   2 +
>  mm/internal.h           |   5 ++
>  mm/vmalloc.c            | 105 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 112 insertions(+)

[...]

It is copy-paste code, so it is odd. The proposal is not a way forward
for me. Unfortunately vmalloc is not compatible with GFP_ATOMIC; there
is at least one place, the allocation of page-table entries, where the
flags are hard-coded to GFP_KERNEL. Doing this without locks and
synchronization is not possible.

--
Uladzislau Rezki
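For context on the hard-coded GFP_KERNEL mentioned above: kernel page tables are allocated with a fixed GFP mask (GFP_PGTABLE_KERNEL, defined in include/asm-generic/pgalloc.h as GFP_KERNEL | __GFP_ZERO), so the GFP_ATOMIC passed into atomic_vmap() never reaches that allocation. The following is a simplified, paraphrased sketch of the kind of allocation Uladzislau is referring to, not verbatim kernel source:

#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Paraphrased sketch only: the generic kernel page-table allocator uses a
 * fixed GFP_KERNEL-based mask.  A mapping request made with GFP_ATOMIC can
 * therefore still sleep or take locks whenever the range being mapped needs
 * a new page-table page.
 */
static pte_t *pte_table_alloc_sketch(void)
{
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	return page ? (pte_t *)page_address(page) : NULL;
}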
On Thu, Mar 06, 2025 at 12:25:53AM +0900, Ryosuke Yasuoka wrote:
> Some drivers can use vmap in drm_panic, however, vmap is sleepable and
> takes locks. Since drm_panic will vmap in panic handler, atomic_vmap
> requests pages with GFP_ATOMIC and maps KVA without locks and sleep.

In addition to the implicit GFP_KERNEL allocations Vlad mentioned, how
is this supposed to work?

> +	vn = addr_to_node(va->va_start);
> +
> +	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);

If someone else is holding the vn->busy.lock because they're modifying
the busy tree, you'll corrupt the tree.  You can't just say "I can't
take a lock here, so I won't bother".  You need to figure out how to do
something safe without taking the lock.

For example, you could preallocate the page tables and reserve a vmap
area when the driver loads that would then be usable for the panic
situation.  I don't know that we have APIs to let you do that today,
but it's something that could be added.
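To make the suggestion above concrete, here is a minimal sketch of the "do the sleepable work at driver load" idea using only existing APIs (vmap()/vunmap()). It assumes the framebuffer pages are already known at probe time; all names are hypothetical, and this is not the reserve-KVA-plus-preallocate-page-tables API Matthew describes, which does not exist today.

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Hypothetical driver state: a kernel mapping created ahead of time. */
static void *panic_vaddr;

/* May sleep and take locks; call from probe/modeset, never from panic. */
static int mydrv_prepare_panic_mapping(struct page **pages, unsigned int count)
{
	panic_vaddr = vmap(pages, count, VM_MAP, PAGE_KERNEL);
	return panic_vaddr ? 0 : -ENOMEM;
}

/* Safe from the panic handler: no allocation, no locking, just a lookup. */
static void *mydrv_panic_vaddr(void)
{
	return panic_vaddr;
}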
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 31e9ffd936e3..c7a2a9a1976d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -190,6 +190,8 @@ void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags)
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
 
+extern void *atomic_vmap(struct page **pages, unsigned int count,
+			 unsigned long flags, pgprot_t prot);
 extern void *vmap(struct page **pages, unsigned int count,
 		  unsigned long flags, pgprot_t prot);
 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
diff --git a/mm/internal.h b/mm/internal.h
index 109ef30fee11..134b332bf5b9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1278,6 +1278,11 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
 void free_zone_device_folio(struct folio *folio);
 int migrate_device_coherent_folio(struct folio *folio);
 
+struct vm_struct *atomic_get_vm_area_node(unsigned long size, unsigned long align,
+					  unsigned long shift, unsigned long flags,
+					  unsigned long start, unsigned long end, int node,
+					  gfp_t gfp_mask, const void *caller);
+
 struct vm_struct *__get_vm_area_node(unsigned long size,
 				     unsigned long align, unsigned long shift,
 				     unsigned long flags, unsigned long start,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a6e7acebe9ad..f5c93779c60a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1945,6 +1945,57 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm,
 	va->vm = vm;
 }
 
+static struct vmap_area *atomic_alloc_vmap_area(unsigned long size,
+				unsigned long align,
+				unsigned long vstart, unsigned long vend,
+				int node, gfp_t gfp_mask,
+				unsigned long va_flags, struct vm_struct *vm)
+{
+	struct vmap_node *vn;
+	struct vmap_area *va;
+	unsigned long addr;
+
+	if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
+		return ERR_PTR(-EINVAL);
+
+	if (unlikely(!vmap_initialized))
+		return ERR_PTR(-EBUSY);
+
+	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Only scan the relevant parts containing pointers to other objects
+	 * to avoid false negatives.
+	 */
+	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
+
+	addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
+				 size, align, vstart, vend);
+
+	trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->vm = NULL;
+	va->flags = va_flags;
+
+	vm->addr = (void *)va->va_start;
+	vm->size = va_size(va);
+	va->vm = vm;
+
+	vn = addr_to_node(va->va_start);
+
+	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+
+	BUG_ON(!IS_ALIGNED(va->va_start, align));
+	BUG_ON(va->va_start < vstart);
+	BUG_ON(va->va_end > vend);
+
+	return va;
+}
+
 /*
  * Allocate a region of KVA of the specified size and alignment, within the
  * vstart and vend. If vm is passed in, the two will also be bound.
@@ -3106,6 +3157,33 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
 	vm->flags &= ~VM_UNINITIALIZED;
 }
 
+struct vm_struct *atomic_get_vm_area_node(unsigned long size, unsigned long align,
+					  unsigned long shift, unsigned long flags,
+					  unsigned long start, unsigned long end, int node,
+					  gfp_t gfp_mask, const void *caller)
+{
+	struct vmap_area *va;
+	struct vm_struct *area;
+
+	size = ALIGN(size, 1ul << shift);
+	if (unlikely(!size))
+		return NULL;
+
+	area = kzalloc_node(sizeof(*area), gfp_mask, node);
+	if (unlikely(!area))
+		return NULL;
+
+	size += PAGE_SIZE;
+	area->flags = flags;
+	area->caller = caller;
+
+	va = atomic_alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
+	if (IS_ERR(va))
+		return NULL;
+
+	return area;
+}
+
 struct vm_struct *__get_vm_area_node(unsigned long size,
 		unsigned long align, unsigned long shift, unsigned long flags,
 		unsigned long start, unsigned long end, int node,
@@ -3418,6 +3496,33 @@ void vunmap(const void *addr)
 }
 EXPORT_SYMBOL(vunmap);
 
+void *atomic_vmap(struct page **pages, unsigned int count,
+		  unsigned long flags, pgprot_t prot)
+{
+	struct vm_struct *area;
+	unsigned long addr;
+	unsigned long size;		/* In bytes */
+
+	if (count > totalram_pages())
+		return NULL;
+
+	size = (unsigned long)count << PAGE_SHIFT;
+	area = atomic_get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+				       VMALLOC_START, VMALLOC_END,
+				       NUMA_NO_NODE, GFP_ATOMIC,
+				       __builtin_return_address(0));
+	if (!area)
+		return NULL;
+
+	addr = (unsigned long)area->addr;
+	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+			     pages, PAGE_SHIFT) < 0) {
+		return NULL;
+	}
+
+	return area->addr;
+}
+
 /**
  * vmap - map an array of pages into virtually contiguous space
  * @pages: array of page pointers
Some drivers can use vmap in drm_panic, however, vmap is sleepable and
takes locks. Since drm_panic will vmap in panic handler, atomic_vmap
requests pages with GFP_ATOMIC and maps KVA without locks and sleep.

Signed-off-by: Ryosuke Yasuoka <ryasuoka@redhat.com>
---
 include/linux/vmalloc.h |   2 +
 mm/internal.h           |   5 ++
 mm/vmalloc.c            | 105 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+)
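Since the series adds no caller in this patch, the following is a purely illustrative sketch, with hypothetical names that are not part of this series, of how a driver's panic path might use the proposed atomic_vmap():

#include <linux/mm.h>
#include <linux/vmalloc.h>

/*
 * Hypothetical illustration: map a framebuffer's pages from a panic
 * handler with the proposed atomic_vmap().  "fb_pages" and
 * "fb_page_count" stand in for driver-specific state.
 */
static void *panic_map_framebuffer(struct page **fb_pages,
				   unsigned int fb_page_count)
{
	/* Panic context: no sleeping or lock-taking, hence atomic_vmap(). */
	return atomic_vmap(fb_pages, fb_page_count, VM_MAP, PAGE_KERNEL);
}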