
[drm-next,1/2] vmalloc: Add atomic_vmap

Message ID 20250305152555.318159-2-ryasuoka@redhat.com (mailing list archive)
State New
Series Enhance drm_panic Support for Virtio-GPU

Commit Message

Ryosuke Yasuoka March 5, 2025, 3:25 p.m. UTC
Some drivers could use vmap in drm_panic; however, vmap can sleep and
takes locks. Since drm_panic calls vmap from the panic handler, add
atomic_vmap, which performs its allocations with GFP_ATOMIC and maps the
KVA without taking locks or sleeping.
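
A minimal usage sketch (illustrative caller only; VM_MAP and PAGE_KERNEL
are picked here for illustration, not mandated by this series):

	/*
	 * Illustrative caller, not part of this series: map a framebuffer's
	 * already-allocated pages from the panic path. The page array is
	 * assumed to have been prepared by the driver beforehand.
	 */
	static void *panic_vmap_fb(struct page **pages, unsigned int npages)
	{
		/* Must not sleep or take locks while handling a panic. */
		return atomic_vmap(pages, npages, VM_MAP, PAGE_KERNEL);
	}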

Signed-off-by: Ryosuke Yasuoka <ryasuoka@redhat.com>
---
 include/linux/vmalloc.h |   2 +
 mm/internal.h           |   5 ++
 mm/vmalloc.c            | 105 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+)

Comments

Markus Elfring March 5, 2025, 5:08 p.m. UTC | #1
> Some drivers could use vmap in drm_panic; however, vmap can sleep and
> takes locks. Since drm_panic calls vmap from the panic handler, add
> atomic_vmap, which performs its allocations with GFP_ATOMIC and maps the
> KVA without taking locks or sleeping.

See also:
https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?h=v6.14-rc5#n94

Regards,
Markus
Uladzislau Rezki March 5, 2025, 5:27 p.m. UTC | #2
On Thu, Mar 06, 2025 at 12:25:53AM +0900, Ryosuke Yasuoka wrote:
> Some drivers could use vmap in drm_panic; however, vmap can sleep and
> takes locks. Since drm_panic calls vmap from the panic handler, add
> atomic_vmap, which performs its allocations with GFP_ATOMIC and maps the
> KVA without taking locks or sleeping.
> 
> Signed-off-by: Ryosuke Yasuoka <ryasuoka@redhat.com>
> ---
>  include/linux/vmalloc.h |   2 +
>  mm/internal.h           |   5 ++
>  mm/vmalloc.c            | 105 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 112 insertions(+)
> 
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 31e9ffd936e3..c7a2a9a1976d 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -190,6 +190,8 @@ void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags)
>  extern void vfree(const void *addr);
>  extern void vfree_atomic(const void *addr);
>  
> +extern void *atomic_vmap(struct page **pages, unsigned int count,
> +			 unsigned long flags, pgprot_t prot);
>  extern void *vmap(struct page **pages, unsigned int count,
>  			unsigned long flags, pgprot_t prot);
>  void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
> diff --git a/mm/internal.h b/mm/internal.h
> index 109ef30fee11..134b332bf5b9 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1278,6 +1278,11 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
>  void free_zone_device_folio(struct folio *folio);
>  int migrate_device_coherent_folio(struct folio *folio);
>  
> +struct vm_struct *atomic_get_vm_area_node(unsigned long size, unsigned long align,
> +					  unsigned long shift, unsigned long flags,
> +					  unsigned long start, unsigned long end, int node,
> +					  gfp_t gfp_mask, const void *caller);
> +
>  struct vm_struct *__get_vm_area_node(unsigned long size,
>  				     unsigned long align, unsigned long shift,
>  				     unsigned long flags, unsigned long start,
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index a6e7acebe9ad..f5c93779c60a 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1945,6 +1945,57 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm,
>  	va->vm = vm;
>  }
>  
> +static struct vmap_area *atomic_alloc_vmap_area(unsigned long size,
> +						unsigned long align,
> +						unsigned long vstart, unsigned long vend,
> +						int node, gfp_t gfp_mask,
> +						unsigned long va_flags, struct vm_struct *vm)
> +{
> +	struct vmap_node *vn;
> +	struct vmap_area *va;
> +	unsigned long addr;
> +
> +	if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
> +		return ERR_PTR(-EINVAL);
> +
> +	if (unlikely(!vmap_initialized))
> +		return ERR_PTR(-EBUSY);
> +
> +	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
> +	if (unlikely(!va))
> +		return ERR_PTR(-ENOMEM);
> +
> +	/*
> +	 * Only scan the relevant parts containing pointers to other objects
> +	 * to avoid false negatives.
> +	 */
> +	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
> +
> +	addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
> +				 size, align, vstart, vend);
> +
> +	trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
> +
> +	va->va_start = addr;
> +	va->va_end = addr + size;
> +	va->vm = NULL;
> +	va->flags = va_flags;
> +
> +	vm->addr = (void *)va->va_start;
> +	vm->size = va_size(va);
> +	va->vm = vm;
> +
> +	vn = addr_to_node(va->va_start);
> +
> +	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
> +
> +	BUG_ON(!IS_ALIGNED(va->va_start, align));
> +	BUG_ON(va->va_start < vstart);
> +	BUG_ON(va->va_end > vend);
> +
> +	return va;
> +}
> +
>  /*
>   * Allocate a region of KVA of the specified size and alignment, within the
>   * vstart and vend. If vm is passed in, the two will also be bound.
> @@ -3106,6 +3157,33 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
>  	vm->flags &= ~VM_UNINITIALIZED;
>  }
>  
> +struct vm_struct *atomic_get_vm_area_node(unsigned long size, unsigned long align,
> +					  unsigned long shift, unsigned long flags,
> +					  unsigned long start, unsigned long end, int node,
> +					  gfp_t gfp_mask, const void *caller)
> +{
> +	struct vmap_area *va;
> +	struct vm_struct *area;
> +
> +	size = ALIGN(size, 1ul << shift);
> +	if (unlikely(!size))
> +		return NULL;
> +
> +	area = kzalloc_node(sizeof(*area), gfp_mask, node);
> +	if (unlikely(!area))
> +		return NULL;
> +
> +	size += PAGE_SIZE;
> +	area->flags = flags;
> +	area->caller = caller;
> +
> +	va = atomic_alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
> +	if (IS_ERR(va))
> +		return NULL;
> +
> +	return area;
> +}
> +
>  struct vm_struct *__get_vm_area_node(unsigned long size,
>  		unsigned long align, unsigned long shift, unsigned long flags,
>  		unsigned long start, unsigned long end, int node,
> @@ -3418,6 +3496,33 @@ void vunmap(const void *addr)
>  }
>  EXPORT_SYMBOL(vunmap);
>  
> +void *atomic_vmap(struct page **pages, unsigned int count,
> +		  unsigned long flags, pgprot_t prot)
> +{
> +	struct vm_struct *area;
> +	unsigned long addr;
> +	unsigned long size;		/* In bytes */
> +
> +	if (count > totalram_pages())
> +		return NULL;
> +
> +	size = (unsigned long)count << PAGE_SHIFT;
> +	area = atomic_get_vm_area_node(size, 1, PAGE_SHIFT, flags,
> +				       VMALLOC_START, VMALLOC_END,
> +				       NUMA_NO_NODE, GFP_ATOMIC,
> +				       __builtin_return_address(0));
> +	if (!area)
> +		return NULL;
> +
> +	addr = (unsigned long)area->addr;
> +	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
> +			     pages, PAGE_SHIFT) < 0) {
> +		return NULL;
> +	}
> +
> +	return area->addr;
> +}
> +
>  /**
>   * vmap - map an array of pages into virtually contiguous space
>   * @pages: array of page pointers
> -- 
> 2.48.1
> 
It is copy-pasted code, which makes it odd. The proposal is not a way
forward for me. Unfortunately, vmalloc is not compatible with GFP_ATOMIC;
there is at least one place, the allocation of page-table entries, where
GFP_KERNEL is hard-coded.
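
Concretely, the mapping step itself may need to allocate page tables, and
that path uses a GFP_KERNEL-based mask (call chain simplified; exact
helper names may vary between kernel versions):

	/*
	 * vmap_pages_range()
	 *   -> ... -> vmap_pages_pte_range()
	 *        -> pte_alloc_kernel_track()    allocates a missing PTE page
	 *             -> pte_alloc_one_kernel() uses GFP_PGTABLE_KERNEL (GFP_KERNEL-based)
	 */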

Doing this without locks and synchronization is not possible.

--
Uladzislau Rezki
Matthew Wilcox March 6, 2025, 4:52 a.m. UTC | #3
On Thu, Mar 06, 2025 at 12:25:53AM +0900, Ryosuke Yasuoka wrote:
> Some drivers could use vmap in drm_panic; however, vmap can sleep and
> takes locks. Since drm_panic calls vmap from the panic handler, add
> atomic_vmap, which performs its allocations with GFP_ATOMIC and maps the
> KVA without taking locks or sleeping.

In addition to the implicit GFP_KERNEL allocations Vlad mentioned, how
is this supposed to work?

> +	vn = addr_to_node(va->va_start);
> +
> +	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);

If someone else is holding the vn->busy.lock because they're modifying the
busy tree, you'll corrupt the tree.  You can't just say "I can't take a
lock here, so I won't bother".  You need to figure out how to do something
safe without taking the lock.  For example, you could preallocate the
page tables and reserve a vmap area when the driver loads that would
then be usable for the panic situation.  I don't know that we have APIs
to let you do that today, but it's something that could be added.
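
Something like the following, purely as a sketch -- neither function
exists today, the names are invented only to show how the work would be
split between driver load and the panic handler:

	/* Driver load, sleepable context: reserve KVA and pre-populate the
	 * page tables covering it. (hypothetical API) */
	struct vmap_reservation *res = vmap_reserve(fb_size);

	/* Panic handler, atomic context: nothing but PTE writes into the
	 * already-reserved, already-populated range. (hypothetical API) */
	void *vaddr = vmap_reserved(res, pages, npages, PAGE_KERNEL);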
Jocelyn Falempe March 6, 2025, 1:24 p.m. UTC | #4
On 06/03/2025 05:52, Matthew Wilcox wrote:
> On Thu, Mar 06, 2025 at 12:25:53AM +0900, Ryosuke Yasuoka wrote:
>> Some drivers could use vmap in drm_panic; however, vmap can sleep and
>> takes locks. Since drm_panic calls vmap from the panic handler, add
>> atomic_vmap, which performs its allocations with GFP_ATOMIC and maps the
>> KVA without taking locks or sleeping.
> 
> In addition to the implicit GFP_KERNEL allocations Vlad mentioned, how
> is this supposed to work?
> 
>> +	vn = addr_to_node(va->va_start);
>> +
>> +	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
> 
> If someone else is holding the vn->busy.lock because they're modifying the
> busy tree, you'll corrupt the tree.  You can't just say "I can't take a
> lock here, so I won't bother".  You need to figure out how to do something
> safe without taking the lock.  For example, you could preallocate the
> page tables and reserve a vmap area when the driver loads that would
> then be usable for the panic situation.  I don't know that we have APIs
> to let you do that today, but it's something that could be added.
> 
Regarding the lock, it should be possible to use the trylock() variant
and fail if the lock is already taken. (In the panic handler, only one
CPU remains active, so it's unlikely the lock would be released anyway.)

If we need to pre-allocate the page tables and reserve the vmap area,
maybe it would be easier to just always vmap() the primary framebuffer,
so it can be used in the panic handler?
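
A rough sketch of that second option, using the existing vmap() (the
virtio-gpu field and function names below are made up for illustration):

	/* At framebuffer creation (sleepable): keep a long-lived mapping around. */
	vgfb->panic_vaddr = vmap(vgfb->pages, vgfb->num_pages, VM_MAP, PAGE_KERNEL);

	/* In the panic handler (atomic): reuse it, no allocation or locking needed. */
	if (vgfb->panic_vaddr)
		draw_panic_screen(vgfb->panic_vaddr);	/* illustrative */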

Best regards,

Patch

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 31e9ffd936e3..c7a2a9a1976d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -190,6 +190,8 @@  void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags)
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
 
+extern void *atomic_vmap(struct page **pages, unsigned int count,
+			 unsigned long flags, pgprot_t prot);
 extern void *vmap(struct page **pages, unsigned int count,
 			unsigned long flags, pgprot_t prot);
 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
diff --git a/mm/internal.h b/mm/internal.h
index 109ef30fee11..134b332bf5b9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1278,6 +1278,11 @@  int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
 void free_zone_device_folio(struct folio *folio);
 int migrate_device_coherent_folio(struct folio *folio);
 
+struct vm_struct *atomic_get_vm_area_node(unsigned long size, unsigned long align,
+					  unsigned long shift, unsigned long flags,
+					  unsigned long start, unsigned long end, int node,
+					  gfp_t gfp_mask, const void *caller);
+
 struct vm_struct *__get_vm_area_node(unsigned long size,
 				     unsigned long align, unsigned long shift,
 				     unsigned long flags, unsigned long start,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a6e7acebe9ad..f5c93779c60a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1945,6 +1945,57 @@  static inline void setup_vmalloc_vm(struct vm_struct *vm,
 	va->vm = vm;
 }
 
+static struct vmap_area *atomic_alloc_vmap_area(unsigned long size,
+						unsigned long align,
+						unsigned long vstart, unsigned long vend,
+						int node, gfp_t gfp_mask,
+						unsigned long va_flags, struct vm_struct *vm)
+{
+	struct vmap_node *vn;
+	struct vmap_area *va;
+	unsigned long addr;
+
+	if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
+		return ERR_PTR(-EINVAL);
+
+	if (unlikely(!vmap_initialized))
+		return ERR_PTR(-EBUSY);
+
+	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Only scan the relevant parts containing pointers to other objects
+	 * to avoid false negatives.
+	 */
+	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
+
+	addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
+				 size, align, vstart, vend);
+
+	trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);
+
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->vm = NULL;
+	va->flags = va_flags;
+
+	vm->addr = (void *)va->va_start;
+	vm->size = va_size(va);
+	va->vm = vm;
+
+	vn = addr_to_node(va->va_start);
+
+	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+
+	BUG_ON(!IS_ALIGNED(va->va_start, align));
+	BUG_ON(va->va_start < vstart);
+	BUG_ON(va->va_end > vend);
+
+	return va;
+}
+
 /*
  * Allocate a region of KVA of the specified size and alignment, within the
  * vstart and vend. If vm is passed in, the two will also be bound.
@@ -3106,6 +3157,33 @@  static void clear_vm_uninitialized_flag(struct vm_struct *vm)
 	vm->flags &= ~VM_UNINITIALIZED;
 }
 
+struct vm_struct *atomic_get_vm_area_node(unsigned long size, unsigned long align,
+					  unsigned long shift, unsigned long flags,
+					  unsigned long start, unsigned long end, int node,
+					  gfp_t gfp_mask, const void *caller)
+{
+	struct vmap_area *va;
+	struct vm_struct *area;
+
+	size = ALIGN(size, 1ul << shift);
+	if (unlikely(!size))
+		return NULL;
+
+	area = kzalloc_node(sizeof(*area), gfp_mask, node);
+	if (unlikely(!area))
+		return NULL;
+
+	size += PAGE_SIZE;
+	area->flags = flags;
+	area->caller = caller;
+
+	va = atomic_alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
+	if (IS_ERR(va))
+		return NULL;
+
+	return area;
+}
+
 struct vm_struct *__get_vm_area_node(unsigned long size,
 		unsigned long align, unsigned long shift, unsigned long flags,
 		unsigned long start, unsigned long end, int node,
@@ -3418,6 +3496,33 @@  void vunmap(const void *addr)
 }
 EXPORT_SYMBOL(vunmap);
 
+void *atomic_vmap(struct page **pages, unsigned int count,
+		  unsigned long flags, pgprot_t prot)
+{
+	struct vm_struct *area;
+	unsigned long addr;
+	unsigned long size;		/* In bytes */
+
+	if (count > totalram_pages())
+		return NULL;
+
+	size = (unsigned long)count << PAGE_SHIFT;
+	area = atomic_get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+				       VMALLOC_START, VMALLOC_END,
+				       NUMA_NO_NODE, GFP_ATOMIC,
+				       __builtin_return_address(0));
+	if (!area)
+		return NULL;
+
+	addr = (unsigned long)area->addr;
+	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+			     pages, PAGE_SHIFT) < 0) {
+		return NULL;
+	}
+
+	return area->addr;
+}
+
 /**
  * vmap - map an array of pages into virtually contiguous space
  * @pages: array of page pointers