[RFC,v2,1/1] mm/vmalloc: Introduce vmap_file()

Message ID 20250328211349.845857-2-vishal.moola@gmail.com
State New
Series Introduce vmap_file()

Commit Message

Vishal Moola (Oracle) March 28, 2025, 9:13 p.m. UTC
vmap_file() is effectively an in-kernel equivalent to calling mmap()
on a file. A caller passes in a file's mapping, and vmap_file() maps
the specified portion of that file directly into kernel virtual
address space.
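
For illustration, a caller might look like the following hypothetical
helper (not part of this patch; it assumes the folios backing the range
are already present, uptodate, and held for the lifetime of the mapping):

	void *map_file_range(struct file *file, loff_t start, loff_t end)
	{
		/* Folios for [start, end] must stay present and uptodate. */
		return vmap_file(file->f_mapping, start, end,
				 VM_MAP, PAGE_KERNEL);
	}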

Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
---
 include/linux/vmalloc.h |   2 +
 mm/vmalloc.c            | 120 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)
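
A note on alignment (per the kernel-doc below): a non-PAGE_ALIGNED
@start is rounded down and @end rounded up. With 4K pages, for example,
vmap_file(mapping, 100, 8192, VM_MAP, PAGE_KERNEL) maps file pages 0
through 2 (bytes 0-12287) and returns a PAGE_ALIGNED address.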

Patch

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 31e9ffd936e3..d5420985865f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -192,6 +192,8 @@  extern void vfree_atomic(const void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
 			unsigned long flags, pgprot_t prot);
+void *vmap_file(struct address_space *mapping, loff_t start, loff_t end,
+			unsigned long flags, pgprot_t prot);
 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot);
 extern void vunmap(const void *addr);
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3ed720a787ec..b94489032ab5 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3475,6 +3475,126 @@  void *vmap(struct page **pages, unsigned int count,
 }
 EXPORT_SYMBOL(vmap);
 
+/**
+ * vmap_file - map a range of a file's folios to virtually contiguous space.
+ * @mapping: The address space to map.
+ * @start: The starting byte.
+ * @end: The final byte to map.
+ * @flags: vm_area->flags.
+ * @prot: page protection for the mapping.
+ *
+ * Maps a file into contiguous kernel virtual space. The caller is expected
+ * to ensure that the folios caching the file are present and uptodate. The
+ * folios must remain so until the file is unmapped.
+ *
+ * If @start or @end are not PAGE_ALIGNED, vmap_file() will round
+ * @start down and @end up to encompass the desired pages. The
+ * address returned is always PAGE_ALIGNED.
+ *
+ * Return: the address of the area or %NULL on failure.
+ */
+void *vmap_file(struct address_space *mapping, loff_t start, loff_t end,
+		unsigned long flags, pgprot_t prot)
+{
+	struct vm_struct *area;
+	struct folio *folio;
+	unsigned long addr, end_addr;
+	const pgoff_t first = start >> PAGE_SHIFT;
+	const pgoff_t last = end >> PAGE_SHIFT;
+	XA_STATE(xas, &mapping->i_pages, first);
+	unsigned long size = (last - first + 1) << PAGE_SHIFT;
+
+	if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
+		return NULL;
+
+	/*
+	 * Your top guard is someone else's bottom guard. Not having a top
+	 * guard compromises someone else's mappings too.
+	 */
+	if (WARN_ON_ONCE(flags & VM_NO_GUARD))
+		flags &= ~VM_NO_GUARD;
+
+	area = get_vm_area_caller(size, flags, __builtin_return_address(0));
+	if (!area)
+		return NULL;
+
+	addr = (unsigned long)area->addr;
+	end_addr = addr + size;
+
+	rcu_read_lock();
+	xas_for_each(&xas, folio, last) {
+		phys_addr_t map_start;
+		int map_size, err;
+		bool pmd_bound, is_first_map;
+
+		if (xas_retry(&xas, folio))
+			continue;
+		if (xa_is_value(folio) || !folio_test_uptodate(folio))
+			goto fail;
+
+		is_first_map = (addr == (unsigned long)area->addr);
+		map_start = folio_pfn(folio) << PAGE_SHIFT;
+		map_size = folio_size(folio);
+
+		/*
+		 * We can unconditionally calculate values for the first
+		 * folio. This lets us handle skipping pages in the first
+		 * folio without verifying addresses every iteration.
+		 */
+		if (is_first_map) {
+			map_size -= (first - folio->index) << PAGE_SHIFT;
+			map_start += (first - folio->index) << PAGE_SHIFT;
+		}
+
+		if (addr + map_size > end_addr)
+			map_size = end_addr - addr;
+
+		/*
+		 * Drop the RCU read lock whenever this mapping may need a
+		 * new page table allocated: on the first mapping, when
+		 * @addr starts a new PMD, or when the mapping crosses a
+		 * PMD boundary.
+		 */
+		pmd_bound = is_first_map ||
+			IS_ALIGNED(addr, PMD_SIZE) ||
+			((addr & PMD_MASK) !=
+			((addr + map_size) & PMD_MASK));
+
+		if (pmd_bound) {
+			xas_pause(&xas);
+			rcu_read_unlock();
+		}
+
+		err = vmap_range_noflush(addr, addr + map_size,
+				map_start, prot, PAGE_SHIFT);
+
+		if (pmd_bound)
+			rcu_read_lock();
+
+		if (err)
+			goto fail;
+
+		addr += map_size;
+	}
+	rcu_read_unlock();
+
+	/* xas_for_each() skips holes, which would leave a partial mapping. */
+	if (addr != end_addr) {
+		vunmap(area->addr);
+		return NULL;
+	}
+
+	flush_cache_vmap((unsigned long)area->addr, end_addr);
+	return area->addr;
+
+fail:
+	rcu_read_unlock();
+	/* vunmap() may sleep; tear down only after dropping RCU. */
+	vunmap(area->addr);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(vmap_file);
+
 #ifdef CONFIG_VMAP_PFN
 struct vmap_pfn_data {
 	unsigned long	*pfns;