diff mbox series

[RFC,v2,23/30] rust: fs: allow file systems backed by a block device

Message ID 20240514131711.379322-24-wedsonaf@gmail.com (mailing list archive)
State New, archived
Headers show
Series Rust abstractions for VFS | expand

Commit Message

Wedson Almeida Filho May 14, 2024, 1:17 p.m. UTC
From: Wedson Almeida Filho <walmeida@microsoft.com>

Allow Rust file systems that are backed by block devices (in addition to
in-memory ones).

Signed-off-by: Wedson Almeida Filho <walmeida@microsoft.com>
---
 rust/helpers.c            |  14 +++
 rust/kernel/block.rs      |   1 -
 rust/kernel/fs.rs         |  60 ++++++++---
 rust/kernel/fs/inode.rs   | 221 +++++++++++++++++++++++++++++++++++++-
 rust/kernel/fs/sb.rs      |  49 ++++++++-
 samples/rust/rust_rofs.rs |   2 +-
 6 files changed, 328 insertions(+), 19 deletions(-)
diff mbox series

Patch

diff --git a/rust/helpers.c b/rust/helpers.c
index 360a1d38ac19..6c6d18df055f 100644
--- a/rust/helpers.c
+++ b/rust/helpers.c
@@ -21,6 +21,7 @@ 
  */
 
 #include <kunit/test-bug.h>
+#include <linux/blkdev.h>
 #include <linux/bug.h>
 #include <linux/build_bug.h>
 #include <linux/cacheflush.h>
@@ -258,6 +259,13 @@  void rust_helper_kunmap_local(const void *vaddr)
 }
 EXPORT_SYMBOL_GPL(rust_helper_kunmap_local);
 
+struct folio *rust_helper_read_mapping_folio(struct address_space *mapping,
+					     pgoff_t index, struct file *file)
+{
+	return read_mapping_folio(mapping, index, file);
+}
+EXPORT_SYMBOL_GPL(rust_helper_read_mapping_folio);
+
 void rust_helper_i_uid_write(struct inode *inode, uid_t uid)
 {
 	i_uid_write(inode, uid);
@@ -294,6 +302,12 @@  unsigned int rust_helper_MKDEV(unsigned int major, unsigned int minor)
 }
 EXPORT_SYMBOL_GPL(rust_helper_MKDEV);
 
+sector_t rust_helper_bdev_nr_sectors(struct block_device *bdev)
+{
+	return bdev_nr_sectors(bdev);
+}
+EXPORT_SYMBOL_GPL(rust_helper_bdev_nr_sectors);
+
 unsigned long rust_helper_copy_to_user(void __user *to, const void *from,
 				       unsigned long n)
 {
diff --git a/rust/kernel/block.rs b/rust/kernel/block.rs
index 868623d7c873..4d669bd5dce9 100644
--- a/rust/kernel/block.rs
+++ b/rust/kernel/block.rs
@@ -31,7 +31,6 @@  impl Device {
     ///
     /// Callers must ensure that `ptr` is valid and remains so for the lifetime of the returned
     /// object.
-    #[allow(dead_code)]
     pub(crate) unsafe fn from_raw<'a>(ptr: *mut bindings::block_device) -> &'a Self {
         // SAFETY: The safety requirements guarantee that the cast below is ok.
         unsafe { &*ptr.cast::<Self>() }
diff --git a/rust/kernel/fs.rs b/rust/kernel/fs.rs
index 387e87e3edaf..864aca24d12c 100644
--- a/rust/kernel/fs.rs
+++ b/rust/kernel/fs.rs
@@ -26,6 +26,11 @@ 
 /// This is C's `loff_t`.
 pub type Offset = i64;
 
+/// An index into the page cache.
+///
+/// This is C's `pgoff_t`.
+pub type PageOffset = usize;
+
 /// Maximum size of an inode.
 pub const MAX_LFS_FILESIZE: Offset = bindings::MAX_LFS_FILESIZE;
 
@@ -37,6 +42,9 @@  pub trait FileSystem {
     /// The name of the file system type.
     const NAME: &'static CStr;
 
+    /// Determines how superblocks for this file system type are keyed.
+    const SUPER_TYPE: sb::Type = sb::Type::Independent;
+
     /// Determines if an implementation doesn't specify the required types.
     ///
     /// This is meant for internal use only.
@@ -44,7 +52,10 @@  pub trait FileSystem {
     const IS_UNSPECIFIED: bool = false;
 
     /// Initialises the new superblock and returns the data to attach to it.
-    fn fill_super(sb: &mut SuperBlock<Self, sb::New>) -> Result<Self::Data>;
+    fn fill_super(
+        sb: &mut SuperBlock<Self, sb::New>,
+        mapper: Option<inode::Mapper>,
+    ) -> Result<Self::Data>;
 
     /// Initialises and returns the root inode of the given superblock.
     ///
@@ -100,7 +111,7 @@  impl FileSystem for UnspecifiedFS {
     type Data = ();
     const NAME: &'static CStr = crate::c_str!("unspecified");
     const IS_UNSPECIFIED: bool = true;
-    fn fill_super(_: &mut SuperBlock<Self, sb::New>) -> Result {
+    fn fill_super(_: &mut SuperBlock<Self, sb::New>, _: Option<inode::Mapper>) -> Result {
         Err(ENOTSUPP)
     }
 
@@ -139,7 +150,9 @@  pub fn new<T: FileSystem + ?Sized>(module: &'static ThisModule) -> impl PinInit<
                 fs.name = T::NAME.as_char_ptr();
                 fs.init_fs_context = Some(Self::init_fs_context_callback::<T>);
                 fs.kill_sb = Some(Self::kill_sb_callback::<T>);
-                fs.fs_flags = 0;
+                fs.fs_flags = if let sb::Type::BlockDev = T::SUPER_TYPE {
+                    bindings::FS_REQUIRES_DEV as i32
+                } else { 0 };
 
                 // SAFETY: Pointers stored in `fs` are static so will live for as long as the
                 // registration is active (it is undone in `drop`).
@@ -162,9 +175,16 @@  pub fn new<T: FileSystem + ?Sized>(module: &'static ThisModule) -> impl PinInit<
     unsafe extern "C" fn kill_sb_callback<T: FileSystem + ?Sized>(
         sb_ptr: *mut bindings::super_block,
     ) {
-        // SAFETY: In `get_tree_callback` we always call `get_tree_nodev`, so `kill_anon_super` is
-        // the appropriate function to call for cleanup.
-        unsafe { bindings::kill_anon_super(sb_ptr) };
+        match T::SUPER_TYPE {
+            // SAFETY: In `get_tree_callback` we always call `get_tree_bdev` for
+            // `sb::Type::BlockDev`, so `kill_block_super` is the appropriate function to call
+            // for cleanup.
+            sb::Type::BlockDev => unsafe { bindings::kill_block_super(sb_ptr) },
+            // SAFETY: In `get_tree_callback` we always call `get_tree_nodev` for
+            // `sb::Type::Independent`, so `kill_anon_super` is the appropriate function to call
+            // for cleanup.
+            sb::Type::Independent => unsafe { bindings::kill_anon_super(sb_ptr) },
+        }
 
         // SAFETY: The C API contract guarantees that `sb_ptr` is valid for read.
         let ptr = unsafe { (*sb_ptr).s_fs_info };
@@ -200,9 +220,18 @@  impl<T: FileSystem + ?Sized> Tables<T> {
     };
 
     unsafe extern "C" fn get_tree_callback(fc: *mut bindings::fs_context) -> ffi::c_int {
-        // SAFETY: `fc` is valid per the callback contract. `fill_super_callback` also has
-        // the right type and is a valid callback.
-        unsafe { bindings::get_tree_nodev(fc, Some(Self::fill_super_callback)) }
+        match T::SUPER_TYPE {
+            // SAFETY: `fc` is valid per the callback contract. `fill_super_callback` also has
+            // the right type and is a valid callback.
+            sb::Type::BlockDev => unsafe {
+                bindings::get_tree_bdev(fc, Some(Self::fill_super_callback))
+            },
+            // SAFETY: `fc` is valid per the callback contract. `fill_super_callback` also has
+            // the right type and is a valid callback.
+            sb::Type::Independent => unsafe {
+                bindings::get_tree_nodev(fc, Some(Self::fill_super_callback))
+            },
+        }
     }
 
     unsafe extern "C" fn fill_super_callback(
@@ -221,7 +250,14 @@  impl<T: FileSystem + ?Sized> Tables<T> {
             sb.s_xattr = &Tables::<T>::XATTR_HANDLERS[0];
             sb.s_flags |= bindings::SB_RDONLY;
 
-            let data = T::fill_super(new_sb)?;
+            let mapper = if matches!(T::SUPER_TYPE, sb::Type::BlockDev) {
+                // SAFETY: This is the only mapper created for this inode, so it is unique.
+                Some(unsafe { new_sb.bdev().inode().mapper() })
+            } else {
+                None
+            };
+
+            let data = T::fill_super(new_sb, mapper)?;
 
             // N.B.: Even on failure, `kill_sb` is called and frees the data.
             sb.s_fs_info = data.into_foreign().cast_mut();
@@ -369,7 +405,7 @@  fn init(module: &'static ThisModule) -> impl PinInit<Self, Error> {
 ///
 /// ```
 /// # mod module_fs_sample {
-/// use kernel::fs::{dentry, inode::INode, sb, sb::SuperBlock, self};
+/// use kernel::fs::{dentry, inode::INode, inode::Mapper, sb, sb::SuperBlock, self};
 /// use kernel::prelude::*;
 ///
 /// kernel::module_fs! {
@@ -384,7 +420,7 @@  fn init(module: &'static ThisModule) -> impl PinInit<Self, Error> {
 /// impl fs::FileSystem for MyFs {
 ///     type Data = ();
 ///     const NAME: &'static CStr = kernel::c_str!("myfs");
-///     fn fill_super(_: &mut SuperBlock<Self, sb::New>) -> Result {
+///     fn fill_super(_: &mut SuperBlock<Self, sb::New>, _: Option<Mapper>) -> Result {
 ///         todo!()
 ///     }
 ///     fn init_root(_sb: &SuperBlock<Self>) -> Result<dentry::Root<Self>> {
diff --git a/rust/kernel/fs/inode.rs b/rust/kernel/fs/inode.rs
index 75b68d697a6e..5b3602362521 100644
--- a/rust/kernel/fs/inode.rs
+++ b/rust/kernel/fs/inode.rs
@@ -7,13 +7,16 @@ 
 //! C headers: [`include/linux/fs.h`](srctree/include/linux/fs.h)
 
 use super::{
-    address_space, dentry, dentry::DEntry, file, sb::SuperBlock, FileSystem, Offset, UnspecifiedFS,
+    address_space, dentry, dentry::DEntry, file, sb::SuperBlock, FileSystem, Offset, PageOffset,
+    UnspecifiedFS,
 };
-use crate::error::{code::*, Result};
+use crate::error::{code::*, from_err_ptr, Result};
 use crate::types::{ARef, AlwaysRefCounted, Either, ForeignOwnable, Lockable, Locked, Opaque};
-use crate::{bindings, block, str::CStr, str::CString, time::Timespec};
+use crate::{
+    bindings, block, build_error, folio, folio::Folio, str::CStr, str::CString, time::Timespec,
+};
 use core::mem::ManuallyDrop;
-use core::{marker::PhantomData, ptr};
+use core::{cmp, marker::PhantomData, ops::Deref, ptr};
 use macros::vtable;
 
 /// The number of an inode.
@@ -93,6 +96,129 @@  pub fn size(&self) -> Offset {
         // SAFETY: `self` is guaranteed to be valid by the existence of a shared reference.
         unsafe { bindings::i_size_read(self.0.get()) }
     }
+
+    /// Returns a mapper for this inode.
+    ///
+    /// # Safety
+    ///
+    /// Callers must ensure that mappers are unique for a given inode and range. For inodes that
+    /// back a block device, a mapper is always created when the filesystem is mounted; so callers
+    /// in such situations must ensure that that mapper is never used.
+    pub unsafe fn mapper(&self) -> Mapper<T> {
+        Mapper {
+            inode: self.into(),
+            begin: 0,
+            end: Offset::MAX,
+        }
+    }
+
+    /// Returns a mapped folio at the given offset.
+    ///
+    /// # Safety
+    ///
+    /// Callers must ensure that there are no concurrent mutable mappings of the folio.
+    pub unsafe fn mapped_folio(
+        &self,
+        offset: Offset,
+    ) -> Result<folio::Mapped<'_, folio::PageCache<T>>> {
+        let page_index = offset >> bindings::PAGE_SHIFT;
+        let page_offset = offset & ((bindings::PAGE_SIZE - 1) as Offset);
+        let folio = self.read_mapping_folio(page_index.try_into()?)?;
+
+        // SAFETY: The safety requirements guarantee that there are no concurrent mutable mappings
+        // of the folio.
+        unsafe { Folio::map_owned(folio, page_offset.try_into()?) }
+    }
+
+    /// Returns the folio at the given page index.
+    pub fn read_mapping_folio(
+        &self,
+        index: PageOffset,
+    ) -> Result<ARef<Folio<folio::PageCache<T>>>> {
+        let folio = from_err_ptr(unsafe {
+            bindings::read_mapping_folio(
+                (*self.0.get()).i_mapping,
+                index.try_into()?,
+                ptr::null_mut(),
+            )
+        })?;
+        let ptr = ptr::NonNull::new(folio)
+            .ok_or(EIO)?
+            .cast::<Folio<folio::PageCache<T>>>();
+        // SAFETY: The folio returned by read_mapping_folio has had its refcount incremented.
+        Ok(unsafe { ARef::from_raw(ptr) })
+    }
+
+    /// Iterate over the given range, one folio at a time.
+    ///
+    /// # Safety
+    ///
+    /// Callers must ensure that there are no concurrent mutable mappings of the folio.
+    pub unsafe fn for_each_page<U>(
+        &self,
+        first: Offset,
+        len: Offset,
+        mut cb: impl FnMut(&[u8]) -> Result<Option<U>>,
+    ) -> Result<Option<U>> {
+        if first >= self.size() {
+            return Ok(None);
+        }
+        let mut remain = cmp::min(len, self.size() - first);
+        first.checked_add(remain).ok_or(EIO)?;
+
+        let mut next = first;
+        while remain > 0 {
+            // SAFETY: The safety requirements of this function satisfy those of `mapped_folio`.
+            let data = unsafe { self.mapped_folio(next)? };
+            let avail = cmp::min(data.len(), remain.try_into().unwrap_or(usize::MAX));
+            let ret = cb(&data[..avail])?;
+            if ret.is_some() {
+                return Ok(ret);
+            }
+
+            next += avail as Offset;
+            remain -= avail as Offset;
+        }
+
+        Ok(None)
+    }
+}
+
+impl<T: FileSystem + ?Sized, U: Deref<Target = INode<T>>> Locked<U, ReadSem> {
+    /// Returns a mapped folio at the given offset.
+    // TODO: This conflicts with Locked<Folio>::write. Once we settle on a way to handle reading
+    // the contents of certain inodes (e.g., directories, links), then we switch to that and
+    // remove this.
+    pub fn mapped_folio<'a>(
+        &'a self,
+        offset: Offset,
+    ) -> Result<folio::Mapped<'a, folio::PageCache<T>>>
+    where
+        T: 'a,
+    {
+        if T::IS_UNSPECIFIED {
+            build_error!("unspecified file systems cannot safely map folios");
+        }
+
+        // SAFETY: The inode is locked in read mode, so it's ok to map its contents.
+        unsafe { self.deref().mapped_folio(offset) }
+    }
+
+    /// Iterate over the given range, one folio at a time.
+    // TODO: This has the same issue as mapped_folio above.
+    pub fn for_each_page<V>(
+        &self,
+        first: Offset,
+        len: Offset,
+        cb: impl FnMut(&[u8]) -> Result<Option<V>>,
+    ) -> Result<Option<V>> {
+        if T::IS_UNSPECIFIED {
+            build_error!("unspecified file systems cannot safely map folios");
+        }
+
+        // SAFETY: The inode is locked in read mode, so it's ok to map its contents.
+        unsafe { self.deref().for_each_page(first, len, cb) }
+    }
 }
 
 // SAFETY: The type invariants guarantee that `INode` is always ref-counted.
@@ -111,6 +237,7 @@  unsafe fn dec_ref(obj: ptr::NonNull<Self>) {
 /// Indicates that the an inode's rw semapahore is locked in read (shared) mode.
 pub struct ReadSem;
 
+// SAFETY: `raw_lock` calls `inode_lock_shared` which locks the inode in shared mode.
 unsafe impl<T: FileSystem + ?Sized> Lockable<ReadSem> for INode<T> {
     fn raw_lock(&self) {
         // SAFETY: Since there's a reference to the inode, it must be valid.
@@ -432,3 +559,89 @@  extern "C" fn drop_cstring(ptr: *mut core::ffi::c_void) {
         Self(&Table::<U>::TABLE, PhantomData)
     }
 }
+
+/// Allows mapping the contents of the inode.
+///
+/// # Invariants
+///
+/// Mappers are unique per range per inode.
+pub struct Mapper<T: FileSystem + ?Sized = UnspecifiedFS> {
+    inode: ARef<INode<T>>,
+    begin: Offset,
+    end: Offset,
+}
+
+// SAFETY: All inode and folio operations are safe from any thread.
+unsafe impl<T: FileSystem + ?Sized> Send for Mapper<T> {}
+
+// SAFETY: All inode and folio operations are safe from any thread.
+unsafe impl<T: FileSystem + ?Sized> Sync for Mapper<T> {}
+
+impl<T: FileSystem + ?Sized> Mapper<T> {
+    /// Splits the mapper into two ranges.
+    ///
+    /// The first range is from the beginning of `self` up to and including `offset - 1`. The
+    /// second range is from `offset` to the end of `self`.
+    pub fn split_at(mut self, offset: Offset) -> (Self, Self) {
+        let inode = self.inode.clone();
+        if offset <= self.begin {
+            (
+                Self {
+                    inode,
+                    begin: offset,
+                    end: offset,
+                },
+                self,
+            )
+        } else if offset >= self.end {
+            (
+                self,
+                Self {
+                    inode,
+                    begin: offset,
+                    end: offset,
+                },
+            )
+        } else {
+            let end = self.end;
+            self.end = offset;
+            (
+                self,
+                Self {
+                    inode,
+                    begin: offset,
+                    end,
+                },
+            )
+        }
+    }
+
+    /// Returns a mapped folio at `offset`; fails with `ERANGE` if it is outside this mapper.
+    pub fn mapped_folio(&self, offset: Offset) -> Result<folio::Mapped<'_, folio::PageCache<T>>> {
+        if offset < self.begin || offset >= self.end {
+            return Err(ERANGE);
+        }
+
+        // SAFETY: By the type invariant, there are no other mutable mappings of the folio.
+        let mut map = unsafe { self.inode.mapped_folio(offset) }?;
+        map.cap_len((self.end - offset).try_into().unwrap_or(usize::MAX));
+        Ok(map)
+    }
+
+    /// Iterate over the given range, one folio at a time.
+    pub fn for_each_page<U>(
+        &self,
+        first: Offset,
+        len: Offset,
+        cb: impl FnMut(&[u8]) -> Result<Option<U>>,
+    ) -> Result<Option<U>> {
+        if first < self.begin || first >= self.end {
+            return Err(ERANGE);
+        }
+
+        let actual_len = cmp::min(len, self.end - first);
+
+        // SAFETY: By the type invariant, there are no other mutable mappings of the folio.
+        unsafe { self.inode.for_each_page(first, actual_len, cb) }
+    }
+}
diff --git a/rust/kernel/fs/sb.rs b/rust/kernel/fs/sb.rs
index 7c0c52e6da0a..93c7b2770163 100644
--- a/rust/kernel/fs/sb.rs
+++ b/rust/kernel/fs/sb.rs
@@ -8,11 +8,22 @@ 
 
 use super::inode::{self, INode, Ino};
 use super::FileSystem;
-use crate::bindings;
 use crate::error::{code::*, Result};
 use crate::types::{ARef, Either, ForeignOwnable, Opaque};
+use crate::{bindings, block, build_error};
 use core::{marker::PhantomData, ptr};
 
+/// Type of superblock keying.
+///
+/// It determines how C's `fs_context_operations::get_tree` is implemented.
+pub enum Type {
+    /// Multiple independent superblocks may exist.
+    Independent,
+
+    /// Uses a block device.
+    BlockDev,
+}
+
 /// A typestate for [`SuperBlock`] that indicates that it's a new one, so not fully initialized
 /// yet.
 pub struct New;
@@ -75,6 +86,28 @@  pub fn rdonly(&self) -> bool {
         // SAFETY: `s_flags` only changes during init, so it is safe to read it.
         unsafe { (*self.0.get()).s_flags & bindings::SB_RDONLY != 0 }
     }
+
+    /// Returns the block device associated with the superblock.
+    pub fn bdev(&self) -> &block::Device {
+        if !matches!(T::SUPER_TYPE, Type::BlockDev) {
+            build_error!("bdev is only available in blockdev superblocks");
+        }
+
+        // SAFETY: The superblock is valid and given that it's a blockdev superblock it must have a
+        // valid `s_bdev` that remains valid while the superblock (`self`) is valid.
+        unsafe { block::Device::from_raw((*self.0.get()).s_bdev) }
+    }
+
+    /// Returns the number of sectors in the underlying block device.
+    pub fn sector_count(&self) -> block::Sector {
+        if !matches!(T::SUPER_TYPE, Type::BlockDev) {
+            build_error!("sector_count is only available in blockdev superblocks");
+        }
+
+        // SAFETY: The superblock is valid and given that it's a blockdev superblock it must have a
+        // valid `s_bdev`.
+        unsafe { bindings::bdev_nr_sectors((*self.0.get()).s_bdev) }
+    }
 }
 
 impl<T: FileSystem + ?Sized> SuperBlock<T, New> {
@@ -85,6 +118,20 @@  pub fn set_magic(&mut self, magic: usize) -> &mut Self {
         unsafe { (*self.0.get()).s_magic = magic as core::ffi::c_ulong };
         self
     }
+
+    /// Sets the device blocksize, subject to the minimum accepted by the device.
+    ///
+    /// Returns the actual value set.
+    pub fn min_blocksize(&mut self, size: i32) -> i32 {
+        if !matches!(T::SUPER_TYPE, Type::BlockDev) {
+            build_error!("min_blocksize is only available in blockdev superblocks");
+        }
+
+        // SAFETY: This is a new superblock that is being initialised, so it's ok to set the block
+        // size. Additionally, we've checked that the superblock is backed by a block
+        // device, so it is also valid.
+        unsafe { bindings::sb_min_blocksize(self.0.get(), size) }
+    }
 }
 
 impl<T: FileSystem + ?Sized, S: DataInited> SuperBlock<T, S> {
diff --git a/samples/rust/rust_rofs.rs b/samples/rust/rust_rofs.rs
index 7027ca067f8f..fea3360b6e7a 100644
--- a/samples/rust/rust_rofs.rs
+++ b/samples/rust/rust_rofs.rs
@@ -101,7 +101,7 @@  impl fs::FileSystem for RoFs {
     type Data = ();
     const NAME: &'static CStr = c_str!("rust_rofs");
 
-    fn fill_super(sb: &mut sb::SuperBlock<Self, sb::New>) -> Result {
+    fn fill_super(sb: &mut sb::SuperBlock<Self, sb::New>, _: Option<inode::Mapper>) -> Result {
         sb.set_magic(0x52555354);
         Ok(())
     }