[v7,1/3] libfs: Add directory operations for stable offsets

Message ID	168814732984.530310.11190772066786107220.stgit@manet.1015granger.net (mailing list archive)
State	New
Headers	show Return-Path: <owner-linux-mm@kvack.org> Subject: [PATCH v7 1/3] libfs: Add directory operations for stable offsets From: Chuck Lever <cel@kernel.org> To: viro@zeniv.linux.org.uk, brauner@kernel.org, hughd@google.com, akpm@linux-foundation.org Cc: Chuck Lever <chuck.lever@oracle.com>, jlayton@redhat.com, linux-mm@kvack.org, linux-fsdevel@vger.kernel.org Date: Fri, 30 Jun 2023 13:48:49 -0400 Message-ID: <168814732984.530310.11190772066786107220.stgit@manet.1015granger.net> In-Reply-To: <168814723481.530310.17776748558242063239.stgit@manet.1015granger.net> References: <168814723481.530310.17776748558242063239.stgit@manet.1015granger.net> User-Agent: StGit/1.5 MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	shmemfs stable directory offsets \| expand [v7,0/3] shmemfs stable directory offsets [v7,1/3] libfs: Add directory operations for stable offsets [v7,2/3] shmem: Refactor shmem_symlink() [v7,3/3] shmem: stable directory offsets

Message ID

168814732984.530310.11190772066786107220.stgit@manet.1015granger.net (mailing list archive)

State

New

Headers

Subject: [PATCH v7 1/3] libfs: Add directory operations for stable offsets
From: Chuck Lever <cel@kernel.org>
To: viro@zeniv.linux.org.uk, brauner@kernel.org, hughd@google.com,
 akpm@linux-foundation.org
Cc: Chuck Lever <chuck.lever@oracle.com>, jlayton@redhat.com,
 linux-mm@kvack.org, linux-fsdevel@vger.kernel.org
Date: Fri, 30 Jun 2023 13:48:49 -0400
Message-ID: 
 <168814732984.530310.11190772066786107220.stgit@manet.1015granger.net>
In-Reply-To: 
 <168814723481.530310.17776748558242063239.stgit@manet.1015granger.net>
References: 
 <168814723481.530310.17776748558242063239.stgit@manet.1015granger.net>
User-Agent: StGit/1.5
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 7bit
Sender: owner-linux-mm@kvack.org
Precedence: bulk

Series

shmemfs stable directory offsets | expand

Commit Message

Chuck Lever June 30, 2023, 5:48 p.m. UTC

From: Chuck Lever <chuck.lever@oracle.com>

Create a vector of directory operations in fs/libfs.c that handles
directory seeks and readdir via stable offsets instead of the
current cursor-based mechanism.

For the moment these are unused.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 Documentation/filesystems/locking.rst |    2 
 Documentation/filesystems/vfs.rst     |    6 +
 fs/libfs.c                            |  247 +++++++++++++++++++++++++++++++++
 include/linux/fs.h                    |   18 ++
 4 files changed, 272 insertions(+), 1 deletion(-)

Comments

Christian Brauner July 3, 2023, 10:56 a.m. UTC | #1

On Fri, Jun 30, 2023 at 01:48:49PM -0400, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
> 
> Create a vector of directory operations in fs/libfs.c that handles
> directory seeks and readdir via stable offsets instead of the
> current cursor-based mechanism.
> 
> For the moment these are unused.
> 
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>  Documentation/filesystems/locking.rst |    2 
>  Documentation/filesystems/vfs.rst     |    6 +
>  fs/libfs.c                            |  247 +++++++++++++++++++++++++++++++++
>  include/linux/fs.h                    |   18 ++
>  4 files changed, 272 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
> index ed148919e11a..6a928fee3400 100644
> --- a/Documentation/filesystems/locking.rst
> +++ b/Documentation/filesystems/locking.rst
> @@ -85,6 +85,7 @@ prototypes::
>  			    struct dentry *dentry, struct fileattr *fa);
>  	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
>  	struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int);
> +	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
>  
>  locking rules:
>  	all may block
> @@ -115,6 +116,7 @@ atomic_open:	shared (exclusive if O_CREAT is set in open flags)
>  tmpfile:	no
>  fileattr_get:	no or exclusive
>  fileattr_set:	exclusive
> +get_offset_ctx: no
>  ==============	=============================================
>  
>  
> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> index cb2a97e49872..898d0b43109e 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -515,6 +515,7 @@ As of kernel 2.6.22, the following members are defined:
>  		int (*fileattr_set)(struct mnt_idmap *idmap,
>  				    struct dentry *dentry, struct fileattr *fa);
>  		int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
> +	        struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
>  	};
>  
>  Again, all methods are called without any locks being held, unless
> @@ -675,7 +676,10 @@ otherwise noted.
>  	called on ioctl(FS_IOC_SETFLAGS) and ioctl(FS_IOC_FSSETXATTR) to
>  	change miscellaneous file flags and attributes.  Callers hold
>  	i_rwsem exclusive.  If unset, then fall back to f_op->ioctl().
> -
> +``get_offset_ctx``
> +	called to get the offset context for a directory inode. A
> +        filesystem must define this operation to use
> +        simple_offset_dir_operations.
>  
>  The Address Space Object
>  ========================
> diff --git a/fs/libfs.c b/fs/libfs.c
> index 5b851315eeed..68b0000dc518 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -239,6 +239,253 @@ const struct inode_operations simple_dir_inode_operations = {
>  };
>  EXPORT_SYMBOL(simple_dir_inode_operations);
>  
> +static void offset_set(struct dentry *dentry, unsigned long offset)
> +{
> +	dentry->d_fsdata = (void *)offset;
> +}
> +
> +static unsigned long dentry2offset(struct dentry *dentry)
> +{
> +	return (unsigned long)dentry->d_fsdata;
> +}

This looks fine to me and tmpfs xfstests seem happy too. Currently we
use unsigned long in some places, and u32 in some other places. It's not
a big deal but I would prefer if we kept this consistent and made it
clear everywhere that the offset is a 32 bit unsigned and that the
xarray's limit is U32_MAX. So I would like to fold the following change
into this series unless there are objections:

diff --git a/fs/libfs.c b/fs/libfs.c
index 68b0000dc518..a7e56baf8bbd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -239,14 +239,14 @@ const struct inode_operations simple_dir_inode_operations = {
 };
 EXPORT_SYMBOL(simple_dir_inode_operations);

-static void offset_set(struct dentry *dentry, unsigned long offset)
+static void offset_set(struct dentry *dentry, u32 offset)
 {
-       dentry->d_fsdata = (void *)offset;
+       dentry->d_fsdata = (void *)((uintptr_t)(offset));
 }

-static unsigned long dentry2offset(struct dentry *dentry)
+static u32 dentry2offset(struct dentry *dentry)
 {
-       return (unsigned long)dentry->d_fsdata;
+       return (u32)((uintptr_t)(dentry->d_fsdata));
 }

 /**
@@ -296,12 +296,13 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
  */
 void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
 {
-       unsigned long index = dentry2offset(dentry);
+       u32 offset;

-       if (index == 0)
+       offset = dentry2offset(dentry);
+       if (offset == 0)
                return;

-       xa_erase(&octx->xa, index);
+       xa_erase(&octx->xa, offset);
        offset_set(dentry, 0);
 }

@@ -322,8 +323,8 @@ int simple_offset_rename_exchange(struct inode *old_dir,
 {
        struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
        struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
-       unsigned long old_index = dentry2offset(old_dentry);
-       unsigned long new_index = dentry2offset(new_dentry);
+       u32 old_index = dentry2offset(old_dentry);
+       u32 new_index = dentry2offset(new_dentry);
        int ret;

        simple_offset_remove(old_ctx, old_dentry);
@@ -414,7 +415,7 @@ static struct dentry *offset_find_next(struct xa_state *xas)

 static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
 {
-       loff_t offset = dentry2offset(dentry);
+       u32 offset = dentry2offset(dentry);
        struct inode *inode = d_inode(dentry);

        return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,

Chuck Lever III July 3, 2023, 1:26 p.m. UTC | #2

> On Jul 3, 2023, at 6:56 AM, Christian Brauner <brauner@kernel.org> wrote:
> 
> On Fri, Jun 30, 2023 at 01:48:49PM -0400, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@oracle.com>
>> 
>> Create a vector of directory operations in fs/libfs.c that handles
>> directory seeks and readdir via stable offsets instead of the
>> current cursor-based mechanism.
>> 
>> For the moment these are unused.
>> 
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> Documentation/filesystems/locking.rst |    2 
>> Documentation/filesystems/vfs.rst     |    6 +
>> fs/libfs.c                            |  247 +++++++++++++++++++++++++++++++++
>> include/linux/fs.h                    |   18 ++
>> 4 files changed, 272 insertions(+), 1 deletion(-)
>> 
>> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
>> index ed148919e11a..6a928fee3400 100644
>> --- a/Documentation/filesystems/locking.rst
>> +++ b/Documentation/filesystems/locking.rst
>> @@ -85,6 +85,7 @@ prototypes::
>>     struct dentry *dentry, struct fileattr *fa);
>> int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
>> struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int);
>> + struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
>> 
>> locking rules:
>> all may block
>> @@ -115,6 +116,7 @@ atomic_open: shared (exclusive if O_CREAT is set in open flags)
>> tmpfile: no
>> fileattr_get: no or exclusive
>> fileattr_set: exclusive
>> +get_offset_ctx: no
>> ============== =============================================
>> 
>> 
>> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
>> index cb2a97e49872..898d0b43109e 100644
>> --- a/Documentation/filesystems/vfs.rst
>> +++ b/Documentation/filesystems/vfs.rst
>> @@ -515,6 +515,7 @@ As of kernel 2.6.22, the following members are defined:
>> int (*fileattr_set)(struct mnt_idmap *idmap,
>>     struct dentry *dentry, struct fileattr *fa);
>> int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
>> +         struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
>> };
>> 
>> Again, all methods are called without any locks being held, unless
>> @@ -675,7 +676,10 @@ otherwise noted.
>> called on ioctl(FS_IOC_SETFLAGS) and ioctl(FS_IOC_FSSETXATTR) to
>> change miscellaneous file flags and attributes.  Callers hold
>> i_rwsem exclusive.  If unset, then fall back to f_op->ioctl().
>> -
>> +``get_offset_ctx``
>> + called to get the offset context for a directory inode. A
>> +        filesystem must define this operation to use
>> +        simple_offset_dir_operations.
>> 
>> The Address Space Object
>> ========================
>> diff --git a/fs/libfs.c b/fs/libfs.c
>> index 5b851315eeed..68b0000dc518 100644
>> --- a/fs/libfs.c
>> +++ b/fs/libfs.c
>> @@ -239,6 +239,253 @@ const struct inode_operations simple_dir_inode_operations = {
>> };
>> EXPORT_SYMBOL(simple_dir_inode_operations);
>> 
>> +static void offset_set(struct dentry *dentry, unsigned long offset)
>> +{
>> + dentry->d_fsdata = (void *)offset;
>> +}
>> +
>> +static unsigned long dentry2offset(struct dentry *dentry)
>> +{
>> + return (unsigned long)dentry->d_fsdata;
>> +}
> 
> This looks fine to me and tmpfs xfstests seem happy too. Currently we
> use unsigned long in some places, and u32 in some other places.

The two types are in response to the xarray API, which is a little
confusing (sometimes it wants a ulong, sometimes a u32). I tried
to make the type casting explicit wherever possible.

Your clean-up looks like a readability improvement to me.


> It's not
> a big deal but I would prefer if we kept this consistent and made it
> clear everywhere that the offset is a 32 bit unsigned and that the
> xarray's limit is U32_MAX. So I would like to fold the following change
> into this series unless there are objections:
> 
> diff --git a/fs/libfs.c b/fs/libfs.c
> index 68b0000dc518..a7e56baf8bbd 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -239,14 +239,14 @@ const struct inode_operations simple_dir_inode_operations = {
> };
> EXPORT_SYMBOL(simple_dir_inode_operations);
> 
> -static void offset_set(struct dentry *dentry, unsigned long offset)
> +static void offset_set(struct dentry *dentry, u32 offset)
> {
> -       dentry->d_fsdata = (void *)offset;
> +       dentry->d_fsdata = (void *)((uintptr_t)(offset));
> }
> 
> -static unsigned long dentry2offset(struct dentry *dentry)
> +static u32 dentry2offset(struct dentry *dentry)
> {
> -       return (unsigned long)dentry->d_fsdata;
> +       return (u32)((uintptr_t)(dentry->d_fsdata));
> }
> 
> /**
> @@ -296,12 +296,13 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
>  */
> void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
> {
> -       unsigned long index = dentry2offset(dentry);
> +       u32 offset;
> 
> -       if (index == 0)
> +       offset = dentry2offset(dentry);
> +       if (offset == 0)
>                return;
> 
> -       xa_erase(&octx->xa, index);
> +       xa_erase(&octx->xa, offset);
>        offset_set(dentry, 0);
> }
> 
> @@ -322,8 +323,8 @@ int simple_offset_rename_exchange(struct inode *old_dir,
> {
>        struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
>        struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
> -       unsigned long old_index = dentry2offset(old_dentry);
> -       unsigned long new_index = dentry2offset(new_dentry);
> +       u32 old_index = dentry2offset(old_dentry);
> +       u32 new_index = dentry2offset(new_dentry);
>        int ret;
> 
>        simple_offset_remove(old_ctx, old_dentry);
> @@ -414,7 +415,7 @@ static struct dentry *offset_find_next(struct xa_state *xas)
> 
> static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
> {
> -       loff_t offset = dentry2offset(dentry);
> +       u32 offset = dentry2offset(dentry);
>        struct inode *inode = d_inode(dentry);
> 
>        return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,


--
Chuck Lever

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index ed148919e11a..6a928fee3400 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -85,6 +85,7 @@  prototypes::
 			    struct dentry *dentry, struct fileattr *fa);
 	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
 	struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int);
+	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
 
 locking rules:
 	all may block
@@ -115,6 +116,7 @@  atomic_open:	shared (exclusive if O_CREAT is set in open flags)
 tmpfile:	no
 fileattr_get:	no or exclusive
 fileattr_set:	exclusive
+get_offset_ctx: no
 ==============	=============================================
 
 
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index cb2a97e49872..898d0b43109e 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -515,6 +515,7 @@  As of kernel 2.6.22, the following members are defined:
 		int (*fileattr_set)(struct mnt_idmap *idmap,
 				    struct dentry *dentry, struct fileattr *fa);
 		int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
+	        struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
 	};
 
 Again, all methods are called without any locks being held, unless
@@ -675,7 +676,10 @@  otherwise noted.
 	called on ioctl(FS_IOC_SETFLAGS) and ioctl(FS_IOC_FSSETXATTR) to
 	change miscellaneous file flags and attributes.  Callers hold
 	i_rwsem exclusive.  If unset, then fall back to f_op->ioctl().
-
+``get_offset_ctx``
+	called to get the offset context for a directory inode. A
+        filesystem must define this operation to use
+        simple_offset_dir_operations.
 
 The Address Space Object
 ========================
diff --git a/fs/libfs.c b/fs/libfs.c
index 5b851315eeed..68b0000dc518 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -239,6 +239,253 @@  const struct inode_operations simple_dir_inode_operations = {
 };
 EXPORT_SYMBOL(simple_dir_inode_operations);
 
+static void offset_set(struct dentry *dentry, unsigned long offset)
+{
+	dentry->d_fsdata = (void *)offset;
+}
+
+static unsigned long dentry2offset(struct dentry *dentry)
+{
+	return (unsigned long)dentry->d_fsdata;
+}
+
+/**
+ * simple_offset_init - initialize an offset_ctx
+ * @octx: directory offset map to be initialized
+ *
+ */
+void simple_offset_init(struct offset_ctx *octx)
+{
+	xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
+
+	/* 0 is '.', 1 is '..', so always start with offset 2 */
+	octx->next_offset = 2;
+}
+
+/**
+ * simple_offset_add - Add an entry to a directory's offset map
+ * @octx: directory offset ctx to be updated
+ * @dentry: new dentry being added
+ *
+ * Returns zero on success. @octx and the dentry offset are updated.
+ * Otherwise, a negative errno value is returned.
+ */
+int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
+{
+	static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
+	u32 offset;
+	int ret;
+
+	if (dentry2offset(dentry) != 0)
+		return -EBUSY;
+
+	ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
+			      &octx->next_offset, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+
+	offset_set(dentry, offset);
+	return 0;
+}
+
+/**
+ * simple_offset_remove - Remove an entry from a directory's offset map
+ * @octx: directory offset ctx to be updated
+ * @dentry: dentry being removed
+ *
+ */
+void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
+{
+	unsigned long index = dentry2offset(dentry);
+
+	if (index == 0)
+		return;
+
+	xa_erase(&octx->xa, index);
+	offset_set(dentry, 0);
+}
+
+/**
+ * simple_offset_rename_exchange - exchange rename with directory offsets
+ * @old_dir: parent of dentry being moved
+ * @old_dentry: dentry being moved
+ * @new_dir: destination parent
+ * @new_dentry: destination dentry
+ *
+ * Returns zero on success. Otherwise a negative errno is returned and the
+ * rename is rolled back.
+ */
+int simple_offset_rename_exchange(struct inode *old_dir,
+				  struct dentry *old_dentry,
+				  struct inode *new_dir,
+				  struct dentry *new_dentry)
+{
+	struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
+	struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
+	unsigned long old_index = dentry2offset(old_dentry);
+	unsigned long new_index = dentry2offset(new_dentry);
+	int ret;
+
+	simple_offset_remove(old_ctx, old_dentry);
+	simple_offset_remove(new_ctx, new_dentry);
+
+	ret = simple_offset_add(new_ctx, old_dentry);
+	if (ret)
+		goto out_restore;
+
+	ret = simple_offset_add(old_ctx, new_dentry);
+	if (ret) {
+		simple_offset_remove(new_ctx, old_dentry);
+		goto out_restore;
+	}
+
+	ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+	if (ret) {
+		simple_offset_remove(new_ctx, old_dentry);
+		simple_offset_remove(old_ctx, new_dentry);
+		goto out_restore;
+	}
+	return 0;
+
+out_restore:
+	offset_set(old_dentry, old_index);
+	xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
+	offset_set(new_dentry, new_index);
+	xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
+	return ret;
+}
+
+/**
+ * simple_offset_destroy - Release offset map
+ * @octx: directory offset ctx that is about to be destroyed
+ *
+ * During fs teardown (eg. umount), a directory's offset map might still
+ * contain entries. xa_destroy() cleans out anything that remains.
+ */
+void simple_offset_destroy(struct offset_ctx *octx)
+{
+	xa_destroy(&octx->xa);
+}
+
+/**
+ * offset_dir_llseek - Advance the read position of a directory descriptor
+ * @file: an open directory whose position is to be updated
+ * @offset: a byte offset
+ * @whence: enumerator describing the starting position for this update
+ *
+ * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
+ *
+ * Returns the updated read position if successful; otherwise a
+ * negative errno is returned and the read position remains unchanged.
+ */
+static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	switch (whence) {
+	case SEEK_CUR:
+		offset += file->f_pos;
+		fallthrough;
+	case SEEK_SET:
+		if (offset >= 0)
+			break;
+		fallthrough;
+	default:
+		return -EINVAL;
+	}
+
+	return vfs_setpos(file, offset, U32_MAX);
+}
+
+static struct dentry *offset_find_next(struct xa_state *xas)
+{
+	struct dentry *child, *found = NULL;
+
+	rcu_read_lock();
+	child = xas_next_entry(xas, U32_MAX);
+	if (!child)
+		goto out;
+	spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+	if (simple_positive(child))
+		found = dget_dlock(child);
+	spin_unlock(&child->d_lock);
+out:
+	rcu_read_unlock();
+	return found;
+}
+
+static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
+{
+	loff_t offset = dentry2offset(dentry);
+	struct inode *inode = d_inode(dentry);
+
+	return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
+			  inode->i_ino, fs_umode_to_dtype(inode->i_mode));
+}
+
+static void offset_iterate_dir(struct dentry *dir, struct dir_context *ctx)
+{
+	struct inode *inode = d_inode(dir);
+	struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
+	XA_STATE(xas, &so_ctx->xa, ctx->pos);
+	struct dentry *dentry;
+
+	while (true) {
+		spin_lock(&dir->d_lock);
+		dentry = offset_find_next(&xas);
+		spin_unlock(&dir->d_lock);
+		if (!dentry)
+			break;
+
+		if (!offset_dir_emit(ctx, dentry)) {
+			dput(dentry);
+			break;
+		}
+
+		dput(dentry);
+		ctx->pos = xas.xa_index + 1;
+	}
+}
+
+/**
+ * offset_readdir - Emit entries starting at offset @ctx->pos
+ * @file: an open directory to iterate over
+ * @ctx: directory iteration context
+ *
+ * Caller must hold @file's i_rwsem to prevent insertion or removal of
+ * entries during this call.
+ *
+ * On entry, @ctx->pos contains an offset that represents the first entry
+ * to be read from the directory.
+ *
+ * The operation continues until there are no more entries to read, or
+ * until the ctx->actor indicates there is no more space in the caller's
+ * output buffer.
+ *
+ * On return, @ctx->pos contains an offset that will read the next entry
+ * in this directory when offset_readdir() is called again with @ctx.
+ *
+ * Return values:
+ *   %0 - Complete
+ */
+static int offset_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *dir = file->f_path.dentry;
+
+	lockdep_assert_held(&d_inode(dir)->i_rwsem);
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	offset_iterate_dir(dir, ctx);
+	return 0;
+}
+
+const struct file_operations simple_offset_dir_operations = {
+	.llseek		= offset_dir_llseek,
+	.iterate_shared	= offset_readdir,
+	.read		= generic_read_dir,
+	.fsync		= noop_fsync,
+};
+
 static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
 {
 	struct dentry *child = NULL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6867512907d6..59a4129ce14c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1770,6 +1770,7 @@  struct dir_context {
 
 struct iov_iter;
 struct io_uring_cmd;
+struct offset_ctx;
 
 struct file_operations {
 	struct module *owner;
@@ -1857,6 +1858,7 @@  struct inode_operations {
 	int (*fileattr_set)(struct mnt_idmap *idmap,
 			    struct dentry *dentry, struct fileattr *fa);
 	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
+	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
 } ____cacheline_aligned;
 
 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
@@ -2971,6 +2973,22 @@  extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
 extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
 		const void __user *from, size_t count);
 
+struct offset_ctx {
+	struct xarray		xa;
+	u32			next_offset;
+};
+
+void simple_offset_init(struct offset_ctx *octx);
+int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
+void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
+int simple_offset_rename_exchange(struct inode *old_dir,
+				  struct dentry *old_dentry,
+				  struct inode *new_dir,
+				  struct dentry *new_dentry);
+void simple_offset_destroy(struct offset_ctx *octx);
+
+extern const struct file_operations simple_offset_dir_operations;
+
 extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
 extern int generic_file_fsync(struct file *, loff_t, loff_t, int);

[v7,1/3] libfs: Add directory operations for stable offsets

Commit Message

Comments

Patch