diff mbox series

[RFC,6/6,6.6] libfs: fix infinite directory reads for offset dir

Message ID 20241111005242.34654-7-cel@kernel.org (mailing list archive)
State New
Headers show
Series Address rename/readdir bugs in fs/libfs.c | expand

Commit Message

Chuck Lever Nov. 11, 2024, 12:52 a.m. UTC
From: yangerkun <yangerkun@huawei.com>

[ Upstream commit 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a ]

After we switch tmpfs dir operations from simple_dir_operations to
simple_offset_dir_operations, every rename happened will fill new dentry
to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free
key starting with octx->newx_offset, and then set newx_offset equals to
free key + 1. This will lead to infinite readdir combine with rename
happened at the same time, which fail generic/736 in xfstests(detail show
as below).

1. create 5000 files(1 2 3...) under one dir
2. call readdir(man 3 readdir) once, and get one entry
3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry)
4. loop 2~3, until readdir return nothing or we loop too many
   times(tmpfs break test with the second condition)

We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite
directory reads") to fix it, record the last_index when we open dir, and
do not emit the entry which index >= last_index. The file->private_data
now used in offset dir can use directly to do this, and we also update
the last_index when we llseek the dir file.

Fixes: a2e459555c5f ("shmem: stable directory offsets")
Signed-off-by: yangerkun <yangerkun@huawei.com>
Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
[brauner: only update last_index after seek when offset is zero like Jan suggested]
Signed-off-by: Christian Brauner <brauner@kernel.org>
Link: https://nvd.nist.gov/vuln/detail/CVE-2024-46701
[ cel: adjusted to apply to origin/linux-6.6.y ]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/libfs.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

Comments

Yu Kuai Nov. 11, 2024, 2:36 a.m. UTC | #1
Hi,

在 2024/11/11 8:52, cel@kernel.org 写道:
> From: yangerkun <yangerkun@huawei.com>
> 
> [ Upstream commit 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a ]
> 
> After we switch tmpfs dir operations from simple_dir_operations to
> simple_offset_dir_operations, every rename happened will fill new dentry
> to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free
> key starting with octx->newx_offset, and then set newx_offset equals to
> free key + 1. This will lead to infinite readdir combine with rename
> happened at the same time, which fail generic/736 in xfstests(detail show
> as below).
> 
> 1. create 5000 files(1 2 3...) under one dir
> 2. call readdir(man 3 readdir) once, and get one entry
> 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry)
> 4. loop 2~3, until readdir return nothing or we loop too many
>     times(tmpfs break test with the second condition)
> 
> We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite
> directory reads") to fix it, record the last_index when we open dir, and
> do not emit the entry which index >= last_index. The file->private_data

Please notice this requires last_index should never overflow, otherwise
readdir will be messed up.
> now used in offset dir can use directly to do this, and we also update
> the last_index when we llseek the dir file.
> 
> Fixes: a2e459555c5f ("shmem: stable directory offsets")
> Signed-off-by: yangerkun <yangerkun@huawei.com>
> Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com
> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
> [brauner: only update last_index after seek when offset is zero like Jan suggested]
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> Link: https://nvd.nist.gov/vuln/detail/CVE-2024-46701
> [ cel: adjusted to apply to origin/linux-6.6.y ]
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>   fs/libfs.c | 37 +++++++++++++++++++++++++------------
>   1 file changed, 25 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/libfs.c b/fs/libfs.c
> index a87005c89534..b59ff0dfea1f 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -449,6 +449,14 @@ void simple_offset_destroy(struct offset_ctx *octx)
>   	xa_destroy(&octx->xa);
>   }
>   
> +static int offset_dir_open(struct inode *inode, struct file *file)
> +{
> +	struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
> +
> +	file->private_data = (void *)ctx->next_offset;
> +	return 0;
> +}

Looks like xarray is still used.

I'm in the cc list ,so I assume you saw my set, then I don't know why
you're ignoring my concerns.

1) next_offset is 32-bit and can overflow in a long-time running
machine.
2) Once next_offset overflows, readdir will skip the files that offset
is bigger.

Thanks,
Kuai

> +
>   /**
>    * offset_dir_llseek - Advance the read position of a directory descriptor
>    * @file: an open directory whose position is to be updated
> @@ -462,6 +470,9 @@ void simple_offset_destroy(struct offset_ctx *octx)
>    */
>   static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>   {
> +	struct inode *inode = file->f_inode;
> +	struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
> +
>   	switch (whence) {
>   	case SEEK_CUR:
>   		offset += file->f_pos;
> @@ -475,8 +486,9 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>   	}
>   
>   	/* In this case, ->private_data is protected by f_pos_lock */
> -	file->private_data = NULL;
> -	return vfs_setpos(file, offset, U32_MAX);
> +	if (!offset)
> +		file->private_data = (void *)ctx->next_offset;
> +	return vfs_setpos(file, offset, LONG_MAX);
>   }
>   
>   static struct dentry *offset_find_next(struct xa_state *xas)
> @@ -505,7 +517,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
>   			  inode->i_ino, fs_umode_to_dtype(inode->i_mode));
>   }
>   
> -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
> +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
>   {
>   	struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
>   	XA_STATE(xas, &so_ctx->xa, ctx->pos);
> @@ -514,17 +526,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>   	while (true) {
>   		dentry = offset_find_next(&xas);
>   		if (!dentry)
> -			return ERR_PTR(-ENOENT);
> +			return;
> +
> +		if (dentry2offset(dentry) >= last_index) {
> +			dput(dentry);
> +			return;
> +		}
>   
>   		if (!offset_dir_emit(ctx, dentry)) {
>   			dput(dentry);
> -			break;
> +			return;
>   		}
>   
>   		dput(dentry);
>   		ctx->pos = xas.xa_index + 1;
>   	}
> -	return NULL;
>   }
>   
>   /**
> @@ -551,22 +567,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>   static int offset_readdir(struct file *file, struct dir_context *ctx)
>   {
>   	struct dentry *dir = file->f_path.dentry;
> +	long last_index = (long)file->private_data;
>   
>   	lockdep_assert_held(&d_inode(dir)->i_rwsem);
>   
>   	if (!dir_emit_dots(file, ctx))
>   		return 0;
>   
> -	/* In this case, ->private_data is protected by f_pos_lock */
> -	if (ctx->pos == DIR_OFFSET_MIN)
> -		file->private_data = NULL;
> -	else if (file->private_data == ERR_PTR(-ENOENT))
> -		return 0;
> -	file->private_data = offset_iterate_dir(d_inode(dir), ctx);
> +	offset_iterate_dir(d_inode(dir), ctx, last_index);
>   	return 0;
>   }
>   
>   const struct file_operations simple_offset_dir_operations = {
> +	.open		= offset_dir_open,
>   	.llseek		= offset_dir_llseek,
>   	.iterate_shared	= offset_readdir,
>   	.read		= generic_read_dir,
>
Chuck Lever III Nov. 11, 2024, 2:39 p.m. UTC | #2
> On Nov 10, 2024, at 9:36 PM, Yu Kuai <yukuai1@huaweicloud.com> wrote:
> 
> Hi,
> 
> 在 2024/11/11 8:52, cel@kernel.org 写道:
>> From: yangerkun <yangerkun@huawei.com>
>> [ Upstream commit 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a ]
>> After we switch tmpfs dir operations from simple_dir_operations to
>> simple_offset_dir_operations, every rename happened will fill new dentry
>> to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free
>> key starting with octx->newx_offset, and then set newx_offset equals to
>> free key + 1. This will lead to infinite readdir combine with rename
>> happened at the same time, which fail generic/736 in xfstests(detail show
>> as below).
>> 1. create 5000 files(1 2 3...) under one dir
>> 2. call readdir(man 3 readdir) once, and get one entry
>> 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry)
>> 4. loop 2~3, until readdir return nothing or we loop too many
>>    times(tmpfs break test with the second condition)
>> We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite
>> directory reads") to fix it, record the last_index when we open dir, and
>> do not emit the entry which index >= last_index. The file->private_data
> 
> Please notice this requires last_index should never overflow, otherwise
> readdir will be messed up.

It would help your cause if you could be more specific
than "messed up".


>> now used in offset dir can use directly to do this, and we also update
>> the last_index when we llseek the dir file.
>> Fixes: a2e459555c5f ("shmem: stable directory offsets")
>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>> Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com
>> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
>> [brauner: only update last_index after seek when offset is zero like Jan suggested]
>> Signed-off-by: Christian Brauner <brauner@kernel.org>
>> Link: https://nvd.nist.gov/vuln/detail/CVE-2024-46701
>> [ cel: adjusted to apply to origin/linux-6.6.y ]
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>>  fs/libfs.c | 37 +++++++++++++++++++++++++------------
>>  1 file changed, 25 insertions(+), 12 deletions(-)
>> diff --git a/fs/libfs.c b/fs/libfs.c
>> index a87005c89534..b59ff0dfea1f 100644
>> --- a/fs/libfs.c
>> +++ b/fs/libfs.c
>> @@ -449,6 +449,14 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>   xa_destroy(&octx->xa);
>>  }
>>  +static int offset_dir_open(struct inode *inode, struct file *file)
>> +{
>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>> +
>> + file->private_data = (void *)ctx->next_offset;
>> + return 0;
>> +}
> 
> Looks like xarray is still used.

That's not going to change, as several folks have already
explained.


> I'm in the cc list ,so I assume you saw my set, then I don't know why
> you're ignoring my concerns.

> 1) next_offset is 32-bit and can overflow in a long-time running
> machine.
> 2) Once next_offset overflows, readdir will skip the files that offset
> is bigger.

In that case, that entry won't be visible via getdents(3)
until the directory is re-opened or the process does an
lseek(fd, 0, SEEK_SET).

That is the proper and expected behavior. I suspect you
will see exactly that behavior with ext4 and 32-bit
directory offsets, for example.

Does that not directly address your concern? Or do you
mean that Erkun's patch introduces a new issue?

If there is a problem here, please construct a reproducer
against this patch set and post it.


> Thanks,
> Kuai
> 
>> +
>>  /**
>>   * offset_dir_llseek - Advance the read position of a directory descriptor
>>   * @file: an open directory whose position is to be updated
>> @@ -462,6 +470,9 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>   */
>>  static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>  {
>> + struct inode *inode = file->f_inode;
>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>> +
>>   switch (whence) {
>>   case SEEK_CUR:
>>   offset += file->f_pos;
>> @@ -475,8 +486,9 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>   }
>>     /* In this case, ->private_data is protected by f_pos_lock */
>> - file->private_data = NULL;
>> - return vfs_setpos(file, offset, U32_MAX);
>> + if (!offset)
>> + file->private_data = (void *)ctx->next_offset;
>> + return vfs_setpos(file, offset, LONG_MAX);
>>  }
>>    static struct dentry *offset_find_next(struct xa_state *xas)
>> @@ -505,7 +517,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
>>     inode->i_ino, fs_umode_to_dtype(inode->i_mode));
>>  }
>>  -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>> +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
>>  {
>>   struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
>>   XA_STATE(xas, &so_ctx->xa, ctx->pos);
>> @@ -514,17 +526,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>   while (true) {
>>   dentry = offset_find_next(&xas);
>>   if (!dentry)
>> - return ERR_PTR(-ENOENT);
>> + return;
>> +
>> + if (dentry2offset(dentry) >= last_index) {
>> + dput(dentry);
>> + return;
>> + }
>>     if (!offset_dir_emit(ctx, dentry)) {
>>   dput(dentry);
>> - break;
>> + return;
>>   }
>>     dput(dentry);
>>   ctx->pos = xas.xa_index + 1;
>>   }
>> - return NULL;
>>  }
>>    /**
>> @@ -551,22 +567,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>  static int offset_readdir(struct file *file, struct dir_context *ctx)
>>  {
>>   struct dentry *dir = file->f_path.dentry;
>> + long last_index = (long)file->private_data;
>>     lockdep_assert_held(&d_inode(dir)->i_rwsem);
>>     if (!dir_emit_dots(file, ctx))
>>   return 0;
>>  - /* In this case, ->private_data is protected by f_pos_lock */
>> - if (ctx->pos == DIR_OFFSET_MIN)
>> - file->private_data = NULL;
>> - else if (file->private_data == ERR_PTR(-ENOENT))
>> - return 0;
>> - file->private_data = offset_iterate_dir(d_inode(dir), ctx);
>> + offset_iterate_dir(d_inode(dir), ctx, last_index);
>>   return 0;
>>  }
>>    const struct file_operations simple_offset_dir_operations = {
>> + .open = offset_dir_open,
>>   .llseek = offset_dir_llseek,
>>   .iterate_shared = offset_readdir,
>>   .read = generic_read_dir,


--
Chuck Lever
yangerkun Nov. 11, 2024, 3:20 p.m. UTC | #3
在 2024/11/11 22:39, Chuck Lever III 写道:
> 
> 
>> On Nov 10, 2024, at 9:36 PM, Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> Hi,
>>
>> 在 2024/11/11 8:52, cel@kernel.org 写道:
>>> From: yangerkun <yangerkun@huawei.com>
>>> [ Upstream commit 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a ]
>>> After we switch tmpfs dir operations from simple_dir_operations to
>>> simple_offset_dir_operations, every rename happened will fill new dentry
>>> to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free
>>> key starting with octx->newx_offset, and then set newx_offset equals to
>>> free key + 1. This will lead to infinite readdir combine with rename
>>> happened at the same time, which fail generic/736 in xfstests(detail show
>>> as below).
>>> 1. create 5000 files(1 2 3...) under one dir
>>> 2. call readdir(man 3 readdir) once, and get one entry
>>> 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry)
>>> 4. loop 2~3, until readdir return nothing or we loop too many
>>>     times(tmpfs break test with the second condition)
>>> We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite
>>> directory reads") to fix it, record the last_index when we open dir, and
>>> do not emit the entry which index >= last_index. The file->private_data
>>
>> Please notice this requires last_index should never overflow, otherwise
>> readdir will be messed up.
> 
> It would help your cause if you could be more specific
> than "messed up".
> 
> 
>>> now used in offset dir can use directly to do this, and we also update
>>> the last_index when we llseek the dir file.
>>> Fixes: a2e459555c5f ("shmem: stable directory offsets")
>>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>>> Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com
>>> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
>>> [brauner: only update last_index after seek when offset is zero like Jan suggested]
>>> Signed-off-by: Christian Brauner <brauner@kernel.org>
>>> Link: https://nvd.nist.gov/vuln/detail/CVE-2024-46701
>>> [ cel: adjusted to apply to origin/linux-6.6.y ]
>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>>   fs/libfs.c | 37 +++++++++++++++++++++++++------------
>>>   1 file changed, 25 insertions(+), 12 deletions(-)
>>> diff --git a/fs/libfs.c b/fs/libfs.c
>>> index a87005c89534..b59ff0dfea1f 100644
>>> --- a/fs/libfs.c
>>> +++ b/fs/libfs.c
>>> @@ -449,6 +449,14 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>>    xa_destroy(&octx->xa);
>>>   }
>>>   +static int offset_dir_open(struct inode *inode, struct file *file)
>>> +{
>>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>>> +
>>> + file->private_data = (void *)ctx->next_offset;
>>> + return 0;
>>> +}
>>
>> Looks like xarray is still used.
> 
> That's not going to change, as several folks have already
> explained.
> 
> 
>> I'm in the cc list ,so I assume you saw my set, then I don't know why
>> you're ignoring my concerns.
> 
>> 1) next_offset is 32-bit and can overflow in a long-time running
>> machine.
>> 2) Once next_offset overflows, readdir will skip the files that offset
>> is bigger.
> 

I'm sorry, I'm a little busy these days, so I haven't responded to this
series of emails.

> In that case, that entry won't be visible via getdents(3)
> until the directory is re-opened or the process does an
> lseek(fd, 0, SEEK_SET).

Yes.

> 
> That is the proper and expected behavior. I suspect you
> will see exactly that behavior with ext4 and 32-bit
> directory offsets, for example.

Emm...

For this case like this:

1. mkdir /tmp/dir and touch /tmp/dir/file1 /tmp/dir/file2
2. open /tmp/dir with fd1
3. readdir and get /tmp/dir/file1
4. rm /tmp/dir/file2
5. touch /tmp/dir/file2
4. loop 4~5 for 2^32 times
5. readdir /tmp/dir with fd1

For tmpfs now, we may see no /tmp/dir/file2, since the offset has been 
overflow, for ext4 it is ok... So we think this will be a problem.

> 
> Does that not directly address your concern? Or do you
> mean that Erkun's patch introduces a new issue?

Yes, to be honest, my personal feeling is a problem. But for 64bit, it 
may never been trigger.

> 
> If there is a problem here, please construct a reproducer
> against this patch set and post it.
> 
> 
>> Thanks,
>> Kuai
>>
>>> +
>>>   /**
>>>    * offset_dir_llseek - Advance the read position of a directory descriptor
>>>    * @file: an open directory whose position is to be updated
>>> @@ -462,6 +470,9 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>>    */
>>>   static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>>   {
>>> + struct inode *inode = file->f_inode;
>>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>>> +
>>>    switch (whence) {
>>>    case SEEK_CUR:
>>>    offset += file->f_pos;
>>> @@ -475,8 +486,9 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>>    }
>>>      /* In this case, ->private_data is protected by f_pos_lock */
>>> - file->private_data = NULL;
>>> - return vfs_setpos(file, offset, U32_MAX);
>>> + if (!offset)
>>> + file->private_data = (void *)ctx->next_offset;
>>> + return vfs_setpos(file, offset, LONG_MAX);
>>>   }
>>>     static struct dentry *offset_find_next(struct xa_state *xas)
>>> @@ -505,7 +517,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
>>>      inode->i_ino, fs_umode_to_dtype(inode->i_mode));
>>>   }
>>>   -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>> +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
>>>   {
>>>    struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
>>>    XA_STATE(xas, &so_ctx->xa, ctx->pos);
>>> @@ -514,17 +526,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>    while (true) {
>>>    dentry = offset_find_next(&xas);
>>>    if (!dentry)
>>> - return ERR_PTR(-ENOENT);
>>> + return;
>>> +
>>> + if (dentry2offset(dentry) >= last_index) {
>>> + dput(dentry);
>>> + return;
>>> + }
>>>      if (!offset_dir_emit(ctx, dentry)) {
>>>    dput(dentry);
>>> - break;
>>> + return;
>>>    }
>>>      dput(dentry);
>>>    ctx->pos = xas.xa_index + 1;
>>>    }
>>> - return NULL;
>>>   }
>>>     /**
>>> @@ -551,22 +567,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>   static int offset_readdir(struct file *file, struct dir_context *ctx)
>>>   {
>>>    struct dentry *dir = file->f_path.dentry;
>>> + long last_index = (long)file->private_data;
>>>      lockdep_assert_held(&d_inode(dir)->i_rwsem);
>>>      if (!dir_emit_dots(file, ctx))
>>>    return 0;
>>>   - /* In this case, ->private_data is protected by f_pos_lock */
>>> - if (ctx->pos == DIR_OFFSET_MIN)
>>> - file->private_data = NULL;
>>> - else if (file->private_data == ERR_PTR(-ENOENT))
>>> - return 0;
>>> - file->private_data = offset_iterate_dir(d_inode(dir), ctx);
>>> + offset_iterate_dir(d_inode(dir), ctx, last_index);
>>>    return 0;
>>>   }
>>>     const struct file_operations simple_offset_dir_operations = {
>>> + .open = offset_dir_open,
>>>    .llseek = offset_dir_llseek,
>>>    .iterate_shared = offset_readdir,
>>>    .read = generic_read_dir,
> 
> 
> --
> Chuck Lever
> 
>
Chuck Lever III Nov. 11, 2024, 3:34 p.m. UTC | #4
> On Nov 11, 2024, at 10:20 AM, yangerkun <yangerkun@huaweicloud.com> wrote:
> 
> 
> 
> 在 2024/11/11 22:39, Chuck Lever III 写道:
>>> On Nov 10, 2024, at 9:36 PM, Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>> 
>>> Hi,
>>> 
>>> 在 2024/11/11 8:52, cel@kernel.org 写道:
>>>> From: yangerkun <yangerkun@huawei.com>
>>>> [ Upstream commit 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a ]
>>>> After we switch tmpfs dir operations from simple_dir_operations to
>>>> simple_offset_dir_operations, every rename happened will fill new dentry
>>>> to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free
>>>> key starting with octx->newx_offset, and then set newx_offset equals to
>>>> free key + 1. This will lead to infinite readdir combine with rename
>>>> happened at the same time, which fail generic/736 in xfstests(detail show
>>>> as below).
>>>> 1. create 5000 files(1 2 3...) under one dir
>>>> 2. call readdir(man 3 readdir) once, and get one entry
>>>> 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry)
>>>> 4. loop 2~3, until readdir return nothing or we loop too many
>>>>    times(tmpfs break test with the second condition)
>>>> We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite
>>>> directory reads") to fix it, record the last_index when we open dir, and
>>>> do not emit the entry which index >= last_index. The file->private_data
>>> 
>>> Please notice this requires last_index should never overflow, otherwise
>>> readdir will be messed up.
>> It would help your cause if you could be more specific
>> than "messed up".
>>>> now used in offset dir can use directly to do this, and we also update
>>>> the last_index when we llseek the dir file.
>>>> Fixes: a2e459555c5f ("shmem: stable directory offsets")
>>>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>>>> Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com
>>>> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
>>>> [brauner: only update last_index after seek when offset is zero like Jan suggested]
>>>> Signed-off-by: Christian Brauner <brauner@kernel.org>
>>>> Link: https://nvd.nist.gov/vuln/detail/CVE-2024-46701
>>>> [ cel: adjusted to apply to origin/linux-6.6.y ]
>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>>> ---
>>>>  fs/libfs.c | 37 +++++++++++++++++++++++++------------
>>>>  1 file changed, 25 insertions(+), 12 deletions(-)
>>>> diff --git a/fs/libfs.c b/fs/libfs.c
>>>> index a87005c89534..b59ff0dfea1f 100644
>>>> --- a/fs/libfs.c
>>>> +++ b/fs/libfs.c
>>>> @@ -449,6 +449,14 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>>>   xa_destroy(&octx->xa);
>>>>  }
>>>>  +static int offset_dir_open(struct inode *inode, struct file *file)
>>>> +{
>>>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>>>> +
>>>> + file->private_data = (void *)ctx->next_offset;
>>>> + return 0;
>>>> +}
>>> 
>>> Looks like xarray is still used.
>> That's not going to change, as several folks have already
>> explained.
>>> I'm in the cc list ,so I assume you saw my set, then I don't know why
>>> you're ignoring my concerns.
>>> 1) next_offset is 32-bit and can overflow in a long-time running
>>> machine.
>>> 2) Once next_offset overflows, readdir will skip the files that offset
>>> is bigger.
> 
> I'm sorry, I'm a little busy these days, so I haven't responded to this
> series of emails.
> 
>> In that case, that entry won't be visible via getdents(3)
>> until the directory is re-opened or the process does an
>> lseek(fd, 0, SEEK_SET).
> 
> Yes.
> 
>> That is the proper and expected behavior. I suspect you
>> will see exactly that behavior with ext4 and 32-bit
>> directory offsets, for example.
> 
> Emm...
> 
> For this case like this:
> 
> 1. mkdir /tmp/dir and touch /tmp/dir/file1 /tmp/dir/file2
> 2. open /tmp/dir with fd1
> 3. readdir and get /tmp/dir/file1
> 4. rm /tmp/dir/file2
> 5. touch /tmp/dir/file2
> 4. loop 4~5 for 2^32 times
> 5. readdir /tmp/dir with fd1
> 
> For tmpfs now, we may see no /tmp/dir/file2, since the offset has been overflow, for ext4 it is ok... So we think this will be a problem.
> 
>> Does that not directly address your concern? Or do you
>> mean that Erkun's patch introduces a new issue?
> 
> Yes, to be honest, my personal feeling is a problem. But for 64bit, it may never been trigger.

Thanks for confirming.

In that case, the preferred way to handle it is to fix
the issue in upstream, and then backport that fix to LTS.
Dependence on 64-bit offsets to avoid a failure case
should be considered a workaround, not a real fix, IMHO.

Do you have a few moments to address it, or if not I
will see to it.

I think reducing the xa_limit in simple_offset_add() to,
say, 2..16 would make the reproducer fire almost
immediately.


>> If there is a problem here, please construct a reproducer
>> against this patch set and post it.
>>> Thanks,
>>> Kuai
>>> 
>>>> +
>>>>  /**
>>>>   * offset_dir_llseek - Advance the read position of a directory descriptor
>>>>   * @file: an open directory whose position is to be updated
>>>> @@ -462,6 +470,9 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>>>   */
>>>>  static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>>>  {
>>>> + struct inode *inode = file->f_inode;
>>>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>>>> +
>>>>   switch (whence) {
>>>>   case SEEK_CUR:
>>>>   offset += file->f_pos;
>>>> @@ -475,8 +486,9 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>>>   }
>>>>     /* In this case, ->private_data is protected by f_pos_lock */
>>>> - file->private_data = NULL;
>>>> - return vfs_setpos(file, offset, U32_MAX);
>>>> + if (!offset)
>>>> + file->private_data = (void *)ctx->next_offset;
>>>> + return vfs_setpos(file, offset, LONG_MAX);
>>>>  }
>>>>    static struct dentry *offset_find_next(struct xa_state *xas)
>>>> @@ -505,7 +517,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
>>>>     inode->i_ino, fs_umode_to_dtype(inode->i_mode));
>>>>  }
>>>>  -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>> +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
>>>>  {
>>>>   struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
>>>>   XA_STATE(xas, &so_ctx->xa, ctx->pos);
>>>> @@ -514,17 +526,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>>   while (true) {
>>>>   dentry = offset_find_next(&xas);
>>>>   if (!dentry)
>>>> - return ERR_PTR(-ENOENT);
>>>> + return;
>>>> +
>>>> + if (dentry2offset(dentry) >= last_index) {
>>>> + dput(dentry);
>>>> + return;
>>>> + }
>>>>     if (!offset_dir_emit(ctx, dentry)) {
>>>>   dput(dentry);
>>>> - break;
>>>> + return;
>>>>   }
>>>>     dput(dentry);
>>>>   ctx->pos = xas.xa_index + 1;
>>>>   }
>>>> - return NULL;
>>>>  }
>>>>    /**
>>>> @@ -551,22 +567,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>>  static int offset_readdir(struct file *file, struct dir_context *ctx)
>>>>  {
>>>>   struct dentry *dir = file->f_path.dentry;
>>>> + long last_index = (long)file->private_data;
>>>>     lockdep_assert_held(&d_inode(dir)->i_rwsem);
>>>>     if (!dir_emit_dots(file, ctx))
>>>>   return 0;
>>>>  - /* In this case, ->private_data is protected by f_pos_lock */
>>>> - if (ctx->pos == DIR_OFFSET_MIN)
>>>> - file->private_data = NULL;
>>>> - else if (file->private_data == ERR_PTR(-ENOENT))
>>>> - return 0;
>>>> - file->private_data = offset_iterate_dir(d_inode(dir), ctx);
>>>> + offset_iterate_dir(d_inode(dir), ctx, last_index);
>>>>   return 0;
>>>>  }
>>>>    const struct file_operations simple_offset_dir_operations = {
>>>> + .open = offset_dir_open,
>>>>   .llseek = offset_dir_llseek,
>>>>   .iterate_shared = offset_readdir,
>>>>   .read = generic_read_dir,
>> --
>> Chuck Lever


--
Chuck Lever
yangerkun Nov. 12, 2024, 3:43 a.m. UTC | #5
在 2024/11/11 23:34, Chuck Lever III 写道:
> 
> 
>> On Nov 11, 2024, at 10:20 AM, yangerkun <yangerkun@huaweicloud.com> wrote:
>>
>>
>>
>> 在 2024/11/11 22:39, Chuck Lever III 写道:
>>>> On Nov 10, 2024, at 9:36 PM, Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>>
>>>> Hi,
>>>>
>>>> 在 2024/11/11 8:52, cel@kernel.org 写道:
>>>>> From: yangerkun <yangerkun@huawei.com>
>>>>> [ Upstream commit 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a ]
>>>>> After we switch tmpfs dir operations from simple_dir_operations to
>>>>> simple_offset_dir_operations, every rename happened will fill new dentry
>>>>> to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free
>>>>> key starting with octx->newx_offset, and then set newx_offset equals to
>>>>> free key + 1. This will lead to infinite readdir combine with rename
>>>>> happened at the same time, which fail generic/736 in xfstests(detail show
>>>>> as below).
>>>>> 1. create 5000 files(1 2 3...) under one dir
>>>>> 2. call readdir(man 3 readdir) once, and get one entry
>>>>> 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry)
>>>>> 4. loop 2~3, until readdir return nothing or we loop too many
>>>>>     times(tmpfs break test with the second condition)
>>>>> We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite
>>>>> directory reads") to fix it, record the last_index when we open dir, and
>>>>> do not emit the entry which index >= last_index. The file->private_data
>>>>
>>>> Please notice this requires last_index should never overflow, otherwise
>>>> readdir will be messed up.
>>> It would help your cause if you could be more specific
>>> than "messed up".
>>>>> now used in offset dir can use directly to do this, and we also update
>>>>> the last_index when we llseek the dir file.
>>>>> Fixes: a2e459555c5f ("shmem: stable directory offsets")
>>>>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>>>>> Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com
>>>>> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
>>>>> [brauner: only update last_index after seek when offset is zero like Jan suggested]
>>>>> Signed-off-by: Christian Brauner <brauner@kernel.org>
>>>>> Link: https://nvd.nist.gov/vuln/detail/CVE-2024-46701
>>>>> [ cel: adjusted to apply to origin/linux-6.6.y ]
>>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>>>> ---
>>>>>   fs/libfs.c | 37 +++++++++++++++++++++++++------------
>>>>>   1 file changed, 25 insertions(+), 12 deletions(-)
>>>>> diff --git a/fs/libfs.c b/fs/libfs.c
>>>>> index a87005c89534..b59ff0dfea1f 100644
>>>>> --- a/fs/libfs.c
>>>>> +++ b/fs/libfs.c
>>>>> @@ -449,6 +449,14 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>>>>    xa_destroy(&octx->xa);
>>>>>   }
>>>>>   +static int offset_dir_open(struct inode *inode, struct file *file)
>>>>> +{
>>>>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>>>>> +
>>>>> + file->private_data = (void *)ctx->next_offset;
>>>>> + return 0;
>>>>> +}
>>>>
>>>> Looks like xarray is still used.
>>> That's not going to change, as several folks have already
>>> explained.
>>>> I'm in the cc list ,so I assume you saw my set, then I don't know why
>>>> you're ignoring my concerns.
>>>> 1) next_offset is 32-bit and can overflow in a long-time running
>>>> machine.
>>>> 2) Once next_offset overflows, readdir will skip the files that offset
>>>> is bigger.
>>
>> I'm sorry, I'm a little busy these days, so I haven't responded to this
>> series of emails.
>>
>>> In that case, that entry won't be visible via getdents(3)
>>> until the directory is re-opened or the process does an
>>> lseek(fd, 0, SEEK_SET).
>>
>> Yes.
>>
>>> That is the proper and expected behavior. I suspect you
>>> will see exactly that behavior with ext4 and 32-bit
>>> directory offsets, for example.
>>
>> Emm...
>>
>> For this case like this:
>>
>> 1. mkdir /tmp/dir and touch /tmp/dir/file1 /tmp/dir/file2
>> 2. open /tmp/dir with fd1
>> 3. readdir and get /tmp/dir/file1
>> 4. rm /tmp/dir/file2
>> 5. touch /tmp/dir/file2
>> 4. loop 4~5 for 2^32 times
>> 5. readdir /tmp/dir with fd1
>>
>> For tmpfs now, we may see no /tmp/dir/file2, since the offset has been overflow, for ext4 it is ok... So we think this will be a problem.
>>
>>> Does that not directly address your concern? Or do you
>>> mean that Erkun's patch introduces a new issue?
>>
>> Yes, to be honest, my personal feeling is a problem. But for 64bit, it may never been trigger.
> 
> Thanks for confirming.
> 
> In that case, the preferred way to handle it is to fix
> the issue in upstream, and then backport that fix to LTS.
> Dependence on 64-bit offsets to avoid a failure case
> should be considered a workaround, not a real fix, IMHO.

Yes.

> 
> Do you have a few moments to address it, or if not I
> will see to it.

You can try to do this, for the reason I am quite busy now until end of 
this month... Sorry.

> 
> I think reducing the xa_limit in simple_offset_add() to,
> say, 2..16 would make the reproducer fire almost
> immediately.

Yes.

> 
> 
>>> If there is a problem here, please construct a reproducer
>>> against this patch set and post it.
>>>> Thanks,
>>>> Kuai
>>>>
>>>>> +
>>>>>   /**
>>>>>    * offset_dir_llseek - Advance the read position of a directory descriptor
>>>>>    * @file: an open directory whose position is to be updated
>>>>> @@ -462,6 +470,9 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>>>>    */
>>>>>   static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>>>>   {
>>>>> + struct inode *inode = file->f_inode;
>>>>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>>>>> +
>>>>>    switch (whence) {
>>>>>    case SEEK_CUR:
>>>>>    offset += file->f_pos;
>>>>> @@ -475,8 +486,9 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>>>>    }
>>>>>      /* In this case, ->private_data is protected by f_pos_lock */
>>>>> - file->private_data = NULL;
>>>>> - return vfs_setpos(file, offset, U32_MAX);
>>>>> + if (!offset)
>>>>> + file->private_data = (void *)ctx->next_offset;
>>>>> + return vfs_setpos(file, offset, LONG_MAX);
>>>>>   }
>>>>>     static struct dentry *offset_find_next(struct xa_state *xas)
>>>>> @@ -505,7 +517,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
>>>>>      inode->i_ino, fs_umode_to_dtype(inode->i_mode));
>>>>>   }
>>>>>   -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>>> +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
>>>>>   {
>>>>>    struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
>>>>>    XA_STATE(xas, &so_ctx->xa, ctx->pos);
>>>>> @@ -514,17 +526,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>>>    while (true) {
>>>>>    dentry = offset_find_next(&xas);
>>>>>    if (!dentry)
>>>>> - return ERR_PTR(-ENOENT);
>>>>> + return;
>>>>> +
>>>>> + if (dentry2offset(dentry) >= last_index) {
>>>>> + dput(dentry);
>>>>> + return;
>>>>> + }
>>>>>      if (!offset_dir_emit(ctx, dentry)) {
>>>>>    dput(dentry);
>>>>> - break;
>>>>> + return;
>>>>>    }
>>>>>      dput(dentry);
>>>>>    ctx->pos = xas.xa_index + 1;
>>>>>    }
>>>>> - return NULL;
>>>>>   }
>>>>>     /**
>>>>> @@ -551,22 +567,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>>>   static int offset_readdir(struct file *file, struct dir_context *ctx)
>>>>>   {
>>>>>    struct dentry *dir = file->f_path.dentry;
>>>>> + long last_index = (long)file->private_data;
>>>>>      lockdep_assert_held(&d_inode(dir)->i_rwsem);
>>>>>      if (!dir_emit_dots(file, ctx))
>>>>>    return 0;
>>>>>   - /* In this case, ->private_data is protected by f_pos_lock */
>>>>> - if (ctx->pos == DIR_OFFSET_MIN)
>>>>> - file->private_data = NULL;
>>>>> - else if (file->private_data == ERR_PTR(-ENOENT))
>>>>> - return 0;
>>>>> - file->private_data = offset_iterate_dir(d_inode(dir), ctx);
>>>>> + offset_iterate_dir(d_inode(dir), ctx, last_index);
>>>>>    return 0;
>>>>>   }
>>>>>     const struct file_operations simple_offset_dir_operations = {
>>>>> + .open = offset_dir_open,
>>>>>    .llseek = offset_dir_llseek,
>>>>>    .iterate_shared = offset_readdir,
>>>>>    .read = generic_read_dir,
>>> --
>>> Chuck Lever
> 
> 
> --
> Chuck Lever
> 
>
Chuck Lever III Nov. 12, 2024, 3:37 p.m. UTC | #6
> On Nov 11, 2024, at 10:43 PM, yangerkun <yangerkun@huaweicloud.com> wrote:
> 
> 
> 
> 在 2024/11/11 23:34, Chuck Lever III 写道:
>>> On Nov 11, 2024, at 10:20 AM, yangerkun <yangerkun@huaweicloud.com> wrote:
>>> 
>>> 
>>> 
>>> 在 2024/11/11 22:39, Chuck Lever III 写道:
>>>>> On Nov 10, 2024, at 9:36 PM, Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>>> 
>>>>> Hi,
>>>>> 
>>>>> 在 2024/11/11 8:52, cel@kernel.org 写道:
>>>>>> From: yangerkun <yangerkun@huawei.com>
>>>>>> [ Upstream commit 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a ]
>>>>>> After we switch tmpfs dir operations from simple_dir_operations to
>>>>>> simple_offset_dir_operations, every rename happened will fill new dentry
>>>>>> to dest dir's maple tree(&SHMEM_I(inode)->dir_offsets->mt) with a free
>>>>>> key starting with octx->newx_offset, and then set newx_offset equals to
>>>>>> free key + 1. This will lead to infinite readdir combine with rename
>>>>>> happened at the same time, which fail generic/736 in xfstests(detail show
>>>>>> as below).
>>>>>> 1. create 5000 files(1 2 3...) under one dir
>>>>>> 2. call readdir(man 3 readdir) once, and get one entry
>>>>>> 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry)
>>>>>> 4. loop 2~3, until readdir return nothing or we loop too many
>>>>>>    times(tmpfs break test with the second condition)
>>>>>> We choose the same logic what commit 9b378f6ad48cf ("btrfs: fix infinite
>>>>>> directory reads") to fix it, record the last_index when we open dir, and
>>>>>> do not emit the entry which index >= last_index. The file->private_data
>>>>> 
>>>>> Please notice this requires last_index should never overflow, otherwise
>>>>> readdir will be messed up.
>>>> It would help your cause if you could be more specific
>>>> than "messed up".
>>>>>> now used in offset dir can use directly to do this, and we also update
>>>>>> the last_index when we llseek the dir file.
>>>>>> Fixes: a2e459555c5f ("shmem: stable directory offsets")
>>>>>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>>>>>> Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com
>>>>>> Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
>>>>>> [brauner: only update last_index after seek when offset is zero like Jan suggested]
>>>>>> Signed-off-by: Christian Brauner <brauner@kernel.org>
>>>>>> Link: https://nvd.nist.gov/vuln/detail/CVE-2024-46701
>>>>>> [ cel: adjusted to apply to origin/linux-6.6.y ]
>>>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>>>>> ---
>>>>>>  fs/libfs.c | 37 +++++++++++++++++++++++++------------
>>>>>>  1 file changed, 25 insertions(+), 12 deletions(-)
>>>>>> diff --git a/fs/libfs.c b/fs/libfs.c
>>>>>> index a87005c89534..b59ff0dfea1f 100644
>>>>>> --- a/fs/libfs.c
>>>>>> +++ b/fs/libfs.c
>>>>>> @@ -449,6 +449,14 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>>>>>   xa_destroy(&octx->xa);
>>>>>>  }
>>>>>>  +static int offset_dir_open(struct inode *inode, struct file *file)
>>>>>> +{
>>>>>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>>>>>> +
>>>>>> + file->private_data = (void *)ctx->next_offset;
>>>>>> + return 0;
>>>>>> +}
>>>>> 
>>>>> Looks like xarray is still used.
>>>> That's not going to change, as several folks have already
>>>> explained.
>>>>> I'm in the cc list ,so I assume you saw my set, then I don't know why
>>>>> you're ignoring my concerns.
>>>>> 1) next_offset is 32-bit and can overflow in a long-time running
>>>>> machine.
>>>>> 2) Once next_offset overflows, readdir will skip the files that offset
>>>>> is bigger.
>>> 
>>> I'm sorry, I'm a little busy these days, so I haven't responded to this
>>> series of emails.
>>> 
>>>> In that case, that entry won't be visible via getdents(3)
>>>> until the directory is re-opened or the process does an
>>>> lseek(fd, 0, SEEK_SET).
>>> 
>>> Yes.
>>> 
>>>> That is the proper and expected behavior. I suspect you
>>>> will see exactly that behavior with ext4 and 32-bit
>>>> directory offsets, for example.
>>> 
>>> Emm...
>>> 
>>> For this case like this:
>>> 
>>> 1. mkdir /tmp/dir and touch /tmp/dir/file1 /tmp/dir/file2
>>> 2. open /tmp/dir with fd1
>>> 3. readdir and get /tmp/dir/file1
>>> 4. rm /tmp/dir/file2
>>> 5. touch /tmp/dir/file2
>>> 4. loop 4~5 for 2^32 times
>>> 5. readdir /tmp/dir with fd1
>>> 
>>> For tmpfs now, we may see no /tmp/dir/file2, since the offset has been overflow, for ext4 it is ok... So we think this will be a problem.
>>> 
>>>> Does that not directly address your concern? Or do you
>>>> mean that Erkun's patch introduces a new issue?
>>> 
>>> Yes, to be honest, my personal feeling is a problem. But for 64bit, it may never been trigger.
>> Thanks for confirming.
>> In that case, the preferred way to handle it is to fix
>> the issue in upstream, and then backport that fix to LTS.
>> Dependence on 64-bit offsets to avoid a failure case
>> should be considered a workaround, not a real fix, IMHO.
> 
> Yes.
> 
>> Do you have a few moments to address it, or if not I
>> will see to it.
> 
> You can try to do this, for the reason I am quite busy now until end of this month... Sorry.

No worries!


>> I think reducing the xa_limit in simple_offset_add() to,
>> say, 2..16 would make the reproducer fire almost
>> immediately.
> 
> Yes.
> 
>>>> If there is a problem here, please construct a reproducer
>>>> against this patch set and post it.
>>>>> Thanks,
>>>>> Kuai
>>>>> 
>>>>>> +
>>>>>>  /**
>>>>>>   * offset_dir_llseek - Advance the read position of a directory descriptor
>>>>>>   * @file: an open directory whose position is to be updated
>>>>>> @@ -462,6 +470,9 @@ void simple_offset_destroy(struct offset_ctx *octx)
>>>>>>   */
>>>>>>  static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>>>>>  {
>>>>>> + struct inode *inode = file->f_inode;
>>>>>> + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
>>>>>> +
>>>>>>   switch (whence) {
>>>>>>   case SEEK_CUR:
>>>>>>   offset += file->f_pos;
>>>>>> @@ -475,8 +486,9 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>>>>>>   }
>>>>>>     /* In this case, ->private_data is protected by f_pos_lock */
>>>>>> - file->private_data = NULL;
>>>>>> - return vfs_setpos(file, offset, U32_MAX);
>>>>>> + if (!offset)
>>>>>> + file->private_data = (void *)ctx->next_offset;
>>>>>> + return vfs_setpos(file, offset, LONG_MAX);
>>>>>>  }
>>>>>>    static struct dentry *offset_find_next(struct xa_state *xas)
>>>>>> @@ -505,7 +517,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
>>>>>>     inode->i_ino, fs_umode_to_dtype(inode->i_mode));
>>>>>>  }
>>>>>>  -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>>>> +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
>>>>>>  {
>>>>>>   struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
>>>>>>   XA_STATE(xas, &so_ctx->xa, ctx->pos);
>>>>>> @@ -514,17 +526,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>>>>   while (true) {
>>>>>>   dentry = offset_find_next(&xas);
>>>>>>   if (!dentry)
>>>>>> - return ERR_PTR(-ENOENT);
>>>>>> + return;
>>>>>> +
>>>>>> + if (dentry2offset(dentry) >= last_index) {
>>>>>> + dput(dentry);
>>>>>> + return;
>>>>>> + }
>>>>>>     if (!offset_dir_emit(ctx, dentry)) {
>>>>>>   dput(dentry);
>>>>>> - break;
>>>>>> + return;
>>>>>>   }
>>>>>>     dput(dentry);
>>>>>>   ctx->pos = xas.xa_index + 1;
>>>>>>   }
>>>>>> - return NULL;
>>>>>>  }
>>>>>>    /**
>>>>>> @@ -551,22 +567,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
>>>>>>  static int offset_readdir(struct file *file, struct dir_context *ctx)
>>>>>>  {
>>>>>>   struct dentry *dir = file->f_path.dentry;
>>>>>> + long last_index = (long)file->private_data;
>>>>>>     lockdep_assert_held(&d_inode(dir)->i_rwsem);
>>>>>>     if (!dir_emit_dots(file, ctx))
>>>>>>   return 0;
>>>>>>  - /* In this case, ->private_data is protected by f_pos_lock */
>>>>>> - if (ctx->pos == DIR_OFFSET_MIN)
>>>>>> - file->private_data = NULL;
>>>>>> - else if (file->private_data == ERR_PTR(-ENOENT))
>>>>>> - return 0;
>>>>>> - file->private_data = offset_iterate_dir(d_inode(dir), ctx);
>>>>>> + offset_iterate_dir(d_inode(dir), ctx, last_index);
>>>>>>   return 0;
>>>>>>  }
>>>>>>    const struct file_operations simple_offset_dir_operations = {
>>>>>> + .open = offset_dir_open,
>>>>>>   .llseek = offset_dir_llseek,
>>>>>>   .iterate_shared = offset_readdir,
>>>>>>   .read = generic_read_dir,
>>>> --
>>>> Chuck Lever
>> --
>> Chuck Lever


--
Chuck Lever
Chuck Lever III Nov. 13, 2024, 3:17 p.m. UTC | #7
On Mon, Nov 11, 2024 at 11:20:17PM +0800, yangerkun wrote:
> 
> 
> 在 2024/11/11 22:39, Chuck Lever III 写道:
> > 
> > 
> > > On Nov 10, 2024, at 9:36 PM, Yu Kuai <yukuai1@huaweicloud.com> wrote:
> > > I'm in the cc list ,so I assume you saw my set, then I don't know why
> > > you're ignoring my concerns.
> > > 1) next_offset is 32-bit and can overflow in a long-time running
> > > machine.
> > > 2) Once next_offset overflows, readdir will skip the files that offset
> > > is bigger.
> 
> I'm sorry, I'm a little busy these days, so I haven't responded to this
> series of emails.
> 
> > In that case, that entry won't be visible via getdents(3)
> > until the directory is re-opened or the process does an
> > lseek(fd, 0, SEEK_SET).
> 
> Yes.
> 
> > 
> > That is the proper and expected behavior. I suspect you
> > will see exactly that behavior with ext4 and 32-bit
> > directory offsets, for example.
> 
> Emm...
> 
> For this case like this:
> 
> 1. mkdir /tmp/dir and touch /tmp/dir/file1 /tmp/dir/file2
> 2. open /tmp/dir with fd1
> 3. readdir and get /tmp/dir/file1
> 4. rm /tmp/dir/file2
> 5. touch /tmp/dir/file2
> 4. loop 4~5 for 2^32 times
> 5. readdir /tmp/dir with fd1
> 
> For tmpfs now, we may see no /tmp/dir/file2, since the offset has been
> overflow, for ext4 it is ok... So we think this will be a problem.

I constructed a simple test program using the above steps:

/*
 * 1. mkdir /tmp/dir and touch /tmp/dir/file1 /tmp/dir/file2
 * 2. open /tmp/dir with fd1
 * 3. readdir and get /tmp/dir/file1
 * 4. rm /tmp/dir/file2
 * 5. touch /tmp/dir/file2
 * 6. loop 4~5 for 2^32 times
 * 7. readdir /tmp/dir with fd1
 */

#include <sys/types.h>
#include <sys/stat.h>

#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static void list_directory(DIR *dirp)
{
	struct dirent *de;

	errno = 0;
	do {
		de = readdir(dirp);
		if (!de)
			break;

		printf("d_off:  %lld\n", de->d_off);
		printf("d_name: %s\n", de->d_name);
	} while (true);

	if (errno)
		perror("readdir");
	else
		printf("EOD\n");
}

int main(int argc, char **argv)
{
	unsigned long i;
	DIR *dirp;
	int ret;

	/* 1. */
	ret = mkdir("/tmp/dir", 0755);
	if (ret < 0) {
		perror("mkdir");
		return 1;
	}

	ret = creat("/tmp/dir/file1", 0644);
	if (ret < 0) {
		perror("creat");
		return 1;
	}
	close(ret);

	ret = creat("/tmp/dir/file2", 0644);
	if (ret < 0) {
		perror("creat");
		return 1;
	}
	close(ret);

	/* 2. */
	errno = 0;
	dirp = opendir("/tmp/dir");
	if (!dirp) {
		if (errno)
			perror("opendir");
		else
			fprintf(stderr, "EOD\n");
		closedir(dirp);
		return 1;
	}

	/* 3. */
	errno = 0;
	do {
		struct dirent *de;

		de = readdir(dirp);
		if (!de) {
			if (errno) {
				perror("readdir");
				closedir(dirp);
				return 1;
			}
			break;
		}
		if (strcmp(de->d_name, "file1") == 0) {
			printf("Found 'file1'\n");
			break;
		}
	} while (true);

	/* run the test. */
	for (i = 0; i < 10000; i++) {
		/* 4. */
		ret = unlink("/tmp/dir/file2");
		if (ret < 0) {
			perror("unlink");
			closedir(dirp);
			return 1;
		}

		/* 5. */
		ret = creat("/tmp/dir/file2", 0644);
		if (ret < 0) {
			perror("creat");
			fprintf(stderr, "i = %lu\n", i);
			closedir(dirp);
			return 1;
		}
		close(ret);
	}

	/* 7. */
	printf("\ndirectory after test:\n");
	list_directory(dirp);

	/* cel. */
	rewinddir(dirp);
	printf("\ndirectory after rewind:\n");
	list_directory(dirp);

	closedir(dirp);
	return 0;
}


> > Does that not directly address your concern? Or do you
> > mean that Erkun's patch introduces a new issue?
> 
> Yes, to be honest, my personal feeling is a problem. But for 64bit, it may
> never been trigger.

I ran the test program above on this kernel:

https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git/log/?h=nfsd-testing

Note that it has a patch to restrict the range of directory offset
values for tmpfs to 2..4096.

I did not observe any unexpected behavior after the offset values
wrapped. At step 7, I can always see file2, and its offset is always
4. At step "cel" I can see all expected directory entries.

I tested on v6.12-rc7 with the same range restriction but using
Maple tree and 64-bit offsets. No unexpected behavior there either.

So either we're still missing something, or there is no problem. My
only theory is maybe it's an issue with an implicit integer sign
conversion, and we should restrict the offset range to 2..S32_MAX.

I can try testing with a range of (U32_MAX - 4096)..(U32_MAX).


> > If there is a problem here, please construct a reproducer
> > against this patch set and post it.

Invitation still stands: if you have a solid reproducer, please post
it.
Yu Kuai Nov. 16, 2024, 7:22 a.m. UTC | #8
Hi,

在 2024/11/13 23:17, Chuck Lever 写道:
> On Mon, Nov 11, 2024 at 11:20:17PM +0800, yangerkun wrote:
>>
>>
>> 在 2024/11/11 22:39, Chuck Lever III 写道:
>>>
>>>
>>>> On Nov 10, 2024, at 9:36 PM, Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>> I'm in the cc list ,so I assume you saw my set, then I don't know why
>>>> you're ignoring my concerns.
>>>> 1) next_offset is 32-bit and can overflow in a long-time running
>>>> machine.
>>>> 2) Once next_offset overflows, readdir will skip the files that offset
>>>> is bigger.
>>
>> I'm sorry, I'm a little busy these days, so I haven't responded to this
>> series of emails.
>>
>>> In that case, that entry won't be visible via getdents(3)
>>> until the directory is re-opened or the process does an
>>> lseek(fd, 0, SEEK_SET).
>>
>> Yes.
>>
>>>
>>> That is the proper and expected behavior. I suspect you
>>> will see exactly that behavior with ext4 and 32-bit
>>> directory offsets, for example.
>>
>> Emm...
>>
>> For this case like this:
>>
>> 1. mkdir /tmp/dir and touch /tmp/dir/file1 /tmp/dir/file2
>> 2. open /tmp/dir with fd1
>> 3. readdir and get /tmp/dir/file1
>> 4. rm /tmp/dir/file2
>> 5. touch /tmp/dir/file2
>> 4. loop 4~5 for 2^32 times
>> 5. readdir /tmp/dir with fd1
>>
>> For tmpfs now, we may see no /tmp/dir/file2, since the offset has been
>> overflow, for ext4 it is ok... So we think this will be a problem.
> 
> I constructed a simple test program using the above steps:
> 
> /*
>   * 1. mkdir /tmp/dir and touch /tmp/dir/file1 /tmp/dir/file2
>   * 2. open /tmp/dir with fd1
>   * 3. readdir and get /tmp/dir/file1
>   * 4. rm /tmp/dir/file2
>   * 5. touch /tmp/dir/file2
>   * 6. loop 4~5 for 2^32 times
>   * 7. readdir /tmp/dir with fd1
>   */
> 
> #include <sys/types.h>
> #include <sys/stat.h>
> 
> #include <dirent.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <stdbool.h>
> #include <stdio.h>
> #include <string.h>
> 
> static void list_directory(DIR *dirp)
> {
> 	struct dirent *de;
> 
> 	errno = 0;
> 	do {
> 		de = readdir(dirp);
> 		if (!de)
> 			break;
> 
> 		printf("d_off:  %lld\n", de->d_off);
> 		printf("d_name: %s\n", de->d_name);
> 	} while (true);
> 
> 	if (errno)
> 		perror("readdir");
> 	else
> 		printf("EOD\n");
> }
> 
> int main(int argc, char **argv)
> {
> 	unsigned long i;
> 	DIR *dirp;
> 	int ret;
> 
> 	/* 1. */
> 	ret = mkdir("/tmp/dir", 0755);
> 	if (ret < 0) {
> 		perror("mkdir");
> 		return 1;
> 	}
> 
> 	ret = creat("/tmp/dir/file1", 0644);
> 	if (ret < 0) {
> 		perror("creat");
> 		return 1;
> 	}
> 	close(ret);
> 
> 	ret = creat("/tmp/dir/file2", 0644);
> 	if (ret < 0) {
> 		perror("creat");
> 		return 1;
> 	}
> 	close(ret);
> 
> 	/* 2. */
> 	errno = 0;
> 	dirp = opendir("/tmp/dir");
> 	if (!dirp) {
> 		if (errno)
> 			perror("opendir");
> 		else
> 			fprintf(stderr, "EOD\n");
> 		closedir(dirp);
> 		return 1;
> 	}
> 
> 	/* 3. */
> 	errno = 0;
> 	do {
> 		struct dirent *de;
> 
> 		de = readdir(dirp);
> 		if (!de) {
> 			if (errno) {
> 				perror("readdir");
> 				closedir(dirp);
> 				return 1;
> 			}
> 			break;
> 		}
> 		if (strcmp(de->d_name, "file1") == 0) {
> 			printf("Found 'file1'\n");
> 			break;
> 		}
> 	} while (true);
> 
> 	/* run the test. */
> 	for (i = 0; i < 10000; i++) {
> 		/* 4. */
> 		ret = unlink("/tmp/dir/file2");
> 		if (ret < 0) {
> 			perror("unlink");
> 			closedir(dirp);
> 			return 1;
> 		}
> 
> 		/* 5. */
> 		ret = creat("/tmp/dir/file2", 0644);
> 		if (ret < 0) {
> 			perror("creat");
> 			fprintf(stderr, "i = %lu\n", i);
> 			closedir(dirp);
> 			return 1;
> 		}
> 		close(ret);
> 	}
> 
> 	/* 7. */
> 	printf("\ndirectory after test:\n");
> 	list_directory(dirp);
> 
> 	/* cel. */
> 	rewinddir(dirp);
> 	printf("\ndirectory after rewind:\n");
> 	list_directory(dirp);
> 
> 	closedir(dirp);
> 	return 0;
> }
> 
> 
>>> Does that not directly address your concern? Or do you
>>> mean that Erkun's patch introduces a new issue?
>>
>> Yes, to be honest, my personal feeling is a problem. But for 64bit, it may
>> never been trigger.
> 
> I ran the test program above on this kernel:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git/log/?h=nfsd-testing
> 
> Note that it has a patch to restrict the range of directory offset
> values for tmpfs to 2..4096.
> 
> I did not observe any unexpected behavior after the offset values
> wrapped. At step 7, I can always see file2, and its offset is always
> 4. At step "cel" I can see all expected directory entries.

Then, do you investigate more or not?
> 
> I tested on v6.12-rc7 with the same range restriction but using
> Maple tree and 64-bit offsets. No unexpected behavior there either.
> 
> So either we're still missing something, or there is no problem. My
> only theory is maybe it's an issue with an implicit integer sign
> conversion, and we should restrict the offset range to 2..S32_MAX.
> 
> I can try testing with a range of (U32_MAX - 4096)..(U32_MAX).

You can try the following reproducer, it's much simpler. First, apply
following patch(on latest kernel):

diff --git a/fs/libfs.c b/fs/libfs.c
index a168ece5cc61..7c1a5982a0c8 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -291,7 +291,7 @@ int simple_offset_add(struct offset_ctx *octx, 
struct dentry *dentry)
                 return -EBUSY;

         ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, 
DIR_OFFSET_MIN,
-                                LONG_MAX, &octx->next_offset, GFP_KERNEL);
+                                256, &octx->next_offset, GFP_KERNEL);
         if (ret < 0)
                 return ret;


Then, create a new tmpfs dir, inside the dir:

[root@fedora test-libfs]# for ((i=0; i<256; ++i)); do touch $i; done
touch: cannot touch '255': Device or resource busy
[root@fedora test-libfs]# ls
0    103  109  114  12   125  130  136  141  147  152  158  163  169 
174  18   185  190  196  200  206  211  217  222  228  233  239  244  25 
   26  31  37  42  48  53  59  64  7   75  80  86  91  97
1    104  11   115  120  126  131  137  142  148  153  159  164  17 
175  180  186  191  197  201  207  212  218  223  229  234  24   245 
250  27  32  38  43  49  54  6   65  70  76  81  87  92  98
10   105  110  116  121  127  132  138  143  149  154  16   165  170 
176  181  187  192  198  202  208  213  219  224  23   235  240  246 
251  28  33  39  44  5   55  60  66  71  77  82  88  93  99
100  106  111  117  122  128  133  139  144  15   155  160  166  171 
177  182  188  193  199  203  209  214  22   225  230  236  241  247 
252  29  34  4   45  50  56  61  67  72  78  83  89  94
101  107  112  118  123  129  134  14   145  150  156  161  167  172 
178  183  189  194  2    204  21   215  220  226  231  237  242  248 
253  3   35  40  46  51  57  62  68  73  79  84  9   95
102  108  113  119  124  13   135  140  146  151  157  162  168  173 
179  184  19   195  20   205  210  216  221  227  232  238  243  249 
254  30  36  41  47  52  58  63  69  74  8   85  90  96
[root@fedora test-libfs]# rm -f 0
[root@fedora test-libfs]# touch 255
[root@fedora test-libfs]# ls
255
[root@fedora test-libfs]#

I don't think I have to explain why the second ls can only show the file
255...

Thanks,
Kuai

> 
> 
>>> If there is a problem here, please construct a reproducer
>>> against this patch set and post it.
> 
> Invitation still stands: if you have a solid reproducer, please post
> it.
> 
>
diff mbox series

Patch

diff --git a/fs/libfs.c b/fs/libfs.c
index a87005c89534..b59ff0dfea1f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -449,6 +449,14 @@  void simple_offset_destroy(struct offset_ctx *octx)
 	xa_destroy(&octx->xa);
 }
 
+static int offset_dir_open(struct inode *inode, struct file *file)
+{
+	struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
+
+	file->private_data = (void *)ctx->next_offset;
+	return 0;
+}
+
 /**
  * offset_dir_llseek - Advance the read position of a directory descriptor
  * @file: an open directory whose position is to be updated
@@ -462,6 +470,9 @@  void simple_offset_destroy(struct offset_ctx *octx)
  */
 static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
 {
+	struct inode *inode = file->f_inode;
+	struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
+
 	switch (whence) {
 	case SEEK_CUR:
 		offset += file->f_pos;
@@ -475,8 +486,9 @@  static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
 	}
 
 	/* In this case, ->private_data is protected by f_pos_lock */
-	file->private_data = NULL;
-	return vfs_setpos(file, offset, U32_MAX);
+	if (!offset)
+		file->private_data = (void *)ctx->next_offset;
+	return vfs_setpos(file, offset, LONG_MAX);
 }
 
 static struct dentry *offset_find_next(struct xa_state *xas)
@@ -505,7 +517,7 @@  static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
 			  inode->i_ino, fs_umode_to_dtype(inode->i_mode));
 }
 
-static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
+static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
 {
 	struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
 	XA_STATE(xas, &so_ctx->xa, ctx->pos);
@@ -514,17 +526,21 @@  static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
 	while (true) {
 		dentry = offset_find_next(&xas);
 		if (!dentry)
-			return ERR_PTR(-ENOENT);
+			return;
+
+		if (dentry2offset(dentry) >= last_index) {
+			dput(dentry);
+			return;
+		}
 
 		if (!offset_dir_emit(ctx, dentry)) {
 			dput(dentry);
-			break;
+			return;
 		}
 
 		dput(dentry);
 		ctx->pos = xas.xa_index + 1;
 	}
-	return NULL;
 }
 
 /**
@@ -551,22 +567,19 @@  static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
 static int offset_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct dentry *dir = file->f_path.dentry;
+	long last_index = (long)file->private_data;
 
 	lockdep_assert_held(&d_inode(dir)->i_rwsem);
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
 
-	/* In this case, ->private_data is protected by f_pos_lock */
-	if (ctx->pos == DIR_OFFSET_MIN)
-		file->private_data = NULL;
-	else if (file->private_data == ERR_PTR(-ENOENT))
-		return 0;
-	file->private_data = offset_iterate_dir(d_inode(dir), ctx);
+	offset_iterate_dir(d_inode(dir), ctx, last_index);
 	return 0;
 }
 
 const struct file_operations simple_offset_dir_operations = {
+	.open		= offset_dir_open,
 	.llseek		= offset_dir_llseek,
 	.iterate_shared	= offset_readdir,
 	.read		= generic_read_dir,