diff mbox series

[3/3] io_uring: add support for getdents

Message ID 20230711114027.59945-4-hao.xu@linux.dev (mailing list archive)
State New
Headers show
Series io_uring getdents | expand

Commit Message

Hao Xu July 11, 2023, 11:40 a.m. UTC
From: Hao Xu <howeyxu@tencent.com>

This adds support for getdents64 to io_uring, acting exactly like the
syscall: the directory is iterated from its current position as
stored in the file struct, and the file's position is updated exactly as
if getdents64 had been called.

For filesystems that support NOWAIT in iterate_shared(), try to use it
first; if a user already knows the filesystem they use does not support
nowait, they can force async through IOSQE_ASYNC in the sqe flags,
avoiding the need to bounce back through a useless EAGAIN return.

Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
Signed-off-by: Hao Xu <howeyxu@tencent.com>
---
 include/uapi/linux/io_uring.h |  7 ++++
 io_uring/fs.c                 | 60 +++++++++++++++++++++++++++++++++++
 io_uring/fs.h                 |  3 ++
 io_uring/opdef.c              |  8 +++++
 4 files changed, 78 insertions(+)

Comments

Dominique Martinet July 11, 2023, 12:15 p.m. UTC | #1
Hao Xu wrote on Tue, Jul 11, 2023 at 07:40:27PM +0800:
> diff --git a/io_uring/fs.c b/io_uring/fs.c
> index f6a69a549fd4..77f00577e09c 100644
> --- a/io_uring/fs.c
> +++ b/io_uring/fs.c
> @@ -291,3 +298,56 @@ void io_link_cleanup(struct io_kiocb *req)
>  	putname(sl->oldpath);
>  	putname(sl->newpath);
>  }
> +
> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> +
> +	if (READ_ONCE(sqe->off) != 0)
> +		return -EINVAL;
> +
> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
> +	gd->count = READ_ONCE(sqe->len);
> +
> +	return 0;
> +}
> +
> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> +{
> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> +	struct file *file;
> +	unsigned long getdents_flags = 0;
> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
> +	bool should_lock = false;
> +	int ret;
> +
> +	if (force_nonblock) {
> +		if (!(req->file->f_mode & FMODE_NOWAIT))
> +			return -EAGAIN;
> +
> +		getdents_flags = DIR_CONTEXT_F_NOWAIT;
> +	}
> +
> +	file = req->file;
> +	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {

If file is NULL here things will just blow up in vfs_getdents anyway,
let's remove the useless check

> +		if (file_count(file) > 1)

I was curious about this so I found it's basically what __fdget_pos does
before deciding it should take the f_pos_lock, and as such this is
probably correct... But if someone can chime in here: what guarantees
someone else won't __fdget_pos (or equivalent through this) the file
again between this and the vfs_getdents call?
That second get would make file_count > 1 and it would lock, but lock
hadn't been taken here so the other call could get the lock without
waiting and both would process getdents or seek or whatever in
parallel.


That aside I don't see any obvious problem with this.
Hao Xu July 12, 2023, 7:53 a.m. UTC | #2
On 7/11/23 20:15, Dominique Martinet wrote:
> Hao Xu wrote on Tue, Jul 11, 2023 at 07:40:27PM +0800:
>> diff --git a/io_uring/fs.c b/io_uring/fs.c
>> index f6a69a549fd4..77f00577e09c 100644
>> --- a/io_uring/fs.c
>> +++ b/io_uring/fs.c
>> @@ -291,3 +298,56 @@ void io_link_cleanup(struct io_kiocb *req)
>>   	putname(sl->oldpath);
>>   	putname(sl->newpath);
>>   }
>> +
>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>> +
>> +	if (READ_ONCE(sqe->off) != 0)
>> +		return -EINVAL;
>> +
>> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
>> +	gd->count = READ_ONCE(sqe->len);
>> +
>> +	return 0;
>> +}
>> +
>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>> +{
>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>> +	struct file *file;
>> +	unsigned long getdents_flags = 0;
>> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>> +	bool should_lock = false;
>> +	int ret;
>> +
>> +	if (force_nonblock) {
>> +		if (!(req->file->f_mode & FMODE_NOWAIT))
>> +			return -EAGAIN;
>> +
>> +		getdents_flags = DIR_CONTEXT_F_NOWAIT;
>> +	}
>> +
>> +	file = req->file;
>> +	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
> 
> If file is NULL here things will just blow up in vfs_getdents anyway,
> let's remove the useless check
> 
>> +		if (file_count(file) > 1)
> 
> I was curious about this so I found it's basically what __fdget_pos does
> before deciding it should take the f_pos_lock, and as such this is
> probably correct... But if someone can chime in here: what guarantees
> someone else won't __fdget_pos (or equivalent through this) the file
> again between this and the vfs_getdents call?
> That second get would make file_count > 1 and it would lock, but lock
> hadn't been taken here so the other call could get the lock without
> waiting and both would process getdents or seek or whatever in
> parallel.
> 

Hi Dominique,

This file_count(file) is atomic_read, so I believe no race condition here.

> 
> That aside I don't see any obvious problem with this.
>
Hao Xu July 12, 2023, 8:01 a.m. UTC | #3
On 7/11/23 20:15, Dominique Martinet wrote:
> Hao Xu wrote on Tue, Jul 11, 2023 at 07:40:27PM +0800:
>> diff --git a/io_uring/fs.c b/io_uring/fs.c
>> index f6a69a549fd4..77f00577e09c 100644
>> --- a/io_uring/fs.c
>> +++ b/io_uring/fs.c
>> @@ -291,3 +298,56 @@ void io_link_cleanup(struct io_kiocb *req)
>>   	putname(sl->oldpath);
>>   	putname(sl->newpath);
>>   }
>> +
>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>> +
>> +	if (READ_ONCE(sqe->off) != 0)
>> +		return -EINVAL;
>> +
>> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
>> +	gd->count = READ_ONCE(sqe->len);
>> +
>> +	return 0;
>> +}
>> +
>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>> +{
>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>> +	struct file *file;
>> +	unsigned long getdents_flags = 0;
>> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>> +	bool should_lock = false;
>> +	int ret;
>> +
>> +	if (force_nonblock) {
>> +		if (!(req->file->f_mode & FMODE_NOWAIT))
>> +			return -EAGAIN;
>> +
>> +		getdents_flags = DIR_CONTEXT_F_NOWAIT;
>> +	}
>> +
>> +	file = req->file;
>> +	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
> 
> If file is NULL here things will just blow up in vfs_getdents anyway,
> let's remove the useless check

Sorry, forgot this one. Actually I think the file NULL check is not 
necessary here since it has already been done in the general code path in 
io_assign_file()


> 
>> +		if (file_count(file) > 1)
> 
> I was curious about this so I found it's basically what __fdget_pos does
> before deciding it should take the f_pos_lock, and as such this is
> probably correct... But if someone can chime in here: what guarantees
> someone else won't __fdget_pos (or equivalent through this) the file
> again between this and the vfs_getdents call?
> That second get would make file_count > 1 and it would lock, but lock
> hadn't been taken here so the other call could get the lock without
> waiting and both would process getdents or seek or whatever in
> parallel.
> 
> 
> That aside I don't see any obvious problem with this.
>
Christian Brauner July 12, 2023, 3:27 p.m. UTC | #4
On Tue, Jul 11, 2023 at 07:40:27PM +0800, Hao Xu wrote:
> From: Hao Xu <howeyxu@tencent.com>
> 
> This add support for getdents64 to io_uring, acting exactly like the
> syscall: the directory is iterated from it's current's position as
> stored in the file struct, and the file's position is updated exactly as
> if getdents64 had been called.
> 
> For filesystems that support NOWAIT in iterate_shared(), try to use it
> first; if a user already knows the filesystem they use do not support
> nowait they can force async through IOSQE_ASYNC in the sqe flags,
> avoiding the need to bounce back through a useless EAGAIN return.
> 
> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> Signed-off-by: Hao Xu <howeyxu@tencent.com>
> ---
>  include/uapi/linux/io_uring.h |  7 ++++
>  io_uring/fs.c                 | 60 +++++++++++++++++++++++++++++++++++
>  io_uring/fs.h                 |  3 ++
>  io_uring/opdef.c              |  8 +++++
>  4 files changed, 78 insertions(+)
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 08720c7bd92f..6c0d521135a6 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -65,6 +65,7 @@ struct io_uring_sqe {
>  		__u32		xattr_flags;
>  		__u32		msg_ring_flags;
>  		__u32		uring_cmd_flags;
> +		__u32		getdents_flags;
>  	};
>  	__u64	user_data;	/* data to be passed back at completion time */
>  	/* pack this to avoid bogus arm OABI complaints */
> @@ -235,6 +236,7 @@ enum io_uring_op {
>  	IORING_OP_URING_CMD,
>  	IORING_OP_SEND_ZC,
>  	IORING_OP_SENDMSG_ZC,
> +	IORING_OP_GETDENTS,
>  
>  	/* this goes last, obviously */
>  	IORING_OP_LAST,
> @@ -273,6 +275,11 @@ enum io_uring_op {
>   */
>  #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
>  
> +/*
> + * sqe->getdents_flags
> + */
> +#define IORING_GETDENTS_REWIND	(1U << 0)
> +
>  /*
>   * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
>   * command flags for POLL_ADD are stored in sqe->len.
> diff --git a/io_uring/fs.c b/io_uring/fs.c
> index f6a69a549fd4..77f00577e09c 100644
> --- a/io_uring/fs.c
> +++ b/io_uring/fs.c
> @@ -47,6 +47,13 @@ struct io_link {
>  	int				flags;
>  };
>  
> +struct io_getdents {
> +	struct file			*file;
> +	struct linux_dirent64 __user	*dirent;
> +	unsigned int			count;
> +	int				flags;
> +};
> +
>  int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  {
>  	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
> @@ -291,3 +298,56 @@ void io_link_cleanup(struct io_kiocb *req)
>  	putname(sl->oldpath);
>  	putname(sl->newpath);
>  }
> +
> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> +
> +	if (READ_ONCE(sqe->off) != 0)
> +		return -EINVAL;
> +
> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
> +	gd->count = READ_ONCE(sqe->len);
> +
> +	return 0;
> +}
> +
> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> +{
> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> +	struct file *file;
> +	unsigned long getdents_flags = 0;
> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
> +	bool should_lock = false;
> +	int ret;
> +
> +	if (force_nonblock) {
> +		if (!(req->file->f_mode & FMODE_NOWAIT))
> +			return -EAGAIN;
> +
> +		getdents_flags = DIR_CONTEXT_F_NOWAIT;

I mentioned this on the other patch but it seems really pointless to
have that extra flag. I would really like to hear a good reason for
this.

> +	}
> +
> +	file = req->file;
> +	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
> +		if (file_count(file) > 1)

Assume we have a regular non-threaded process that just opens an fd to a
file. The process registers an async readdir request via that fd for the
file with io_uring and goes to do other stuff while waiting for the
result.

Some time later, io_uring gets to io_getdents() and the task is still
single threaded and the file hasn't been shared in the meantime. So
io_getdents() doesn't take the lock and starts the readdir() call.

Concurrently, the process that registered the io_uring request was free
to do other stuff and issued a synchronous readdir() system call which
calls fdget_pos(). Since the fdtable still isn't shared it doesn't
increment f_count and doesn't acquire the mutex. Now there's another
concurrent readdir() going on.

(Similar thing can happen if the process creates a thread for example.)

Two readdir() requests now proceed concurrently which is not intended.
Now to verify that this race can't happen with io_uring:

* regular fds:
  It seems that io_uring calls fget() on each regular file descriptor
  when an async request is registered. So that means that io_uring
  always hold its own explicit reference here.
  So as long as the original task is alive or another thread is alive
  f_count is guaranteed to be > 1 and so the mutex would always be
  acquired.

  If the registering process dies right before io_uring gets to the
  io_getdents() request no other process can steal the fd anymore and in
  that case the readdir call would not lock. But that's fine.

* fixed fds:
  I don't know the reference counting rules here. io_uring would need to
  ensure that it's impossible for two async readdir requests via a fixed
  fd to race because f_count is == 1.

  Iiuc, if a process registers a file it opened as a fixed file and
  immediately closes the fd afterwards - without anyone else holding a
  reference to that file - and only uses the fixed fd going forward, the
  f_count of that file in io_uring's fixed file table is always 1.

  So one could issue any number of concurrent readdir requests with no
  mutual exclusion. So for fixed files there definitely is a race, no?

All of that could ofc be simplified if we could just always acquire the
mutex in fdget_pos() and other places and drop that file_count(file) > 1
optimization everywhere. But I have no idea if the optimization for not
acquiring the mutex if f_count == 1 is worth it?

I hope I didn't confuse myself here.

Jens, do yo have any input here?

> +			should_lock = true;
> +	}
> +	if (should_lock) {
> +		if (!force_nonblock)
> +			mutex_lock(&file->f_pos_lock);
> +		else if (!mutex_trylock(&file->f_pos_lock))
> +			return -EAGAIN;
> +	}

Open-coding this seems extremely brittle with an invitation for subtle
bugs.

> +
> +	ret = vfs_getdents(file, gd->dirent, gd->count, getdents_flags);
> +	if (should_lock)
> +		mutex_unlock(&file->f_pos_lock);
> +
> +	if (ret == -EAGAIN && force_nonblock)
> +		return -EAGAIN;
> +
> +	io_req_set_res(req, ret, 0);
> +	return 0;
> +}
> +
> diff --git a/io_uring/fs.h b/io_uring/fs.h
> index 0bb5efe3d6bb..f83a6f3a678d 100644
> --- a/io_uring/fs.h
> +++ b/io_uring/fs.h
> @@ -18,3 +18,6 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags);
>  int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
>  int io_linkat(struct io_kiocb *req, unsigned int issue_flags);
>  void io_link_cleanup(struct io_kiocb *req);
> +
> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags);
> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
> index 3b9c6489b8b6..1bae6b2a8d0b 100644
> --- a/io_uring/opdef.c
> +++ b/io_uring/opdef.c
> @@ -428,6 +428,11 @@ const struct io_issue_def io_issue_defs[] = {
>  		.prep			= io_eopnotsupp_prep,
>  #endif
>  	},
> +	[IORING_OP_GETDENTS] = {
> +		.needs_file		= 1,
> +		.prep			= io_getdents_prep,
> +		.issue			= io_getdents,
> +	},
>  };
>  
>  
> @@ -648,6 +653,9 @@ const struct io_cold_def io_cold_defs[] = {
>  		.fail			= io_sendrecv_fail,
>  #endif
>  	},
> +	[IORING_OP_GETDENTS] = {
> +		.name			= "GETDENTS",
> +	},
>  };
>  
>  const char *io_uring_get_opcode(u8 opcode)
> -- 
> 2.25.1
>
Dominique Martinet July 12, 2023, 4:10 p.m. UTC | #5
Hao Xu wrote on Wed, Jul 12, 2023 at 03:53:24PM +0800:
> > > +		if (file_count(file) > 1)
> > 
> > I was curious about this so I found it's basically what __fdget_pos does
> > before deciding it should take the f_pos_lock, and as such this is
> > probably correct... But if someone can chime in here: what guarantees
> > someone else won't __fdget_pos (or equivalent through this) the file
> > again between this and the vfs_getdents call?
> > That second get would make file_count > 1 and it would lock, but lock
> > hadn't been taken here so the other call could get the lock without
> > waiting and both would process getdents or seek or whatever in
> > parallel.
> > 
> 
> This file_count(file) is atomic_read, so I believe no race condition here.

I don't see how that helps in the presence of another thread getting the
lock after we possibly issued a getdents without the lock, e.g.

t1 call io_uring getdents here
t1 sees file_count(file) == 1 and skips getting lock
t1 starts issuing vfs_getdents [... processing]
t2 calls either io_uring getdents or getdents64 syscall
t2 gets the lock, since it wasn't taken by t1 it can be obtained
t2 issues another vfs_getdents

Christian raised the same issue so I'll leave this to his part of the
thread for reply, but I hope that clarified my concern.


-----

BTW I forgot to point out: this dropped the REWIND bit from my patch; I
believe some form of "seek" is necessary for real applications to make
use of this (for example, a web server could keep the fd open in a LRU
and keep issuing readdir over and over again everytime it gets an
indexing request); not having rewind means it'd need to close and
re-open the fd everytime which doesn't seem optimal.

A previous iteration discussed that real seek is difficult and not
necessarily needed, so I settled for rewind, but was there a reason you
decided to stop handling that?

My very egoistical personal use case won't require it, so I can just say
I don't care here, but it would be nice to have a reason explained at
some point
Hao Xu July 13, 2023, 4:05 a.m. UTC | #6
On 7/13/23 00:10, Dominique Martinet wrote:
> Hao Xu wrote on Wed, Jul 12, 2023 at 03:53:24PM +0800:
>>>> +		if (file_count(file) > 1)
>>> I was curious about this so I found it's basically what __fdget_pos does
>>> before deciding it should take the f_pos_lock, and as such this is
>>> probably correct... But if someone can chime in here: what guarantees
>>> someone else won't __fdget_pos (or equivalent through this) the file
>>> again between this and the vfs_getdents call?
>>> That second get would make file_count > 1 and it would lock, but lock
>>> hadn't been taken here so the other call could get the lock without
>>> waiting and both would process getdents or seek or whatever in
>>> parallel.
>>>
>> This file_count(file) is atomic_read, so I believe no race condition here.
> I don't see how that helps in the presence of another thread getting the
> lock after we possibly issued a getdents without the lock, e.g.
>
> t1 call io_uring getdents here
> t1 sees file_count(file) == 1 and skips getting lock
> t1 starts issuing vfs_getdents [... processing]
> t2 calls either io_uring getdents or getdents64 syscall
> t2 gets the lock, since it wasn't taken by t1 it can be obtained
> t2 issues another vfs_getdents
>
> Christian raised the same issue so I'll leave this to his part of the
> thread for reply, but I hope that clarified my concern.


Hi Dominique,

Ah, I misunderstood your question, sorry. The thing is f_count is 
init-ed to be 1,

and normal uring requests do fdget first, so I think it's ok for normal 
requests.

What Christian points out is issue with fixed file, that is indeed a 
problem I think.


>
> -----
>
> BTW I forgot to point out: this dropped the REWIND bit from my patch; I
> believe some form of "seek" is necessary for real applications to make
> use of this (for example, a web server could keep the fd open in a LRU
> and keep issuing readdir over and over again everytime it gets an
> indexing request); not having rewind means it'd need to close and
> re-open the fd everytime which doesn't seem optimal.
>
> A previous iteration discussed that real seek is difficult and not
> necessarily needed to I settled for rewind, but was there a reason you
> decided to stop handling that?
>
> My very egoistical personal use case won't require it, so I can just say
> I don't care here, but it would be nice to have a reason explained at
> some point


Yes, like Al pointed out, getdents with an offset is not the right way 
to do it,

so a way to do seek is a must. But like what I said in the cover-letter, 
I do think the right thing is to

import lseek/llseek into io_uring, not increase the complexity of getdents.


Thanks,

Hao
Hao Xu July 13, 2023, 4:35 a.m. UTC | #7
On 7/12/23 23:27, Christian Brauner wrote:
> On Tue, Jul 11, 2023 at 07:40:27PM +0800, Hao Xu wrote:
>> From: Hao Xu <howeyxu@tencent.com>
>>
>> This add support for getdents64 to io_uring, acting exactly like the
>> syscall: the directory is iterated from it's current's position as
>> stored in the file struct, and the file's position is updated exactly as
>> if getdents64 had been called.
>>
>> For filesystems that support NOWAIT in iterate_shared(), try to use it
>> first; if a user already knows the filesystem they use do not support
>> nowait they can force async through IOSQE_ASYNC in the sqe flags,
>> avoiding the need to bounce back through a useless EAGAIN return.
>>
>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
>> Signed-off-by: Hao Xu <howeyxu@tencent.com>
>> ---
>>   include/uapi/linux/io_uring.h |  7 ++++
>>   io_uring/fs.c                 | 60 +++++++++++++++++++++++++++++++++++
>>   io_uring/fs.h                 |  3 ++
>>   io_uring/opdef.c              |  8 +++++
>>   4 files changed, 78 insertions(+)
>>
>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>> index 08720c7bd92f..6c0d521135a6 100644
>> --- a/include/uapi/linux/io_uring.h
>> +++ b/include/uapi/linux/io_uring.h
>> @@ -65,6 +65,7 @@ struct io_uring_sqe {
>>   		__u32		xattr_flags;
>>   		__u32		msg_ring_flags;
>>   		__u32		uring_cmd_flags;
>> +		__u32		getdents_flags;
>>   	};
>>   	__u64	user_data;	/* data to be passed back at completion time */
>>   	/* pack this to avoid bogus arm OABI complaints */
>> @@ -235,6 +236,7 @@ enum io_uring_op {
>>   	IORING_OP_URING_CMD,
>>   	IORING_OP_SEND_ZC,
>>   	IORING_OP_SENDMSG_ZC,
>> +	IORING_OP_GETDENTS,
>>   
>>   	/* this goes last, obviously */
>>   	IORING_OP_LAST,
>> @@ -273,6 +275,11 @@ enum io_uring_op {
>>    */
>>   #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
>>   
>> +/*
>> + * sqe->getdents_flags
>> + */
>> +#define IORING_GETDENTS_REWIND	(1U << 0)
>> +
>>   /*
>>    * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
>>    * command flags for POLL_ADD are stored in sqe->len.
>> diff --git a/io_uring/fs.c b/io_uring/fs.c
>> index f6a69a549fd4..77f00577e09c 100644
>> --- a/io_uring/fs.c
>> +++ b/io_uring/fs.c
>> @@ -47,6 +47,13 @@ struct io_link {
>>   	int				flags;
>>   };
>>   
>> +struct io_getdents {
>> +	struct file			*file;
>> +	struct linux_dirent64 __user	*dirent;
>> +	unsigned int			count;
>> +	int				flags;
>> +};
>> +
>>   int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>   {
>>   	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
>> @@ -291,3 +298,56 @@ void io_link_cleanup(struct io_kiocb *req)
>>   	putname(sl->oldpath);
>>   	putname(sl->newpath);
>>   }
>> +
>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>> +
>> +	if (READ_ONCE(sqe->off) != 0)
>> +		return -EINVAL;
>> +
>> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
>> +	gd->count = READ_ONCE(sqe->len);
>> +
>> +	return 0;
>> +}
>> +
>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>> +{
>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>> +	struct file *file;
>> +	unsigned long getdents_flags = 0;
>> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>> +	bool should_lock = false;
>> +	int ret;
>> +
>> +	if (force_nonblock) {
>> +		if (!(req->file->f_mode & FMODE_NOWAIT))
>> +			return -EAGAIN;
>> +
>> +		getdents_flags = DIR_CONTEXT_F_NOWAIT;
> 
> I mentioned this on the other patch but it seems really pointless to
> have that extra flag. I would really like to hear a good reason for
> this.
> 
>> +	}
>> +
>> +	file = req->file;
>> +	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
>> +		if (file_count(file) > 1)
> 
> Assume we have a regular non-threaded process that just opens an fd to a
> file. The process registers an async readdir request via that fd for the
> file with io_uring and goes to do other stuff while waiting for the
> result.
> 
> Some time later, io_uring gets to io_getdents() and the task is still
> single threaded and the file hasn't been shared in the meantime. So
> io_getdents() doesn't take the lock and starts the readdir() call.
> 
> Concurrently, the process that registered the io_uring request was free
> to do other stuff and issued a synchronous readdir() system call which
> calls fdget_pos(). Since the fdtable still isn't shared it doesn't
> increment f_count and doesn't acquire the mutex. Now there's another
> concurrent readdir() going on.
> 
> (Similar thing can happen if the process creates a thread for example.)
> 
> Two readdir() requests now proceed concurrently which is not intended.
> Now to verify that this race can't happen with io_uring:
> 
> * regular fds:
>    It seems that io_uring calls fget() on each regular file descriptor
>    when an async request is registered. So that means that io_uring
>    always hold its own explicit reference here.
>    So as long as the original task is alive or another thread is alive
>    f_count is guaranteed to be > 1 and so the mutex would always be
>    acquired.
> 
>    If the registering process dies right before io_uring gets to the
>    io_getdents() request no other process can steal the fd anymore and in
>    that case the readdir call would not lock. But that's fine.
> 
> * fixed fds:
>    I don't know the reference counting rules here. io_uring would need to
>    ensure that it's impossible for two async readdir requests via a fixed
>    fd to race because f_count is == 1.
> 
>    Iiuc, if a process registers a file it opened as a fixed file and
>    immediately closes the fd afterwards - without anyone else holding a
>    reference to that file - and only uses the fixed fd going forward, the
>    f_count of that file in io_uring's fixed file table is always 1.
> 
>    So one could issue any number of concurrent readdir requests with no
>    mutual exclusion. So for fixed files there definitely is a race, no?

Hi Christian,
The ref logic for fixed file is that it does fdget() when registering
the file, and fdput() when unregistering it. So the ref in between is
always > 1. The fixed file feature is to reduce frequent fdget/fdput,
but it does call them at the register/unregister time.


> 
> All of that could ofc be simplified if we could just always acquire the
> mutex in fdget_pos() and other places and drop that file_count(file) > 1
> optimization everywhere. But I have no idea if the optimization for not
> acquiring the mutex if f_count == 1 is worth it?
> 
> I hope I didn't confuse myself here.
> 
> Jens, do yo have any input here?
> 
>> +			should_lock = true;
>> +	}
>> +	if (should_lock) {
>> +		if (!force_nonblock)
>> +			mutex_lock(&file->f_pos_lock);
>> +		else if (!mutex_trylock(&file->f_pos_lock))
>> +			return -EAGAIN;
>> +	}
> 
> Open-coding this seems extremely brittle with an invitation for subtle
> bugs.

Could you elaborate on this, I'm not sure that I understand it quite
well. Sorry for my poor English.

Thanks,
Hao

> 
>> +
>> +	ret = vfs_getdents(file, gd->dirent, gd->count, getdents_flags);
>> +	if (should_lock)
>> +		mutex_unlock(&file->f_pos_lock);
>> +
>> +	if (ret == -EAGAIN && force_nonblock)
>> +		return -EAGAIN;
>> +
>> +	io_req_set_res(req, ret, 0);
>> +	return 0;
>> +}
>> +
>> diff --git a/io_uring/fs.h b/io_uring/fs.h
>> index 0bb5efe3d6bb..f83a6f3a678d 100644
>> --- a/io_uring/fs.h
>> +++ b/io_uring/fs.h
>> @@ -18,3 +18,6 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags);
>>   int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
>>   int io_linkat(struct io_kiocb *req, unsigned int issue_flags);
>>   void io_link_cleanup(struct io_kiocb *req);
>> +
>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags);
>> diff --git a/io_uring/opdef.c b/io_uring/opdef.c
>> index 3b9c6489b8b6..1bae6b2a8d0b 100644
>> --- a/io_uring/opdef.c
>> +++ b/io_uring/opdef.c
>> @@ -428,6 +428,11 @@ const struct io_issue_def io_issue_defs[] = {
>>   		.prep			= io_eopnotsupp_prep,
>>   #endif
>>   	},
>> +	[IORING_OP_GETDENTS] = {
>> +		.needs_file		= 1,
>> +		.prep			= io_getdents_prep,
>> +		.issue			= io_getdents,
>> +	},
>>   };
>>   
>>   
>> @@ -648,6 +653,9 @@ const struct io_cold_def io_cold_defs[] = {
>>   		.fail			= io_sendrecv_fail,
>>   #endif
>>   	},
>> +	[IORING_OP_GETDENTS] = {
>> +		.name			= "GETDENTS",
>> +	},
>>   };
>>   
>>   const char *io_uring_get_opcode(u8 opcode)
>> -- 
>> 2.25.1
>>
Hao Xu July 13, 2023, 4:40 a.m. UTC | #8
On 7/13/23 12:05, Hao Xu wrote:
> 
> On 7/13/23 00:10, Dominique Martinet wrote:
>> Hao Xu wrote on Wed, Jul 12, 2023 at 03:53:24PM +0800:
>>>>> +        if (file_count(file) > 1)
>>>> I was curious about this so I found it's basically what __fdget_pos 
>>>> does
>>>> before deciding it should take the f_pos_lock, and as such this is
>>>> probably correct... But if someone can chime in here: what guarantees
>>>> someone else won't __fdget_pos (or equivalent through this) the file
>>>> again between this and the vfs_getdents call?
>>>> That second get would make file_count > 1 and it would lock, but lock
>>>> hadn't been taken here so the other call could get the lock without
>>>> waiting and both would process getdents or seek or whatever in
>>>> parallel.
>>>>
>>> This file_count(file) is atomic_read, so I believe no race condition 
>>> here.
>> I don't see how that helps in the presence of another thread getting the
>> lock after we possibly issued a getdents without the lock, e.g.
>>
>> t1 call io_uring getdents here
>> t1 sees file_count(file) == 1 and skips getting lock
>> t1 starts issuing vfs_getdents [... processing]
>> t2 calls either io_uring getdents or getdents64 syscall
>> t2 gets the lock, since it wasn't taken by t1 it can be obtained
>> t2 issues another vfs_getdents
>>
>> Christian raised the same issue so I'll leave this to his part of the
>> thread for reply, but I hope that clarified my concern.
> 
> 
> Hi Dominique,
> 
> Ah, I misunderstood your question, sorry. The thing is f_count is 
> init-ed to be 1,
> 
> and normal uring requests do fdget first, so I think it's ok for normal 
> requests.
> 
> What Christian points out is issue with fixed file, that is indeed a 
> problem I think.

After rethinking it, I think there is no race in the fixed-file case
either, because the f_count is always > 1


> 
> 
>>
>> -----
>>
>> BTW I forgot to point out: this dropped the REWIND bit from my patch; I
>> believe some form of "seek" is necessary for real applications to make
>> use of this (for example, a web server could keep the fd open in a LRU
>> and keep issuing readdir over and over again everytime it gets an
>> indexing request); not having rewind means it'd need to close and
>> re-open the fd everytime which doesn't seem optimal.
>>
>> A previous iteration discussed that real seek is difficult and not
>> necessarily needed to I settled for rewind, but was there a reason you
>> decided to stop handling that?
>>
>> My very egoistical personal use case won't require it, so I can just say
>> I don't care here, but it would be nice to have a reason explained at
>> some point
> 
> 
> Yes, like Al pointed out, getdents with an offset is not the right way 
> to do it,
> 
> So a way to do seek is a must. But like what I said in the cover-letter, 
> I do think the right thing is to
> 
> import lseek/llseek to io_uring, not increment the complex of getdents.
> 
> 
> Thanks,
> 
> Hao
> 
>
Christian Brauner July 13, 2023, 7:10 a.m. UTC | #9
On Thu, Jul 13, 2023 at 12:35:07PM +0800, Hao Xu wrote:
> On 7/12/23 23:27, Christian Brauner wrote:
> > On Tue, Jul 11, 2023 at 07:40:27PM +0800, Hao Xu wrote:
> > > From: Hao Xu <howeyxu@tencent.com>
> > > 
> > > This add support for getdents64 to io_uring, acting exactly like the
> > > syscall: the directory is iterated from it's current's position as
> > > stored in the file struct, and the file's position is updated exactly as
> > > if getdents64 had been called.
> > > 
> > > For filesystems that support NOWAIT in iterate_shared(), try to use it
> > > first; if a user already knows the filesystem they use do not support
> > > nowait they can force async through IOSQE_ASYNC in the sqe flags,
> > > avoiding the need to bounce back through a useless EAGAIN return.
> > > 
> > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
> > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> > > Signed-off-by: Hao Xu <howeyxu@tencent.com>
> > > ---
> > >   include/uapi/linux/io_uring.h |  7 ++++
> > >   io_uring/fs.c                 | 60 +++++++++++++++++++++++++++++++++++
> > >   io_uring/fs.h                 |  3 ++
> > >   io_uring/opdef.c              |  8 +++++
> > >   4 files changed, 78 insertions(+)
> > > 
> > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> > > index 08720c7bd92f..6c0d521135a6 100644
> > > --- a/include/uapi/linux/io_uring.h
> > > +++ b/include/uapi/linux/io_uring.h
> > > @@ -65,6 +65,7 @@ struct io_uring_sqe {
> > >   		__u32		xattr_flags;
> > >   		__u32		msg_ring_flags;
> > >   		__u32		uring_cmd_flags;
> > > +		__u32		getdents_flags;
> > >   	};
> > >   	__u64	user_data;	/* data to be passed back at completion time */
> > >   	/* pack this to avoid bogus arm OABI complaints */
> > > @@ -235,6 +236,7 @@ enum io_uring_op {
> > >   	IORING_OP_URING_CMD,
> > >   	IORING_OP_SEND_ZC,
> > >   	IORING_OP_SENDMSG_ZC,
> > > +	IORING_OP_GETDENTS,
> > >   	/* this goes last, obviously */
> > >   	IORING_OP_LAST,
> > > @@ -273,6 +275,11 @@ enum io_uring_op {
> > >    */
> > >   #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
> > > +/*
> > > + * sqe->getdents_flags
> > > + */
> > > +#define IORING_GETDENTS_REWIND	(1U << 0)
> > > +
> > >   /*
> > >    * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
> > >    * command flags for POLL_ADD are stored in sqe->len.
> > > diff --git a/io_uring/fs.c b/io_uring/fs.c
> > > index f6a69a549fd4..77f00577e09c 100644
> > > --- a/io_uring/fs.c
> > > +++ b/io_uring/fs.c
> > > @@ -47,6 +47,13 @@ struct io_link {
> > >   	int				flags;
> > >   };
> > > +struct io_getdents {
> > > +	struct file			*file;
> > > +	struct linux_dirent64 __user	*dirent;
> > > +	unsigned int			count;
> > > +	int				flags;
> > > +};
> > > +
> > >   int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> > >   {
> > >   	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
> > > @@ -291,3 +298,56 @@ void io_link_cleanup(struct io_kiocb *req)
> > >   	putname(sl->oldpath);
> > >   	putname(sl->newpath);
> > >   }
> > > +
> > > +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> > > +{
> > > +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> > > +
> > > +	if (READ_ONCE(sqe->off) != 0)
> > > +		return -EINVAL;
> > > +
> > > +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
> > > +	gd->count = READ_ONCE(sqe->len);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> > > +{
> > > +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> > > +	struct file *file;
> > > +	unsigned long getdents_flags = 0;
> > > +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
> > > +	bool should_lock = false;
> > > +	int ret;
> > > +
> > > +	if (force_nonblock) {
> > > +		if (!(req->file->f_mode & FMODE_NOWAIT))
> > > +			return -EAGAIN;
> > > +
> > > +		getdents_flags = DIR_CONTEXT_F_NOWAIT;
> > 
> > I mentioned this on the other patch but it seems really pointless to
> > have that extra flag. I would really like to hear a good reason for
> > this.
> > 
> > > +	}
> > > +
> > > +	file = req->file;
> > > +	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
> > > +		if (file_count(file) > 1)
> > 
> > Assume we have a regular non-threaded process that just opens an fd to a
> > file. The process registers an async readdir request via that fd for the
> > file with io_uring and goes to do other stuff while waiting for the
> > result.
> > 
> > Some time later, io_uring gets to io_getdents() and the task is still
> > single threaded and the file hasn't been shared in the meantime. So
> > io_getdents() doesn't take the lock and starts the readdir() call.
> > 
> > Concurrently, the process that registered the io_uring request was free
> > to do other stuff and issued a synchronous readdir() system call which
> > calls fdget_pos(). Since the fdtable still isn't shared it doesn't
> > increment f_count and doesn't acquire the mutex. Now there's another
> > concurrent readdir() going on.
> > 
> > (Similar thing can happen if the process creates a thread for example.)
> > 
> > Two readdir() requests now proceed concurrently which is not intended.
> > Now to verify that this race can't happen with io_uring:
> > 
> > * regular fds:
> >    It seems that io_uring calls fget() on each regular file descriptor
> >    when an async request is registered. So that means that io_uring
> >    always hold its own explicit reference here.
> >    So as long as the original task is alive or another thread is alive
> >    f_count is guaranteed to be > 1 and so the mutex would always be
> >    acquired.
> > 
> >    If the registering process dies right before io_uring gets to the
> >    io_getdents() request no other process can steal the fd anymore and in
> >    that case the readdir call would not lock. But that's fine.
> > 
> > * fixed fds:
> >    I don't know the reference counting rules here. io_uring would need to
> >    ensure that it's impossible for two async readdir requests via a fixed
> >    fd to race because f_count is == 1.
> > 
> >    Iiuc, if a process registers a file it opened as a fixed file and
> >    immediately closes the fd afterwards - without anyone else holding a
> >    reference to that file - and only uses the fixed fd going forward, the
> >    f_count of that file in io_uring's fixed file table is always 1.
> > 
> >    So one could issue any number of concurrent readdir requests with no
> >    mutual exclusion. So for fixed files there definitely is a race, no?
> 
> Hi Christian,
> The ref logic for fixed file is that it does fdget() when registering

It absolutely can't be the case that io_uring uses fdget()/fdput() for
long-term file references. fdget() internally uses __fget_light(), which
avoids taking a reference on the file if the file table isn't shared. So
should that file be stashed anywhere for async work, it's a UAF waiting to
happen.

> the file, and fdput() when unregistering it. So the ref in between is
> always > 1. The fixed file feature is to reduce frequent fdget/fdput,
> but it does call them at the register/unregister time.

So consider:

// Caller opens some file.
fd_register = open("/some/file", ...); // f_count == 1

// Caller registers that file as a fixed file
IORING_REGISTER_FILES
-> io_sqe_files_register()
   -> fget(fd_register) // f_count == 2
   -> io_fixed_file_set()

// Caller trades regular fd reference for fixed file reference completely.
close(fd_register);
-> close_fd(fd_register)
   -> file = pick_file()
   -> filp_close(file)
      -> fput(file)    // f_count == 1


// Caller spawns a second thread. Both threads issue async getdents via
// fixed file.
T1                                              T2
IORING_OP_GETDENTS                              IORING_OP_GETDENTS

// At some point io_assign_file() must be called which has:

          if (req->flags & REQ_F_FIXED_FILE)
                  req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
          else
                  req->file = io_file_get_normal(req, req->cqe.fd);

// Since this is REQ_F_FIXED_FILE f_count == 1

if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
        if (file_count(file) > 1)

// No lock is taken; T1 and T2 issue getdents concurrently without any
// locking. -> race on f_pos

I'm happy to be convinced that this is safe, but please someone explain
in detail why this can't happen and where that extra f_count reference
for fixed files that this code wants to rely on is coming from.

Afaik, the whole point is that fixed files don't ever call fget()/fput()
after having been registered anymore. Consequently, f_count should be 1
once io_uring has taken full ownership of the file and the file can only
be addressed via a fixed file reference.

> 
> 
> > 
> > All of that could ofc be simplified if we could just always acquire the
> > mutex in fdget_pos() and other places and drop that file_count(file) > 1
> > optimization everywhere. But I have no idea if the optimization for not
> > acquiring the mutex if f_count == 1 is worth it?
> > 
> > I hope I didn't confuse myself here.
> > 
> > Jens, do yo have any input here?
> > 
> > > +			should_lock = true;
> > > +	}
> > > +	if (should_lock) {
> > > +		if (!force_nonblock)
> > > +			mutex_lock(&file->f_pos_lock);
> > > +		else if (!mutex_trylock(&file->f_pos_lock))
> > > +			return -EAGAIN;
> > > +	}
> > 
> > Open-coding this seems extremely brittle with an invitation for subtle
> > bugs.
> 
> Could you elaborate on this, I'm not sure that I understand it quite
> well. Sorry for my poor English.

No need to apologize. I'm wondering whether this should be moved into a
tiny helper and actually be exposed via a vfs header if we go this
route is all.
Hao Xu July 13, 2023, 9:06 a.m. UTC | #10
Hi Christian,

On 7/13/23 15:10, Christian Brauner wrote:
> On Thu, Jul 13, 2023 at 12:35:07PM +0800, Hao Xu wrote:
>> On 7/12/23 23:27, Christian Brauner wrote:
>>> On Tue, Jul 11, 2023 at 07:40:27PM +0800, Hao Xu wrote:
>>>> From: Hao Xu <howeyxu@tencent.com>
>>>>
>>>> This add support for getdents64 to io_uring, acting exactly like the
>>>> syscall: the directory is iterated from it's current's position as
>>>> stored in the file struct, and the file's position is updated exactly as
>>>> if getdents64 had been called.
>>>>
>>>> For filesystems that support NOWAIT in iterate_shared(), try to use it
>>>> first; if a user already knows the filesystem they use do not support
>>>> nowait they can force async through IOSQE_ASYNC in the sqe flags,
>>>> avoiding the need to bounce back through a useless EAGAIN return.
>>>>
>>>> Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
>>>> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
>>>> Signed-off-by: Hao Xu <howeyxu@tencent.com>
>>>> ---
>>>>    include/uapi/linux/io_uring.h |  7 ++++
>>>>    io_uring/fs.c                 | 60 +++++++++++++++++++++++++++++++++++
>>>>    io_uring/fs.h                 |  3 ++
>>>>    io_uring/opdef.c              |  8 +++++
>>>>    4 files changed, 78 insertions(+)
>>>>
>>>> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
>>>> index 08720c7bd92f..6c0d521135a6 100644
>>>> --- a/include/uapi/linux/io_uring.h
>>>> +++ b/include/uapi/linux/io_uring.h
>>>> @@ -65,6 +65,7 @@ struct io_uring_sqe {
>>>>    		__u32		xattr_flags;
>>>>    		__u32		msg_ring_flags;
>>>>    		__u32		uring_cmd_flags;
>>>> +		__u32		getdents_flags;
>>>>    	};
>>>>    	__u64	user_data;	/* data to be passed back at completion time */
>>>>    	/* pack this to avoid bogus arm OABI complaints */
>>>> @@ -235,6 +236,7 @@ enum io_uring_op {
>>>>    	IORING_OP_URING_CMD,
>>>>    	IORING_OP_SEND_ZC,
>>>>    	IORING_OP_SENDMSG_ZC,
>>>> +	IORING_OP_GETDENTS,
>>>>    	/* this goes last, obviously */
>>>>    	IORING_OP_LAST,
>>>> @@ -273,6 +275,11 @@ enum io_uring_op {
>>>>     */
>>>>    #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
>>>> +/*
>>>> + * sqe->getdents_flags
>>>> + */
>>>> +#define IORING_GETDENTS_REWIND	(1U << 0)
>>>> +
>>>>    /*
>>>>     * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
>>>>     * command flags for POLL_ADD are stored in sqe->len.
>>>> diff --git a/io_uring/fs.c b/io_uring/fs.c
>>>> index f6a69a549fd4..77f00577e09c 100644
>>>> --- a/io_uring/fs.c
>>>> +++ b/io_uring/fs.c
>>>> @@ -47,6 +47,13 @@ struct io_link {
>>>>    	int				flags;
>>>>    };
>>>> +struct io_getdents {
>>>> +	struct file			*file;
>>>> +	struct linux_dirent64 __user	*dirent;
>>>> +	unsigned int			count;
>>>> +	int				flags;
>>>> +};
>>>> +
>>>>    int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>>    {
>>>>    	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
>>>> @@ -291,3 +298,56 @@ void io_link_cleanup(struct io_kiocb *req)
>>>>    	putname(sl->oldpath);
>>>>    	putname(sl->newpath);
>>>>    }
>>>> +
>>>> +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>> +{
>>>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>>>> +
>>>> +	if (READ_ONCE(sqe->off) != 0)
>>>> +		return -EINVAL;
>>>> +
>>>> +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
>>>> +	gd->count = READ_ONCE(sqe->len);
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
>>>> +{
>>>> +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
>>>> +	struct file *file;
>>>> +	unsigned long getdents_flags = 0;
>>>> +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
>>>> +	bool should_lock = false;
>>>> +	int ret;
>>>> +
>>>> +	if (force_nonblock) {
>>>> +		if (!(req->file->f_mode & FMODE_NOWAIT))
>>>> +			return -EAGAIN;
>>>> +
>>>> +		getdents_flags = DIR_CONTEXT_F_NOWAIT;
>>>
>>> I mentioned this on the other patch but it seems really pointless to
>>> have that extra flag. I would really like to hear a good reason for
>>> this.
>>>
>>>> +	}
>>>> +
>>>> +	file = req->file;
>>>> +	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
>>>> +		if (file_count(file) > 1)
>>>
>>> Assume we have a regular non-threaded process that just opens an fd to a
>>> file. The process registers an async readdir request via that fd for the
>>> file with io_uring and goes to do other stuff while waiting for the
>>> result.
>>>
>>> Some time later, io_uring gets to io_getdents() and the task is still
>>> single threaded and the file hasn't been shared in the meantime. So
>>> io_getdents() doesn't take the lock and starts the readdir() call.
>>>
>>> Concurrently, the process that registered the io_uring request was free
>>> to do other stuff and issued a synchronous readdir() system call which
>>> calls fdget_pos(). Since the fdtable still isn't shared it doesn't
>>> increment f_count and doesn't acquire the mutex. Now there's another
>>> concurrent readdir() going on.
>>>
>>> (Similar thing can happen if the process creates a thread for example.)
>>>
>>> Two readdir() requests now proceed concurrently which is not intended.
>>> Now to verify that this race can't happen with io_uring:
>>>
>>> * regular fds:
>>>     It seems that io_uring calls fget() on each regular file descriptor
>>>     when an async request is registered. So that means that io_uring
>>>     always hold its own explicit reference here.
>>>     So as long as the original task is alive or another thread is alive
>>>     f_count is guaranteed to be > 1 and so the mutex would always be
>>>     acquired.
>>>
>>>     If the registering process dies right before io_uring gets to the
>>>     io_getdents() request no other process can steal the fd anymore and in
>>>     that case the readdir call would not lock. But that's fine.
>>>
>>> * fixed fds:
>>>     I don't know the reference counting rules here. io_uring would need to
>>>     ensure that it's impossible for two async readdir requests via a fixed
>>>     fd to race because f_count is == 1.
>>>
>>>     Iiuc, if a process registers a file it opened as a fixed file and
>>>     immediately closes the fd afterwards - without anyone else holding a
>>>     reference to that file - and only uses the fixed fd going forward, the
>>>     f_count of that file in io_uring's fixed file table is always 1.
>>>
>>>     So one could issue any number of concurrent readdir requests with no
>>>     mutual exclusion. So for fixed files there definitely is a race, no?
>>
>> Hi Christian,
>> The ref logic for fixed file is that it does fdget() when registering
> 
> It absolutely can't be the case that io_uring uses fdget()/fdput() for
> long-term file references. fdget() internally use __fget_light() which
> avoids taking a reference on the file if the file table isn't shared. So
> should that file be stashed anywhere for async work its a UAF waiting to
> happen.
> 

Yes, I typed the wrong name, should be fget() not fdget().

>> the file, and fdput() when unregistering it. So the ref in between is
>> always > 1. The fixed file feature is to reduce frequent fdget/fdput,
>> but it does call them at the register/unregister time.
> 
> So consider:
> 
> // Caller opens some file.
> fd_register = open("/some/file", ...); // f_count == 1
> 
> // Caller registers that file as a fixed file
> IORING_REGISTER_FILES
> -> io_sqe_files_register()
>     -> fget(fd_register) // f_count == 2
>     -> io_fixed_file_set()
> 
> // Caller trades regular fd reference for fixed file reference completely.
> close(fd_register);
> -> close_fd(fd_register)
>     -> file = pick_file()
>     -> filp_close(file)
>        -> fput(file)    // f_count == 1
> 
> 
> // Caller spawns a second thread. Both treads issue async getdents via
> // fixed file.
> T1                                              T2
> IORING_OP_GETDENTS                              IORING_OP_GETDENTS
> 
> // At some point io_assign_file() must be called which has:
> 
>            if (req->flags & REQ_F_FIXED_FILE)
>                    req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
>            else
>                    req->file = io_file_get_normal(req, req->cqe.fd);
> 
> // Since this is REQ_F_FIXED_FILE f_count == 1
> 
> if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
>          if (file_count(file) > 1)
> 
> // No lock is taken; T1 and T2 issue getdents concurrently without any
> // locking. -> race on f_pos
> 
> I'm happy to be convinced that this is safe, but please someone explain
> in detail why this can't happen and where that extra f_count reference
> for fixed files that this code wants to rely on is coming from.
> 
> Afaik, the whole point is that fixed files don't ever call fget()/fput()
> after having been registered anymore. Consequently, f_count should be 1
> once io_uring has taken full ownership of the file and the file can only
> be addressed via a fixed file reference.

Thanks for explanation, I now realize it's an issue, even for non-fixed 
files when io_uring takes full ownership. for example:

io_uring submit a getdents          --> f_count == 2, get the lock
nowait submission fails             --> f_count == 2, release the lock
punt it to io-wq thread and return to userspace
close(fd)                           --> f_count == 1
call sync getdents64                --> doing getdents without lock
the io-wq thread begins to run      --> f_count == 1, doing getdents
                                         without lock.

Though this looks like a silly use case, users can do it anyway.

How about removing this small f_count > 1 optimization in io_uring and
always taking the lock? It looks like it causes big trouble for the async
situation, and there may often be parallel io_uring getdents at the
same time for a file [1], so it may not be very meaningful to do this
file count optimization.

[1] I believe users will issue multiple async getdents at the same time
rather than issuing them one by one, to get better performance.

Thanks,
Hao

> 
>>
>>
>>>
>>> All of that could ofc be simplified if we could just always acquire the
>>> mutex in fdget_pos() and other places and drop that file_count(file) > 1
>>> optimization everywhere. But I have no idea if the optimization for not
>>> acquiring the mutex if f_count == 1 is worth it?
>>>
>>> I hope I didn't confuse myself here.
>>>
>>> Jens, do yo have any input here?
>>>
>>>> +			should_lock = true;
>>>> +	}
>>>> +	if (should_lock) {
>>>> +		if (!force_nonblock)
>>>> +			mutex_lock(&file->f_pos_lock);
>>>> +		else if (!mutex_trylock(&file->f_pos_lock))
>>>> +			return -EAGAIN;
>>>> +	}
>>>
>>> Open-coding this seems extremely brittle with an invitation for subtle
>>> bugs.
>>
>> Could you elaborate on this, I'm not sure that I understand it quite
>> well. Sorry for my poor English.
> 
> No need to apologize. I'm wondering whether this should be moved into a
> tiny helper and actually be exposed via a vfs header if we go this
> route is all.
Christian Brauner July 13, 2023, 3:14 p.m. UTC | #11
On Thu, Jul 13, 2023 at 05:06:32PM +0800, Hao Xu wrote:
> Hi Christian,
> 
> On 7/13/23 15:10, Christian Brauner wrote:
> > On Thu, Jul 13, 2023 at 12:35:07PM +0800, Hao Xu wrote:
> > > On 7/12/23 23:27, Christian Brauner wrote:
> > > > On Tue, Jul 11, 2023 at 07:40:27PM +0800, Hao Xu wrote:
> > > > > From: Hao Xu <howeyxu@tencent.com>
> > > > > 
> > > > > This add support for getdents64 to io_uring, acting exactly like the
> > > > > syscall: the directory is iterated from it's current's position as
> > > > > stored in the file struct, and the file's position is updated exactly as
> > > > > if getdents64 had been called.
> > > > > 
> > > > > For filesystems that support NOWAIT in iterate_shared(), try to use it
> > > > > first; if a user already knows the filesystem they use do not support
> > > > > nowait they can force async through IOSQE_ASYNC in the sqe flags,
> > > > > avoiding the need to bounce back through a useless EAGAIN return.
> > > > > 
> > > > > Co-developed-by: Dominique Martinet <asmadeus@codewreck.org>
> > > > > Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> > > > > Signed-off-by: Hao Xu <howeyxu@tencent.com>
> > > > > ---
> > > > >    include/uapi/linux/io_uring.h |  7 ++++
> > > > >    io_uring/fs.c                 | 60 +++++++++++++++++++++++++++++++++++
> > > > >    io_uring/fs.h                 |  3 ++
> > > > >    io_uring/opdef.c              |  8 +++++
> > > > >    4 files changed, 78 insertions(+)
> > > > > 
> > > > > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> > > > > index 08720c7bd92f..6c0d521135a6 100644
> > > > > --- a/include/uapi/linux/io_uring.h
> > > > > +++ b/include/uapi/linux/io_uring.h
> > > > > @@ -65,6 +65,7 @@ struct io_uring_sqe {
> > > > >    		__u32		xattr_flags;
> > > > >    		__u32		msg_ring_flags;
> > > > >    		__u32		uring_cmd_flags;
> > > > > +		__u32		getdents_flags;
> > > > >    	};
> > > > >    	__u64	user_data;	/* data to be passed back at completion time */
> > > > >    	/* pack this to avoid bogus arm OABI complaints */
> > > > > @@ -235,6 +236,7 @@ enum io_uring_op {
> > > > >    	IORING_OP_URING_CMD,
> > > > >    	IORING_OP_SEND_ZC,
> > > > >    	IORING_OP_SENDMSG_ZC,
> > > > > +	IORING_OP_GETDENTS,
> > > > >    	/* this goes last, obviously */
> > > > >    	IORING_OP_LAST,
> > > > > @@ -273,6 +275,11 @@ enum io_uring_op {
> > > > >     */
> > > > >    #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
> > > > > +/*
> > > > > + * sqe->getdents_flags
> > > > > + */
> > > > > +#define IORING_GETDENTS_REWIND	(1U << 0)
> > > > > +
> > > > >    /*
> > > > >     * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
> > > > >     * command flags for POLL_ADD are stored in sqe->len.
> > > > > diff --git a/io_uring/fs.c b/io_uring/fs.c
> > > > > index f6a69a549fd4..77f00577e09c 100644
> > > > > --- a/io_uring/fs.c
> > > > > +++ b/io_uring/fs.c
> > > > > @@ -47,6 +47,13 @@ struct io_link {
> > > > >    	int				flags;
> > > > >    };
> > > > > +struct io_getdents {
> > > > > +	struct file			*file;
> > > > > +	struct linux_dirent64 __user	*dirent;
> > > > > +	unsigned int			count;
> > > > > +	int				flags;
> > > > > +};
> > > > > +
> > > > >    int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> > > > >    {
> > > > >    	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
> > > > > @@ -291,3 +298,56 @@ void io_link_cleanup(struct io_kiocb *req)
> > > > >    	putname(sl->oldpath);
> > > > >    	putname(sl->newpath);
> > > > >    }
> > > > > +
> > > > > +int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> > > > > +{
> > > > > +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> > > > > +
> > > > > +	if (READ_ONCE(sqe->off) != 0)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
> > > > > +	gd->count = READ_ONCE(sqe->len);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> > > > > +{
> > > > > +	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> > > > > +	struct file *file;
> > > > > +	unsigned long getdents_flags = 0;
> > > > > +	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
> > > > > +	bool should_lock = false;
> > > > > +	int ret;
> > > > > +
> > > > > +	if (force_nonblock) {
> > > > > +		if (!(req->file->f_mode & FMODE_NOWAIT))
> > > > > +			return -EAGAIN;
> > > > > +
> > > > > +		getdents_flags = DIR_CONTEXT_F_NOWAIT;
> > > > 
> > > > I mentioned this on the other patch but it seems really pointless to
> > > > have that extra flag. I would really like to hear a good reason for
> > > > this.
> > > > 
> > > > > +	}
> > > > > +
> > > > > +	file = req->file;
> > > > > +	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
> > > > > +		if (file_count(file) > 1)
> > > > 
> > > > Assume we have a regular non-threaded process that just opens an fd to a
> > > > file. The process registers an async readdir request via that fd for the
> > > > file with io_uring and goes to do other stuff while waiting for the
> > > > result.
> > > > 
> > > > Some time later, io_uring gets to io_getdents() and the task is still
> > > > single threaded and the file hasn't been shared in the meantime. So
> > > > io_getdents() doesn't take the lock and starts the readdir() call.
> > > > 
> > > > Concurrently, the process that registered the io_uring request was free
> > > > to do other stuff and issued a synchronous readdir() system call which
> > > > calls fdget_pos(). Since the fdtable still isn't shared it doesn't
> > > > increment f_count and doesn't acquire the mutex. Now there's another
> > > > concurrent readdir() going on.
> > > > 
> > > > (Similar thing can happen if the process creates a thread for example.)
> > > > 
> > > > Two readdir() requests now proceed concurrently which is not intended.
> > > > Now to verify that this race can't happen with io_uring:
> > > > 
> > > > * regular fds:
> > > >     It seems that io_uring calls fget() on each regular file descriptor
> > > >     when an async request is registered. So that means that io_uring
> > > >     always hold its own explicit reference here.
> > > >     So as long as the original task is alive or another thread is alive
> > > >     f_count is guaranteed to be > 1 and so the mutex would always be
> > > >     acquired.
> > > > 
> > > >     If the registering process dies right before io_uring gets to the
> > > >     io_getdents() request no other process can steal the fd anymore and in
> > > >     that case the readdir call would not lock. But that's fine.
> > > > 
> > > > * fixed fds:
> > > >     I don't know the reference counting rules here. io_uring would need to
> > > >     ensure that it's impossible for two async readdir requests via a fixed
> > > >     fd to race because f_count is == 1.
> > > > 
> > > >     Iiuc, if a process registers a file it opened as a fixed file and
> > > >     immediately closes the fd afterwards - without anyone else holding a
> > > >     reference to that file - and only uses the fixed fd going forward, the
> > > >     f_count of that file in io_uring's fixed file table is always 1.
> > > > 
> > > >     So one could issue any number of concurrent readdir requests with no
> > > >     mutual exclusion. So for fixed files there definitely is a race, no?
> > > 
> > > Hi Christian,
> > > The ref logic for fixed file is that it does fdget() when registering
> > 
> > It absolutely can't be the case that io_uring uses fdget()/fdput() for
> > long-term file references. fdget() internally use __fget_light() which
> > avoids taking a reference on the file if the file table isn't shared. So
> > should that file be stashed anywhere for async work its a UAF waiting to
> > happen.
> > 
> 
> Yes, I typed the wrong name, should be fget() not fdget().
> 
> > > the file, and fdput() when unregistering it. So the ref in between is
> > > always > 1. The fixed file feature is to reduce frequent fdget/fdput,
> > > but it does call them at the register/unregister time.
> > 
> > So consider:
> > 
> > // Caller opens some file.
> > fd_register = open("/some/file", ...); // f_count == 1
> > 
> > // Caller registers that file as a fixed file
> > IORING_REGISTER_FILES
> > -> io_sqe_files_register()
> >     -> fget(fd_register) // f_count == 2
> >     -> io_fixed_file_set()
> > 
> > // Caller trades regular fd reference for fixed file reference completely.
> > close(fd_register);
> > -> close_fd(fd_register)
> >     -> file = pick_file()
> >     -> filp_close(file)
> >        -> fput(file)    // f_count == 1
> > 
> > 
> > // Caller spawns a second thread. Both treads issue async getdents via
> > // fixed file.
> > T1                                              T2
> > IORING_OP_GETDENTS                              IORING_OP_GETDENTS
> > 
> > // At some point io_assign_file() must be called which has:
> > 
> >            if (req->flags & REQ_F_FIXED_FILE)
> >                    req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
> >            else
> >                    req->file = io_file_get_normal(req, req->cqe.fd);
> > 
> > // Since this is REQ_F_FIXED_FILE f_count == 1
> > 
> > if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
> >          if (file_count(file) > 1)
> > 
> > // No lock is taken; T1 and T2 issue getdents concurrently without any
> > // locking. -> race on f_pos
> > 
> > I'm happy to be convinced that this is safe, but please someone explain
> > in detail why this can't happen and where that extra f_count reference
> > for fixed files that this code wants to rely on is coming from.
> > 
> > Afaik, the whole point is that fixed files don't ever call fget()/fput()
> > after having been registered anymore. Consequently, f_count should be 1
> > once io_uring has taken full ownership of the file and the file can only
> > be addressed via a fixed file reference.
> 
> Thanks for explanation, I now realize it's an issue, even for non-fixed
> files when io_uring takes full ownership. for example:
> 
> io_uring submit a getdents          --> f_count == 2, get the lock
> nowait submission fails             --> f_count == 2, release the lock
> punt it to io-wq thread and return to userspace
> close(fd)                           --> f_count == 1
> call sync getdents64                --> doing getdents without lock
> the io-wq thread begins to run      --> f_count == 1, doing getdents
>                                         without lock.
> 
> Though this looks like a silly use case but users can do that anyway.
> 
> How about remove this f_count > 1 small optimization in io_uring and always
> get the lock, looks like it makes big trouble for async
> situation. and there may often be parallel io_uring getdents in the
> same time for a file [1], it may be not very meaningful to do this
> file count optimization.
> 
> [1] I believe users will issue multiple async getdents at same time rather
> than issue them one by one to get better performance.

Could someone with perf experience try and remove that f_count == 1
optimization from __fdget_pos() completely and make it always acquire
the mutex? I wonder what the performance impact of that is.
Hao Xu July 16, 2023, 11:57 a.m. UTC | #12
On 7/13/23 23:14, Christian Brauner wrote:

> Could someone with perf experience try and remove that f_count == 1
> optimization from __fdget_pos() completely and make it always acquire
> the mutex? I wonder what the performance impact of that is.

Hi Christian,
For your reference, I did a simple test: I wrote a C program that opens a
directory which has 1000 empty files, then calls sync getdents64 on it
repeatedly until we get all the entries. I ran this program 10 times for
the "with f_count==1
optimization" and "always do the lock" versions.
Got the data below:
with f_count==1:

time cost: 0.000379 

time cost: 0.000116 

time cost: 0.000090 

time cost: 0.000101 

time cost: 0.000095 

time cost: 0.000092 

time cost: 0.000092 

time cost: 0.000095 

time cost: 0.000092 

time cost: 0.000121 

time cost: 0.000092 
 

time cost avg: 0.00009859999999999998

always do the lock:
time cost: 0.000095 

time cost: 0.000099 

time cost: 0.000123 

time cost: 0.000124 

time cost: 0.000092 

time cost: 0.000099 

time cost: 0.000092 

time cost: 0.000092 

time cost: 0.000093 

time cost: 0.000094 
 
             time cost avg: 0.00010029999999999997

So that is about a 1.724% increase

[1] the first run is not shown here since that run does real IO
     and differs a lot.
[2] the time cost calculation is by gettimeofday()
[3] run it in a VM which has 2 CPUs and 1GB memory.

Regards,
Hao
Hao Xu July 18, 2023, 6:55 a.m. UTC | #13
On 7/16/23 19:57, Hao Xu wrote:
> On 7/13/23 23:14, Christian Brauner wrote:
> 
>> Could someone with perf experience try and remove that f_count == 1
>> optimization from __fdget_pos() completely and make it always acquire
>> the mutex? I wonder what the performance impact of that is.
> 
> Hi Christian,
> For your reference, I did a simple test: writed a c program that open a
> directory which has 1000 empty files, then call sync getdents64 on it
> repeatedly until we get all the entries. I run this program 10 times for
> "with f_count==1
> optimization" and "always do the lock" version.
> Got below data:
> with f_count==1:
> 
> time cost: 0.000379
> time cost: 0.000116
> time cost: 0.000090
> time cost: 0.000101
> time cost: 0.000095
> time cost: 0.000092
> time cost: 0.000092
> time cost: 0.000095
> time cost: 0.000092
> time cost: 0.000121
> time cost: 0.000092
> 
> time cost avg: 0.00009859999999999998
> 
> always do the lock:
> time cost: 0.000095
> time cost: 0.000099
> time cost: 0.000123
> time cost: 0.000124
> time cost: 0.000092
> time cost: 0.000099
> time cost: 0.000092
> time cost: 0.000092
> time cost: 0.000093
> time cost: 0.000094
>              time cost avg: 0.00010029999999999997
> 
> So about 1.724% increment
> 
> [1] the first run is not showed here since that try does real IO
>      and diff a lot.
> [2] the time cost calculation is by gettimeofday()
> [3] run it in a VM which has 2 CPUs and 1GB memory.
> 
> Regards,
> Hao

I did another similar test more times (100 rounds): about a 1.4%
increase. How about:
if CONFIG_IO_URING: remove the f_count==1 logic
else: keep the old logic.
diff mbox series

Patch

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 08720c7bd92f..6c0d521135a6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -65,6 +65,7 @@  struct io_uring_sqe {
 		__u32		xattr_flags;
 		__u32		msg_ring_flags;
 		__u32		uring_cmd_flags;
+		__u32		getdents_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -235,6 +236,7 @@  enum io_uring_op {
 	IORING_OP_URING_CMD,
 	IORING_OP_SEND_ZC,
 	IORING_OP_SENDMSG_ZC,
+	IORING_OP_GETDENTS,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -273,6 +275,11 @@  enum io_uring_op {
  */
 #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
 
+/*
+ * sqe->getdents_flags
+ */
+#define IORING_GETDENTS_REWIND	(1U << 0)
+
 /*
  * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
  * command flags for POLL_ADD are stored in sqe->len.
diff --git a/io_uring/fs.c b/io_uring/fs.c
index f6a69a549fd4..77f00577e09c 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -47,6 +47,13 @@  struct io_link {
 	int				flags;
 };
 
+struct io_getdents {
+	struct file			*file;
+	struct linux_dirent64 __user	*dirent;
+	unsigned int			count;
+	int				flags;
+};
+
 int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
@@ -291,3 +298,56 @@  void io_link_cleanup(struct io_kiocb *req)
 	putname(sl->oldpath);
 	putname(sl->newpath);
 }
+
+int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
+
+	if (READ_ONCE(sqe->off) != 0)
+		return -EINVAL;
+
+	gd->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	gd->count = READ_ONCE(sqe->len);
+
+	return 0;
+}
+
+int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
+	struct file *file;
+	unsigned long getdents_flags = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	bool should_lock = false;
+	int ret;
+
+	if (force_nonblock) {
+		if (!(req->file->f_mode & FMODE_NOWAIT))
+			return -EAGAIN;
+
+		getdents_flags = DIR_CONTEXT_F_NOWAIT;
+	}
+
+	file = req->file;
+	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
+		if (file_count(file) > 1)
+			should_lock = true;
+	}
+	if (should_lock) {
+		if (!force_nonblock)
+			mutex_lock(&file->f_pos_lock);
+		else if (!mutex_trylock(&file->f_pos_lock))
+			return -EAGAIN;
+	}
+
+	ret = vfs_getdents(file, gd->dirent, gd->count, getdents_flags);
+	if (should_lock)
+		mutex_unlock(&file->f_pos_lock);
+
+	if (ret == -EAGAIN && force_nonblock)
+		return -EAGAIN;
+
+	io_req_set_res(req, ret, 0);
+	return 0;
+}
+
diff --git a/io_uring/fs.h b/io_uring/fs.h
index 0bb5efe3d6bb..f83a6f3a678d 100644
--- a/io_uring/fs.h
+++ b/io_uring/fs.h
@@ -18,3 +18,6 @@  int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags);
 int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_linkat(struct io_kiocb *req, unsigned int issue_flags);
 void io_link_cleanup(struct io_kiocb *req);
+
+int io_getdents_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_getdents(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 3b9c6489b8b6..1bae6b2a8d0b 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -428,6 +428,11 @@  const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
+	[IORING_OP_GETDENTS] = {
+		.needs_file		= 1,
+		.prep			= io_getdents_prep,
+		.issue			= io_getdents,
+	},
 };
 
 
@@ -648,6 +653,9 @@  const struct io_cold_def io_cold_defs[] = {
 		.fail			= io_sendrecv_fail,
 #endif
 	},
+	[IORING_OP_GETDENTS] = {
+		.name			= "GETDENTS",
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)