diff mbox series

[v4] eventfd: convert to f_op->read_iter()

Message ID 6b29f015-bd7c-0601-cf94-2c077285b933@kernel.dk (mailing list archive)
State New, archived
Headers show
Series [v4] eventfd: convert to f_op->read_iter() | expand

Commit Message

Jens Axboe May 1, 2020, 7:11 p.m. UTC
eventfd is using ->read() as it's file_operations read handler, but
this prevents passing in information about whether a given IO operation
is blocking or not. We can only use the file flags for that. To support
async (-EAGAIN/poll based) retries for io_uring, we need ->read_iter()
support. Convert eventfd to using ->read_iter().

With ->read_iter(), we can support IOCB_NOWAIT. Ensure the fd setup
is done such that we set file->f_mode with FMODE_NOWAIT.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---

Since v3:

- Ensure we fiddle ->f_mode before doing fd_install() on the fd

Since v2:

- Cleanup eventfd_read() as per Al's suggestions

Since v1:

- Add FMODE_NOWAIT to the eventfd file

Comments

Al Viro May 1, 2020, 11:12 p.m. UTC | #1
On Fri, May 01, 2020 at 01:11:09PM -0600, Jens Axboe wrote:
> +	flags &= EFD_SHARED_FCNTL_FLAGS;
> +	flags |= O_RDWR;
> +	fd = get_unused_fd_flags(flags);
>  	if (fd < 0)
> -		eventfd_free_ctx(ctx);
> +		goto err;
> +
> +	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
> +	if (IS_ERR(file)) {
> +		put_unused_fd(fd);
> +		fd = PTR_ERR(file);
> +		goto err;
> +	}
>  
> +	file->f_mode |= FMODE_NOWAIT;
> +	fd_install(fd, file);
> +	return fd;
> +err:
> +	eventfd_free_ctx(ctx);
>  	return fd;
>  }

Looks sane...  I can take it via vfs.git, or leave it for you if you
have other stuff in the same area...
Jens Axboe May 1, 2020, 11:54 p.m. UTC | #2
On 5/1/20 5:12 PM, Al Viro wrote:
> On Fri, May 01, 2020 at 01:11:09PM -0600, Jens Axboe wrote:
>> +	flags &= EFD_SHARED_FCNTL_FLAGS;
>> +	flags |= O_RDWR;
>> +	fd = get_unused_fd_flags(flags);
>>  	if (fd < 0)
>> -		eventfd_free_ctx(ctx);
>> +		goto err;
>> +
>> +	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
>> +	if (IS_ERR(file)) {
>> +		put_unused_fd(fd);
>> +		fd = PTR_ERR(file);
>> +		goto err;
>> +	}
>>  
>> +	file->f_mode |= FMODE_NOWAIT;
>> +	fd_install(fd, file);
>> +	return fd;
>> +err:
>> +	eventfd_free_ctx(ctx);
>>  	return fd;
>>  }
> 
> Looks sane...  I can take it via vfs.git, or leave it for you if you
> have other stuff in the same area...

Would be great if you can queue it up in vfs.git, thanks! Don't have
anything else that'd conflict with this.
Al Viro May 3, 2020, 1:46 p.m. UTC | #3
On Fri, May 01, 2020 at 05:54:09PM -0600, Jens Axboe wrote:
> On 5/1/20 5:12 PM, Al Viro wrote:
> > On Fri, May 01, 2020 at 01:11:09PM -0600, Jens Axboe wrote:
> >> +	flags &= EFD_SHARED_FCNTL_FLAGS;
> >> +	flags |= O_RDWR;
> >> +	fd = get_unused_fd_flags(flags);
> >>  	if (fd < 0)
> >> -		eventfd_free_ctx(ctx);
> >> +		goto err;
> >> +
> >> +	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
> >> +	if (IS_ERR(file)) {
> >> +		put_unused_fd(fd);
> >> +		fd = PTR_ERR(file);
> >> +		goto err;
> >> +	}
> >>  
> >> +	file->f_mode |= FMODE_NOWAIT;
> >> +	fd_install(fd, file);
> >> +	return fd;
> >> +err:
> >> +	eventfd_free_ctx(ctx);
> >>  	return fd;
> >>  }
> > 
> > Looks sane...  I can take it via vfs.git, or leave it for you if you
> > have other stuff in the same area...
> 
> Would be great if you can queue it up in vfs.git, thanks! Don't have
> anything else that'd conflict with this.

Applied; BTW, what happens if
        ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
fails?  Quitely succeed with BS value (-ENOSPC/-ENOMEM) shown by
eventfd_show_fdinfo()?
Jens Axboe May 3, 2020, 2:42 p.m. UTC | #4
On 5/3/20 7:46 AM, Al Viro wrote:
> On Fri, May 01, 2020 at 05:54:09PM -0600, Jens Axboe wrote:
>> On 5/1/20 5:12 PM, Al Viro wrote:
>>> On Fri, May 01, 2020 at 01:11:09PM -0600, Jens Axboe wrote:
>>>> +	flags &= EFD_SHARED_FCNTL_FLAGS;
>>>> +	flags |= O_RDWR;
>>>> +	fd = get_unused_fd_flags(flags);
>>>>  	if (fd < 0)
>>>> -		eventfd_free_ctx(ctx);
>>>> +		goto err;
>>>> +
>>>> +	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
>>>> +	if (IS_ERR(file)) {
>>>> +		put_unused_fd(fd);
>>>> +		fd = PTR_ERR(file);
>>>> +		goto err;
>>>> +	}
>>>>  
>>>> +	file->f_mode |= FMODE_NOWAIT;
>>>> +	fd_install(fd, file);
>>>> +	return fd;
>>>> +err:
>>>> +	eventfd_free_ctx(ctx);
>>>>  	return fd;
>>>>  }
>>>
>>> Looks sane...  I can take it via vfs.git, or leave it for you if you
>>> have other stuff in the same area...
>>
>> Would be great if you can queue it up in vfs.git, thanks! Don't have
>> anything else that'd conflict with this.
> 
> Applied; BTW, what happens if
>         ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
> fails?  Quitely succeed with BS value (-ENOSPC/-ENOMEM) shown by
> eventfd_show_fdinfo()?

Huh yeah that's odd, not sure how I missed that when touching code
near it. Want a followup patch to fix that issue?
Jens Axboe May 3, 2020, 4:50 p.m. UTC | #5
On 5/3/20 8:42 AM, Jens Axboe wrote:
> On 5/3/20 7:46 AM, Al Viro wrote:
>> On Fri, May 01, 2020 at 05:54:09PM -0600, Jens Axboe wrote:
>>> On 5/1/20 5:12 PM, Al Viro wrote:
>>>> On Fri, May 01, 2020 at 01:11:09PM -0600, Jens Axboe wrote:
>>>>> +	flags &= EFD_SHARED_FCNTL_FLAGS;
>>>>> +	flags |= O_RDWR;
>>>>> +	fd = get_unused_fd_flags(flags);
>>>>>  	if (fd < 0)
>>>>> -		eventfd_free_ctx(ctx);
>>>>> +		goto err;
>>>>> +
>>>>> +	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
>>>>> +	if (IS_ERR(file)) {
>>>>> +		put_unused_fd(fd);
>>>>> +		fd = PTR_ERR(file);
>>>>> +		goto err;
>>>>> +	}
>>>>>  
>>>>> +	file->f_mode |= FMODE_NOWAIT;
>>>>> +	fd_install(fd, file);
>>>>> +	return fd;
>>>>> +err:
>>>>> +	eventfd_free_ctx(ctx);
>>>>>  	return fd;
>>>>>  }
>>>>
>>>> Looks sane...  I can take it via vfs.git, or leave it for you if you
>>>> have other stuff in the same area...
>>>
>>> Would be great if you can queue it up in vfs.git, thanks! Don't have
>>> anything else that'd conflict with this.
>>
>> Applied; BTW, what happens if
>>         ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
>> fails?  Quitely succeed with BS value (-ENOSPC/-ENOMEM) shown by
>> eventfd_show_fdinfo()?
> 
> Huh yeah that's odd, not sure how I missed that when touching code
> near it. Want a followup patch to fix that issue?

This should do the trick. Ideally we'd change the order of these, and
shove this fix into 5.7, but not sure it's super important since this
bug is pretty old. Would make stable backports easier, though...
Let me know how you want to handle it, as it'll impact the one you
have already queued up.


diff --git a/fs/eventfd.c b/fs/eventfd.c
index 20f0fd4d56e1..96081efdd0c9 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -422,6 +422,10 @@ static int do_eventfd(unsigned int count, int flags)
 	ctx->count = count;
 	ctx->flags = flags;
 	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+	if (ctx->id < 0) {
+		fd = ctx->id;
+		goto err;
+	}
 
 	flags &= EFD_SHARED_FCNTL_FLAGS;
 	flags |= O_RDWR;
Christoph Hellwig May 4, 2020, 12:39 p.m. UTC | #6
On Fri, May 01, 2020 at 01:11:09PM -0600, Jens Axboe wrote:
> eventfd is using ->read() as it's file_operations read handler, but
> this prevents passing in information about whether a given IO operation
> is blocking or not. We can only use the file flags for that. To support
> async (-EAGAIN/poll based) retries for io_uring, we need ->read_iter()
> support. Convert eventfd to using ->read_iter().
> 
> With ->read_iter(), we can support IOCB_NOWAIT. Ensure the fd setup
> is done such that we set file->f_mode with FMODE_NOWAIT.

Can you add a anon_inode_getfd_mode that passes extra flags for f_mode
instead of opencoding it?  Especially as I expect more users that might
want to handle IOCB_NOWAIT.
diff mbox series

Patch

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 78e41c7c3d05..20f0fd4d56e1 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -216,32 +216,32 @@  int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w
 }
 EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
 
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
-			    loff_t *ppos)
+static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
 {
+	struct file *file = iocb->ki_filp;
 	struct eventfd_ctx *ctx = file->private_data;
-	ssize_t res;
 	__u64 ucnt = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
-	if (count < sizeof(ucnt))
+	if (iov_iter_count(to) < sizeof(ucnt))
 		return -EINVAL;
-
 	spin_lock_irq(&ctx->wqh.lock);
-	res = -EAGAIN;
-	if (ctx->count > 0)
-		res = sizeof(ucnt);
-	else if (!(file->f_flags & O_NONBLOCK)) {
+	if (!ctx->count) {
+		if ((file->f_flags & O_NONBLOCK) ||
+		    (iocb->ki_flags & IOCB_NOWAIT)) {
+			spin_unlock_irq(&ctx->wqh.lock);
+			return -EAGAIN;
+		}
 		__add_wait_queue(&ctx->wqh, &wait);
 		for (;;) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (ctx->count > 0) {
-				res = sizeof(ucnt);
+			if (ctx->count)
 				break;
-			}
 			if (signal_pending(current)) {
-				res = -ERESTARTSYS;
-				break;
+				__remove_wait_queue(&ctx->wqh, &wait);
+				__set_current_state(TASK_RUNNING);
+				spin_unlock_irq(&ctx->wqh.lock);
+				return -ERESTARTSYS;
 			}
 			spin_unlock_irq(&ctx->wqh.lock);
 			schedule();
@@ -250,17 +250,14 @@  static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (likely(res > 0)) {
-		eventfd_ctx_do_read(ctx, &ucnt);
-		if (waitqueue_active(&ctx->wqh))
-			wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
-	}
+	eventfd_ctx_do_read(ctx, &ucnt);
+	if (waitqueue_active(&ctx->wqh))
+		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
 	spin_unlock_irq(&ctx->wqh.lock);
-
-	if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
+	if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
 		return -EFAULT;
 
-	return res;
+	return sizeof(ucnt);
 }
 
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
@@ -329,7 +326,7 @@  static const struct file_operations eventfd_fops = {
 #endif
 	.release	= eventfd_release,
 	.poll		= eventfd_poll,
-	.read		= eventfd_read,
+	.read_iter	= eventfd_read,
 	.write		= eventfd_write,
 	.llseek		= noop_llseek,
 };
@@ -406,6 +403,7 @@  EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 static int do_eventfd(unsigned int count, int flags)
 {
 	struct eventfd_ctx *ctx;
+	struct file *file;
 	int fd;
 
 	/* Check the EFD_* constants for consistency.  */
@@ -425,11 +423,24 @@  static int do_eventfd(unsigned int count, int flags)
 	ctx->flags = flags;
 	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
 
-	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
-			      O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
+	flags &= EFD_SHARED_FCNTL_FLAGS;
+	flags |= O_RDWR;
+	fd = get_unused_fd_flags(flags);
 	if (fd < 0)
-		eventfd_free_ctx(ctx);
+		goto err;
+
+	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		fd = PTR_ERR(file);
+		goto err;
+	}
 
+	file->f_mode |= FMODE_NOWAIT;
+	fd_install(fd, file);
+	return fd;
+err:
+	eventfd_free_ctx(ctx);
 	return fd;
 }