diff mbox series

eventfd: Introduce EFD_ZERO_ON_WAKE

Message ID 20210310215023.4129753-1-bgeffon@google.com (mailing list archive)
State New, archived
Headers show
Series eventfd: Introduce EFD_ZERO_ON_WAKE | expand

Commit Message

Brian Geffon March 10, 2021, 9:50 p.m. UTC
This patch introduces a new flag to eventfd, called EFD_ZERO_ON_WAKE.
This change is primarily introduced for use cases which do not care about
the value stored in the eventfd itself. Such existing use cases require an
additional read syscall to clear the count.

This flag provides the following guarantees:

(1) Writes can never block or return EAGAIN.
    The reason this is true is because we don't actually need to store the
    value and as a result the internal value is only changed between 0 and
    1 and back to 0. Therefore POLLERR and POLLOUT are never possible
    outcomes. A poll with POLLOUT or a write will always immediately
    return regardless of EFD_NONBLOCK.

(2) Read / POLLIN result in the internal value being reset to 0.
    When EFD_NONBLOCK is set reads when the internal value is 0 will
    immediately return with EAGAIN, as it always has. Similiarly, when
    a read is performed without EFD_NONBLOCK it will block until a write
    occurs. In both cases after the read completes successfully the
    internal value is reset to 0. When polling with POLLIN, upon return
    of a POLLIN event the internal value will be reset to 0.

Signed-off-by: Brian Geffon <bgeffon@google.com>
---
 fs/eventfd.c            | 39 +++++++++++++++++++++++++++++++++++++--
 include/linux/eventfd.h |  8 +++++++-
 2 files changed, 44 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/fs/eventfd.c b/fs/eventfd.c
index e265b6dd4f34..56bf04d6461e 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -172,8 +172,21 @@  static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 	 */
 	count = READ_ONCE(ctx->count);
 
-	if (count > 0)
+	if (count > 0) {
+		if ((ctx->flags & EFD_ZERO_ON_WAKE) &&
+				(poll_requested_events(wait) & EPOLLIN)) {
+			/*
+			 * We're going to cause a wake on EPOLLIN, we need to zero the count.
+			 * We validate that EPOLLIN is a requested event because if the user
+			 * did something odd like POLLPRI we wouldn't want to zero the count
+			 * if no wake happens.
+			 */
+			spin_lock_irq(&ctx->wqh.lock);
+			ctx->count = 0;
+			spin_unlock_irq(&ctx->wqh.lock);
+		}
 		events |= EPOLLIN;
+	}
 	if (count == ULLONG_MAX)
 		events |= EPOLLERR;
 	if (ULLONG_MAX - 1 > count)
@@ -239,8 +252,11 @@  static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
 		__add_wait_queue(&ctx->wqh, &wait);
 		for (;;) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (ctx->count)
+			if (ctx->count) {
+				if (ctx->flags & EFD_ZERO_ON_WAKE)
+					ctx->count = 0;
 				break;
+			}
 			if (signal_pending(current)) {
 				__remove_wait_queue(&ctx->wqh, &wait);
 				__set_current_state(TASK_RUNNING);
@@ -280,6 +296,18 @@  static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
 	res = -EAGAIN;
+
+	/*
+	 * In the case of EFD_ZERO_ON_WAKE the actual count is never needed, for this
+	 * reason we only adjust it to set it from 0 to 1 or 1 to 0. This means that
+	 * write will never return EWOULDBLOCK or block, because there is always
+	 * going to be enough space to write as the amount we will increment could
+	 * be at most 1 as it's clamped below. Additionally, we know that POLLERR
+	 * cannot be returned when EFD_ZERO_ON_WAKE is used for the same reason.
+	 */
+	if (ctx->flags & EFD_ZERO_ON_WAKE)
+		ucnt = (ctx->count == 0) ? 1 : 0;
+
 	if (ULLONG_MAX - ctx->count > ucnt)
 		res = sizeof(ucnt);
 	else if (!(file->f_flags & O_NONBLOCK)) {
@@ -414,9 +442,16 @@  static int do_eventfd(unsigned int count, int flags)
 	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
 
+	/* O_NOFOLLOW has been repurposed as EFD_ZERO_ON_WAKE */
+	BUILD_BUG_ON(EFD_ZERO_ON_WAKE != O_NOFOLLOW);
+
 	if (flags & ~EFD_FLAGS_SET)
 		return -EINVAL;
 
+	/* The semaphore semantics would be lost if using EFD_ZERO_ON_WAKE */
+	if ((flags & EFD_ZERO_ON_WAKE) && (flags & EFD_SEMAPHORE))
+		return -EINVAL;
+
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index fa0a524baed0..19cab0b654a4 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -26,8 +26,14 @@ 
 #define EFD_CLOEXEC O_CLOEXEC
 #define EFD_NONBLOCK O_NONBLOCK
 
+/*
+ * We intentionally use the value of O_NOFOLLOW for EFD_ZERO_ON_WAKE
+ * because O_NOFOLLOW would have no meaning with an eventfd.
+ */
+#define EFD_ZERO_ON_WAKE O_NOFOLLOW
+
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
-#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
+#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE | EFD_ZERO_ON_WAKE)
 
 struct eventfd_ctx;
 struct file;