[RFC,11/15] epoll: offload polling to a work in case of epfd polled from userspace

Message ID	20190109164025.24554-12-rpenyaev@suse.de (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-fsdevel-owner@kernel.org> From: Roman Penyaev <rpenyaev@suse.de> Cc: Roman Penyaev <rpenyaev@suse.de>, Andrew Morton <akpm@linux-foundation.org>, Davidlohr Bueso <dbueso@suse.de>, Jason Baron <jbaron@akamai.com>, Al Viro <viro@zeniv.linux.org.uk>, "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>, Linus Torvalds <torvalds@linux-foundation.org>, Andrea Parri <andrea.parri@amarulasolutions.com>, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org Subject: [RFC PATCH 11/15] epoll: offload polling to a work in case of epfd polled from userspace Date: Wed, 9 Jan 2019 17:40:21 +0100 Message-Id: <20190109164025.24554-12-rpenyaev@suse.de> In-Reply-To: <20190109164025.24554-1-rpenyaev@suse.de> References: <20190109164025.24554-1-rpenyaev@suse.de> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit To: unlisted-recipients:; (no To-header on input) Sender: linux-fsdevel-owner@vger.kernel.org Precedence: bulk
Series	epoll: support pollable epoll from userspace \| expand [RFC,00/15] epoll: support pollable epoll from userspace [RFC,04/15] epoll: move private helpers from a header to the source [RFC,05/15] epoll: introduce user header structure and user index for polling from userspace [RFC,06/15] epoll: introduce various of helpers for user structure lengths calculations [RFC,07/15] epoll: extend epitem struct with new members for polling from userspace [RFC,08/15] epoll: some sanity flags checks for epoll syscalls for polled epfd from userspace [RFC,09/15] epoll: introduce stand-alone helpers for polling from userspace [RFC,10/15] epoll: support polling from userspace for ep_insert() [RFC,11/15] epoll: offload polling to a work in case of epfd polled from userspace [RFC,12/15] epoll: support polling from userspace for ep_remove() [RFC,13/15] epoll: support polling from userspace for ep_modify() [RFC,14/15] epoll: support polling from userspace for ep_poll() [RFC,15/15] epoll: support mapping for epfd when polled from userspace

diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4618db9c077c..2af849e6c7a5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1624,9 +1624,8 @@ static inline bool chain_epi_lockless(struct epitem *epi) } /* - * This is the callback that is passed to the wait queue wakeup - * mechanism. It is called by the stored file descriptors when they - * have events to report. + * This is the callback that is called directly from wait queue wakeup or + * from a work. * * This callback takes a read lock in order not to content with concurrent * events from another file descriptors, thus all modifications to ->rdllist @@ -1641,14 +1640,11 @@ static inline bool chain_epi_lockless(struct epitem *epi) * queues are used should be detected accordingly. This is detected using * cmpxchg() operation. */ -static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +static int ep_poll_callback(struct epitem *epi, __poll_t pollflags) { - int pwake = 0; - struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; - __poll_t pollflags = key_to_poll(key); + int pwake = 0, ewake = 0; unsigned long flags; - int ewake = 0; read_lock_irqsave(&ep->lock, flags); @@ -1666,12 +1662,32 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the - * callback. We need to be able to handle both cases here, hence the - * test for "key" != NULL before the event match test. + * callback (for ep_poll_callback() case special worker is used). + * We need to be able to handle both cases here, hence the test + * for "key" != NULL before the event match test. 
*/ if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; + if (ep_polled_by_user(ep)) { + __poll_t revents; + + if (ep_events_routed_to_uring(ep)) { + ep_add_event_to_uring(epi, pollflags); + goto wakeup; + } + + WARN_ON(!pollflags); + revents = (epi->event.events & ~EP_PRIVATE_BITS) & pollflags; + + /* + * Keep active events up-to-date for further transfer from + * klists to uring. + */ + __atomic_fetch_or(&epi->ready_events, revents, + __ATOMIC_RELAXED); + } + /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() @@ -1679,6 +1695,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { + WARN_ON(ep_polled_by_user(ep)); if (epi->next == EP_UNACTIVE_PTR && chain_epi_lockless(epi)) ep_pm_stay_awake_rcu(epi); @@ -1691,6 +1708,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v ep_pm_stay_awake_rcu(epi); } +wakeup: /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. @@ -1727,23 +1745,67 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; - if (pollflags & POLLFREE) { - /* - * If we race with ep_remove_wait_queue() it can miss - * ->whead = NULL and do another remove_wait_queue() after - * us, so we can't use __remove_wait_queue(). 
- */ list_del_init(&wait->entry); + return ewake; +} + +static void ep_poll_callback_work(struct work_struct *work) +{ + struct epitem *epi = container_of(work, typeof(*epi), work); + __poll_t pollflags; + poll_table pt; + + WARN_ON(!ep_polled_by_user(epi->ep)); + + init_poll_funcptr(&pt, NULL); + pollflags = ep_item_poll(epi, &pt, 1); + + (void)ep_poll_callback(epi, pollflags); +} + +/* + * This is the callback that is passed to the wait queue wakeup + * mechanism. It is called by the stored file descriptors when they + * have events to report. + */ +static int ep_poll_wakeup(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + + struct epitem *epi = ep_item_from_wait(wait); + struct eventpoll *ep = epi->ep; + __poll_t pollflags = key_to_poll(key); + int rc; + + if (!ep_polled_by_user(ep) || pollflags) { + rc = ep_poll_callback(epi, pollflags); + + if (pollflags & POLLFREE) { + /* + * If we race with ep_remove_wait_queue() it can miss + * ->whead = NULL and do another remove_wait_queue() + * after us, so we can't use __remove_wait_queue(). + */ + list_del_init(&wait->entry); + /* + * ->whead != NULL protects us from the race with + * ep_free() or ep_remove(), ep_remove_wait_queue() + * takes whead->lock held by the caller. Once we nullify + * it, nothing protects ep/epi or even wait. + */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } + } else { + schedule_work(&epi->work); + /* - * ->whead != NULL protects us from the race with ep_free() - * or ep_remove(), ep_remove_wait_queue() takes whead->lock - * held by the caller. Once we nullify it, nothing protects - * ep/epi or even wait. + * Here on this path we are absolutely sure that for file + * descriptors which are pollable from userspace we do not + * support EPOLLEXCLUSIVE, so it is safe to return 1. 
*/ - smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + rc = 1; } - return ewake; + return rc; } /* @@ -1757,7 +1819,7 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, struct eppoll_entry *pwq; if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { - init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); + init_waitqueue_func_entry(&pwq->wait, ep_poll_wakeup); pwq->whead = whead; pwq->base = epi; if (epi->event.events & EPOLLEXCLUSIVE) @@ -1990,6 +2052,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->pwqlist); + INIT_WORK(&epi->work, ep_poll_callback_work); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event;

[RFC,11/15] epoll: offload polling to a work in case of epfd polled from userspace

Commit Message

Patch