Message ID | 20230821202016.2910321-5-willy@infradead.org |
---|---|
State | New |
Series | Convert perf ringbuffer to folios |
On 8/22/23 04:20, Matthew Wilcox (Oracle) wrote:
> Instead of allocating a non-compound page and splitting it, allocate
> a folio and make its refcount the count of the number of pages in it.
> That way, when we free each page in the folio, we'll only actually free
> it when the last page in the folio is freed. Keeping the memory intact
> is better for the MM system than allocating it and splitting it.
>
> Now, instead of setting each page->mapping, we only set folio->mapping
> which is better for our cacheline usage, as well as helping towards the
> goal of eliminating page->mapping. We remove the setting of page->index;
> I do not believe this is needed. And we return with the folio locked,
> which the fault handler should have been doing all along.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  kernel/events/core.c        | 13 +++++++---
>  kernel/events/ring_buffer.c | 51 ++++++++++++++++---------------------
>  2 files changed, 31 insertions(+), 33 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4c72a41f11af..59d4f7c48c8c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -29,6 +29,7 @@
>  #include <linux/vmalloc.h>
>  #include <linux/hardirq.h>
>  #include <linux/hugetlb.h>
> +#include <linux/pagemap.h>
>  #include <linux/rculist.h>
>  #include <linux/uaccess.h>
>  #include <linux/syscalls.h>
> @@ -6083,6 +6084,7 @@ static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
>  {
>          struct perf_event *event = vmf->vma->vm_file->private_data;
>          struct perf_buffer *rb;
> +        struct folio *folio;
>          vm_fault_t ret = VM_FAULT_SIGBUS;
>
>          if (vmf->flags & FAULT_FLAG_MKWRITE) {
> @@ -6102,12 +6104,15 @@ static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
>          vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
>          if (!vmf->page)
>                  goto unlock;
> +        folio = page_folio(vmf->page);
>
> -        get_page(vmf->page);
> -        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
> -        vmf->page->index = vmf->pgoff;
> +        folio_get(folio);
> +        rcu_read_unlock();
> +        folio_lock(folio);
> +        if (!folio->mapping)
> +                folio->mapping = vmf->vma->vm_file->f_mapping;
>
> -        ret = 0;
> +        return VM_FAULT_LOCKED;

In __do_fault():

        if (unlikely(!(ret & VM_FAULT_LOCKED)))
                lock_page(vmf->page);
        else
                VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);

As we lock the folio, I am not sure whether !PageLocked(vmf->page) can be
true here. My understanding is yes, if vmf->pgoff belongs to a tail page.
Did I miss something here?

Regards
Yin, Fengwei

>  unlock:
>          rcu_read_unlock();
>
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 56939dc3bf33..0a026e5ff4f5 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -606,39 +606,28 @@ long perf_output_copy_aux(struct perf_output_handle *aux_handle,
>
>  #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
>
> -static struct page *rb_alloc_aux_page(int node, int order)
> +static struct folio *rb_alloc_aux_folio(int node, int order)
>  {
> -        struct page *page;
> +        struct folio *folio;
>
>          if (order > MAX_ORDER)
>                  order = MAX_ORDER;
>
>          do {
> -                page = alloc_pages_node(node, PERF_AUX_GFP, order);
> -        } while (!page && order--);
> -
> -        if (page && order) {
> -                /*
> -                 * Communicate the allocation size to the driver:
> -                 * if we managed to secure a high-order allocation,
> -                 * set its first page's private to this order;
> -                 * !PagePrivate(page) means it's just a normal page.
> -                 */
> -                split_page(page, order);
> -                SetPagePrivate(page);
> -                set_page_private(page, order);
> -        }
> +                folio = __folio_alloc_node(PERF_AUX_GFP, order, node);
> +        } while (!folio && order--);
>
> -        return page;
> +        if (order)
> +                folio_ref_add(folio, (1 << order) - 1);
> +        return folio;
>  }
>
>  static void rb_free_aux_page(struct perf_buffer *rb, int idx)
>  {
> -        struct page *page = virt_to_page(rb->aux_pages[idx]);
> +        struct folio *folio = virt_to_folio(rb->aux_pages[idx]);
>
> -        ClearPagePrivate(page);
> -        page->mapping = NULL;
> -        __free_page(page);
> +        folio->mapping = NULL;
> +        folio_put(folio);
>  }
>
>  static void __rb_free_aux(struct perf_buffer *rb)
> @@ -672,7 +661,7 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
>                   pgoff_t pgoff, int nr_pages, long watermark, int flags)
>  {
>          bool overwrite = !(flags & RING_BUFFER_WRITABLE);
> -        int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
> +        int node = (event->cpu == -1) ? numa_mem_id() : cpu_to_node(event->cpu);
>          int ret = -ENOMEM, max_order;
>
>          if (!has_aux(event))
> @@ -707,17 +696,21 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
>
>          rb->free_aux = event->pmu->free_aux;
>          for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
> -                struct page *page;
> -                int last, order;
> +                struct folio *folio;
> +                unsigned int i, nr, order;
> +                void *addr;
>
>                  order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
> -                page = rb_alloc_aux_page(node, order);
> -                if (!page)
> +                folio = rb_alloc_aux_folio(node, order);
> +                if (!folio)
>                          goto out;
> +                addr = folio_address(folio);
> +                nr = folio_nr_pages(folio);
>
> -                for (last = rb->aux_nr_pages + (1 << page_private(page));
> -                     last > rb->aux_nr_pages; rb->aux_nr_pages++)
> -                        rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
> +                for (i = 0; i < nr; i++) {
> +                        rb->aux_pages[rb->aux_nr_pages++] = addr;
> +                        addr += PAGE_SIZE;
> +                }
>          }
>
>          /*
On Wed, Aug 23, 2023 at 03:38:13PM +0800, Yin Fengwei wrote:
> On 8/22/23 04:20, Matthew Wilcox (Oracle) wrote:
> > -        get_page(vmf->page);
> > -        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
> > -        vmf->page->index = vmf->pgoff;
> > +        folio_get(folio);
> > +        rcu_read_unlock();
> > +        folio_lock(folio);
> > +        if (!folio->mapping)
> > +                folio->mapping = vmf->vma->vm_file->f_mapping;
> >
> > -        ret = 0;
> > +        return VM_FAULT_LOCKED;
> In __do_fault():
>
>         if (unlikely(!(ret & VM_FAULT_LOCKED)))
>                 lock_page(vmf->page);
>         else
>                 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
>
> As we lock the folio, I am not sure whether !PageLocked(vmf->page) can be
> true here. My understanding is yes, if vmf->pgoff belongs to a tail page.
> Did I miss something here?

There's only one lock bit per folio; there's no lock bit for individual
pages. When we check PageLocked() on a tail page, it redirects to the
head page.

__PAGEFLAG(Locked, locked, PF_NO_TAIL)

#define PF_NO_TAIL(page, enforce) ({                                    \
                VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);     \
                PF_POISONED_CHECK(compound_head(page)); })

#define TESTPAGEFLAG(uname, lname, policy)                              \
static __always_inline int Page##uname(struct page *page)              \
        { return test_bit(PG_##lname, &policy(page, 0)->flags); }

and that expands out to:

static __always_inline int PageLocked(struct page *page)
        { return test_bit(PG_locked, &compound_head(page)->flags); }
On 8/23/2023 8:23 PM, Matthew Wilcox wrote:
> On Wed, Aug 23, 2023 at 03:38:13PM +0800, Yin Fengwei wrote:
>> On 8/22/23 04:20, Matthew Wilcox (Oracle) wrote:
>>> -        get_page(vmf->page);
>>> -        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
>>> -        vmf->page->index = vmf->pgoff;
>>> +        folio_get(folio);
>>> +        rcu_read_unlock();
>>> +        folio_lock(folio);
>>> +        if (!folio->mapping)
>>> +                folio->mapping = vmf->vma->vm_file->f_mapping;
>>>
>>> -        ret = 0;
>>> +        return VM_FAULT_LOCKED;
>> In __do_fault():
>>
>>         if (unlikely(!(ret & VM_FAULT_LOCKED)))
>>                 lock_page(vmf->page);
>>         else
>>                 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
>>
>> As we lock the folio, I am not sure whether !PageLocked(vmf->page) can be
>> true here. My understanding is yes, if vmf->pgoff belongs to a tail page.
>> Did I miss something here?
>
> There's only one lock bit per folio; there's no lock bit for individual
> pages. When we check PageLocked() on a tail page, it redirects to the
> head page.
>
> __PAGEFLAG(Locked, locked, PF_NO_TAIL)
>
> #define PF_NO_TAIL(page, enforce) ({                                   \
>                 VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);    \
>                 PF_POISONED_CHECK(compound_head(page)); })
>
> #define TESTPAGEFLAG(uname, lname, policy)                             \
> static __always_inline int Page##uname(struct page *page)             \
>         { return test_bit(PG_##lname, &policy(page, 0)->flags); }
>
> and that expands out to:
>
> static __always_inline int PageLocked(struct page *page)
>         { return test_bit(PG_locked, &compound_head(page)->flags); }
>
Ah. Here is the trick. Thanks a lot for the detailed explanation.

Regards
Yin, Fengwei
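To make the redirection concrete, here is a minimal illustrative sketch (not taken from the thread; the fault-handler context is assumed): locking the folio sets the single PG_locked bit on its head page, so PageLocked() is true for every page in the folio, tail pages included.

/* Sketch only: one lock bit per folio, shared by all of its pages. */
struct folio *folio = page_folio(vmf->page);    /* tail pages resolve to the head */

folio_lock(folio);
/* Even if vmf->page is a tail page, the check lands on the head page. */
VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
folio_unlock(folio);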
On Mon, Aug 21, 2023 at 09:20:16PM +0100, Matthew Wilcox (Oracle) wrote:
> Instead of allocating a non-compound page and splitting it, allocate
> a folio and make its refcount the count of the number of pages in it.
> That way, when we free each page in the folio, we'll only actually free
> it when the last page in the folio is freed. Keeping the memory intact
> is better for the MM system than allocating it and splitting it.
>
> Now, instead of setting each page->mapping, we only set folio->mapping
> which is better for our cacheline usage, as well as helping towards the
> goal of eliminating page->mapping. We remove the setting of page->index;
> I do not believe this is needed. And we return with the folio locked,
> which the fault handler should have been doing all along.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  kernel/events/core.c        | 13 +++++++---
>  kernel/events/ring_buffer.c | 51 ++++++++++++++++---------------------
>  2 files changed, 31 insertions(+), 33 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4c72a41f11af..59d4f7c48c8c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -29,6 +29,7 @@
>  #include <linux/vmalloc.h>
>  #include <linux/hardirq.h>
>  #include <linux/hugetlb.h>
> +#include <linux/pagemap.h>
>  #include <linux/rculist.h>
>  #include <linux/uaccess.h>
>  #include <linux/syscalls.h>
> @@ -6083,6 +6084,7 @@ static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
>  {
>          struct perf_event *event = vmf->vma->vm_file->private_data;
>          struct perf_buffer *rb;
> +        struct folio *folio;
>          vm_fault_t ret = VM_FAULT_SIGBUS;

Since we're explicitly returning VM_FAULT_LOCKED on success, perhaps worth
simply renaming the unlock label to error and returning VM_FAULT_SIGBUS
there? The FAULT_FLAG_MKWRITE branch can simply return
vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;

>
>          if (vmf->flags & FAULT_FLAG_MKWRITE) {
> @@ -6102,12 +6104,15 @@ static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
>          vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
>          if (!vmf->page)
>                  goto unlock;
> +        folio = page_folio(vmf->page);
>
> -        get_page(vmf->page);
> -        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
> -        vmf->page->index = vmf->pgoff;
> +        folio_get(folio);
> +        rcu_read_unlock();
> +        folio_lock(folio);
> +        if (!folio->mapping)
> +                folio->mapping = vmf->vma->vm_file->f_mapping;
>
> -        ret = 0;
> +        return VM_FAULT_LOCKED;
>  unlock:
>          rcu_read_unlock();
>
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 56939dc3bf33..0a026e5ff4f5 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -606,39 +606,28 @@ long perf_output_copy_aux(struct perf_output_handle *aux_handle,
>
>  #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
>
> -static struct page *rb_alloc_aux_page(int node, int order)
> +static struct folio *rb_alloc_aux_folio(int node, int order)
>  {
> -        struct page *page;
> +        struct folio *folio;
>
>          if (order > MAX_ORDER)
>                  order = MAX_ORDER;
>
>          do {
> -                page = alloc_pages_node(node, PERF_AUX_GFP, order);
> -        } while (!page && order--);
> -
> -        if (page && order) {
> -                /*
> -                 * Communicate the allocation size to the driver:
> -                 * if we managed to secure a high-order allocation,
> -                 * set its first page's private to this order;
> -                 * !PagePrivate(page) means it's just a normal page.
> -                 */
> -                split_page(page, order);
> -                SetPagePrivate(page);
> -                set_page_private(page, order);

I'm guessing this was used in conjunction with the page_private() logic
that existed below and can simply be discarded now?

> -        }
> +                folio = __folio_alloc_node(PERF_AUX_GFP, order, node);
> +        } while (!folio && order--);
>
> -        return page;
> +        if (order)
> +                folio_ref_add(folio, (1 << order) - 1);

Can't order go to -1 if we continue to fail to allocate a folio?

> +        return folio;
>  }
>
>  static void rb_free_aux_page(struct perf_buffer *rb, int idx)
>  {
> -        struct page *page = virt_to_page(rb->aux_pages[idx]);
> +        struct folio *folio = virt_to_folio(rb->aux_pages[idx]);
>
> -        ClearPagePrivate(page);
> -        page->mapping = NULL;
> -        __free_page(page);
> +        folio->mapping = NULL;
> +        folio_put(folio);
>  }
>
>  static void __rb_free_aux(struct perf_buffer *rb)
> @@ -672,7 +661,7 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
>                   pgoff_t pgoff, int nr_pages, long watermark, int flags)
>  {
>          bool overwrite = !(flags & RING_BUFFER_WRITABLE);
> -        int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
> +        int node = (event->cpu == -1) ? numa_mem_id() : cpu_to_node(event->cpu);
>          int ret = -ENOMEM, max_order;
>
>          if (!has_aux(event))
> @@ -707,17 +696,21 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
>
>          rb->free_aux = event->pmu->free_aux;
>          for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
> -                struct page *page;
> -                int last, order;
> +                struct folio *folio;
> +                unsigned int i, nr, order;
> +                void *addr;
>
>                  order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
> -                page = rb_alloc_aux_page(node, order);
> -                if (!page)
> +                folio = rb_alloc_aux_folio(node, order);
> +                if (!folio)
>                          goto out;
> +                addr = folio_address(folio);
> +                nr = folio_nr_pages(folio);

I was going to raise the unspeakably annoying nit about this function returning
a long, but then that made me wonder why, given folio->_folio_nr_pages is an
unsigned int, folio_nr_pages() returns a long in the first instance?

>
> -                for (last = rb->aux_nr_pages + (1 << page_private(page));
> -                     last > rb->aux_nr_pages; rb->aux_nr_pages++)
> -                        rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
> +                for (i = 0; i < nr; i++) {
> +                        rb->aux_pages[rb->aux_nr_pages++] = addr;
> +                        addr += PAGE_SIZE;
> +                }
>          }
>
>          /*
> --
> 2.40.1
>
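For reference, a rough sketch of the shape Lorenzo is suggesting here; this is illustrative only, not the committed code, and the validation that the diff context does not show is elided:

static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
        struct perf_event *event = vmf->vma->vm_file->private_data;
        struct perf_buffer *rb;
        struct folio *folio;

        if (vmf->flags & FAULT_FLAG_MKWRITE)
                return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto error;

        /* ... other checks from the original function elided ... */

        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
        if (!vmf->page)
                goto error;
        folio = page_folio(vmf->page);

        folio_get(folio);
        rcu_read_unlock();
        folio_lock(folio);
        if (!folio->mapping)
                folio->mapping = vmf->vma->vm_file->f_mapping;

        return VM_FAULT_LOCKED;

error:
        rcu_read_unlock();
        return VM_FAULT_SIGBUS;
}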
On Sun, Aug 27, 2023 at 09:01:04AM +0100, Lorenzo Stoakes wrote:
> On Mon, Aug 21, 2023 at 09:20:16PM +0100, Matthew Wilcox (Oracle) wrote:
> > -        if (page && order) {
> > -                /*
> > -                 * Communicate the allocation size to the driver:
> > -                 * if we managed to secure a high-order allocation,
> > -                 * set its first page's private to this order;
> > -                 * !PagePrivate(page) means it's just a normal page.
> > -                 */
> > -                split_page(page, order);
> > -                SetPagePrivate(page);
> > -                set_page_private(page, order);
>
> I'm guessing this was used in conjunction with the page_private() logic
> that existed below and can simply be discarded now?

As far as I can tell, yes.

> > -        }
> > +                folio = __folio_alloc_node(PERF_AUX_GFP, order, node);
> > +        } while (!folio && order--);
> >
> > -        return page;
> > +        if (order)
> > +                folio_ref_add(folio, (1 << order) - 1);
>
> Can't order go to -1 if we continue to fail to allocate a folio?

Hm, yes.

> > @@ -707,17 +696,21 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
> >
> >          rb->free_aux = event->pmu->free_aux;
> >          for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
> > -                struct page *page;
> > -                int last, order;
> > +                struct folio *folio;
> > +                unsigned int i, nr, order;
> > +                void *addr;
> >
> >                  order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
> > -                page = rb_alloc_aux_page(node, order);
> > -                if (!page)
> > +                folio = rb_alloc_aux_folio(node, order);
> > +                if (!folio)
> >                          goto out;
> > +                addr = folio_address(folio);
> > +                nr = folio_nr_pages(folio);
>
> I was going to raise the unspeakably annoying nit about this function returning
> a long, but then that made me wonder why, given folio->_folio_nr_pages is an
> unsigned int, folio_nr_pages() returns a long in the first instance?

Futureproofing the API. While I don't expect us to be allocating order-32
folios any time soon, and x86 has excluded p4d-sizes from the architecture,
I don't want to have to go through and audit everywhere when it turns out
we do want to support such a thing on some architecture.

(on x86 PMD - order 9, PUD - order 18, P4D - order 27, so we'd need a
hypothetical P5D level, or a machine with, say 16k pages, which would go
PMD-11, PUD-22, P4D-33, and be managing a folio of size 2^57 bytes)

But supporting nr_pages stored directly in the otherwise wasted space in
the folio seemed like a cheap win, and there was no reason to take up the
extra 32 bits.

Anyway, here, we know we're not getting back an arbitrary folio that
somebody else allocated, we're allocating for ourselves, and we know we're
allocating something rather smaller than that, so it's fine to calculate
in terms of unsigned int.

If I were writing in rust, I'd use usize, but since nobody's done the work
to make rust types available in C yet, I haven't done that either. We
actually use usize as a variable name in a couple of hundred places, so
turning it into a generally available type is harder than one might like.
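One way the order underflow acknowledged above could be avoided, as a sketch only (the eventual respin may well differ): once every allocation attempt has failed, the final order-- has left order at -1, so check the folio before using order.

static struct folio *rb_alloc_aux_folio(int node, int order)
{
        struct folio *folio;

        if (order > MAX_ORDER)
                order = MAX_ORDER;

        do {
                folio = __folio_alloc_node(PERF_AUX_GFP, order, node);
        } while (!folio && order--);

        /* All attempts failed: order has wrapped to -1, so bail out first. */
        if (!folio)
                return NULL;
        if (order > 0)
                folio_ref_add(folio, (1 << order) - 1);
        return folio;
}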
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c72a41f11af..59d4f7c48c8c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -29,6 +29,7 @@
 #include <linux/vmalloc.h>
 #include <linux/hardirq.h>
 #include <linux/hugetlb.h>
+#include <linux/pagemap.h>
 #include <linux/rculist.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
@@ -6083,6 +6084,7 @@ static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
 {
         struct perf_event *event = vmf->vma->vm_file->private_data;
         struct perf_buffer *rb;
+        struct folio *folio;
         vm_fault_t ret = VM_FAULT_SIGBUS;

         if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -6102,12 +6104,15 @@ static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
         if (!vmf->page)
                 goto unlock;
+        folio = page_folio(vmf->page);

-        get_page(vmf->page);
-        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
-        vmf->page->index = vmf->pgoff;
+        folio_get(folio);
+        rcu_read_unlock();
+        folio_lock(folio);
+        if (!folio->mapping)
+                folio->mapping = vmf->vma->vm_file->f_mapping;

-        ret = 0;
+        return VM_FAULT_LOCKED;
 unlock:
         rcu_read_unlock();

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 56939dc3bf33..0a026e5ff4f5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -606,39 +606,28 @@ long perf_output_copy_aux(struct perf_output_handle *aux_handle,

 #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)

-static struct page *rb_alloc_aux_page(int node, int order)
+static struct folio *rb_alloc_aux_folio(int node, int order)
 {
-        struct page *page;
+        struct folio *folio;

         if (order > MAX_ORDER)
                 order = MAX_ORDER;

         do {
-                page = alloc_pages_node(node, PERF_AUX_GFP, order);
-        } while (!page && order--);
-
-        if (page && order) {
-                /*
-                 * Communicate the allocation size to the driver:
-                 * if we managed to secure a high-order allocation,
-                 * set its first page's private to this order;
-                 * !PagePrivate(page) means it's just a normal page.
-                 */
-                split_page(page, order);
-                SetPagePrivate(page);
-                set_page_private(page, order);
-        }
+                folio = __folio_alloc_node(PERF_AUX_GFP, order, node);
+        } while (!folio && order--);

-        return page;
+        if (order)
+                folio_ref_add(folio, (1 << order) - 1);
+        return folio;
 }

 static void rb_free_aux_page(struct perf_buffer *rb, int idx)
 {
-        struct page *page = virt_to_page(rb->aux_pages[idx]);
+        struct folio *folio = virt_to_folio(rb->aux_pages[idx]);

-        ClearPagePrivate(page);
-        page->mapping = NULL;
-        __free_page(page);
+        folio->mapping = NULL;
+        folio_put(folio);
 }

 static void __rb_free_aux(struct perf_buffer *rb)
@@ -672,7 +661,7 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
                  pgoff_t pgoff, int nr_pages, long watermark, int flags)
 {
         bool overwrite = !(flags & RING_BUFFER_WRITABLE);
-        int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
+        int node = (event->cpu == -1) ? numa_mem_id() : cpu_to_node(event->cpu);
         int ret = -ENOMEM, max_order;

         if (!has_aux(event))
@@ -707,17 +696,21 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,

         rb->free_aux = event->pmu->free_aux;
         for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
-                struct page *page;
-                int last, order;
+                struct folio *folio;
+                unsigned int i, nr, order;
+                void *addr;

                 order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
-                page = rb_alloc_aux_page(node, order);
-                if (!page)
+                folio = rb_alloc_aux_folio(node, order);
+                if (!folio)
                         goto out;
+                addr = folio_address(folio);
+                nr = folio_nr_pages(folio);

-                for (last = rb->aux_nr_pages + (1 << page_private(page));
-                     last > rb->aux_nr_pages; rb->aux_nr_pages++)
-                        rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
+                for (i = 0; i < nr; i++) {
+                        rb->aux_pages[rb->aux_nr_pages++] = addr;
+                        addr += PAGE_SIZE;
+                }
         }

         /*
Instead of allocating a non-compound page and splitting it, allocate
a folio and make its refcount the count of the number of pages in it.
That way, when we free each page in the folio, we'll only actually free
it when the last page in the folio is freed. Keeping the memory intact
is better for the MM system than allocating it and splitting it.

Now, instead of setting each page->mapping, we only set folio->mapping
which is better for our cacheline usage, as well as helping towards the
goal of eliminating page->mapping. We remove the setting of page->index;
I do not believe this is needed. And we return with the folio locked,
which the fault handler should have been doing all along.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 kernel/events/core.c        | 13 +++++++---
 kernel/events/ring_buffer.c | 51 ++++++++++++++++---------------------
 2 files changed, 31 insertions(+), 33 deletions(-)
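As an assumed, simplified illustration of the refcount scheme the changelog describes (node here is a hypothetical NUMA node, not a value from the patch): an order-2 folio covers four pages, so the refcount is raised from 1 to 4 and the fourth folio_put() is the one that actually frees the memory.

/* Assumed example, not taken from the patch. */
struct folio *folio = __folio_alloc_node(GFP_KERNEL, 2, node);
unsigned int i;

folio_ref_add(folio, (1 << 2) - 1);     /* refcount: 1 -> 4 */
for (i = 0; i < 4; i++)
        folio_put(folio);               /* the last put frees all four pages */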